DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
Go to the documentation of this file.
1 //===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 
11 #include <detail/context_impl.hpp>
12 #include <detail/event_impl.hpp>
15 #include <detail/kernel_impl.hpp>
16 #include <detail/kernel_info.hpp>
18 #include <detail/program_impl.hpp>
20 #include <detail/queue_impl.hpp>
21 #include <detail/sampler_impl.hpp>
24 #include <detail/stream_impl.hpp>
25 #include <detail/xpti_registry.hpp>
26 #include <sycl/access/access.hpp>
27 #include <sycl/backend_types.hpp>
28 #include <sycl/detail/cg_types.hpp>
30 #include <sycl/sampler.hpp>
31 
32 #include <cassert>
33 #include <optional>
34 #include <string>
35 #include <vector>
36 
37 #ifdef __has_include
38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
40 #include <cstdlib>
41 #include <cxxabi.h>
42 #include <memory>
43 #endif
44 #endif
45 
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
48 #include <detail/xpti_registry.hpp>
49 #endif
50 
51 namespace sycl {
52 inline namespace _V1 {
53 namespace detail {
54 
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
56 // Global graph for the application
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
58 
59 bool CurrentCodeLocationValid() {
60  detail::tls_code_loc_t Tls;
61  auto CodeLoc = Tls.query();
62  auto FileName = CodeLoc.fileName();
63  auto FunctionName = CodeLoc.functionName();
64  return (FileName && FileName[0] != '\0') ||
65  (FunctionName && FunctionName[0] != '\0');
66 }
67 #endif
68 
69 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
// RAII owner of a malloc'ed C string returned by abi::__cxa_demangle; frees
// the buffer on scope exit.
// Fix: the original deleted only copy-assignment, leaving the implicit copy
// constructor available — copying a DemangleHandle would make two owners of
// the same buffer and double-free in the destructor. Delete the copy
// constructor as well.
struct DemangleHandle {
  char *p; // malloc'ed buffer (may be null); released by ~DemangleHandle
  DemangleHandle(char *ptr) : p(ptr) {}

  DemangleHandle(const DemangleHandle &) = delete;
  DemangleHandle &operator=(const DemangleHandle &) = delete;

  ~DemangleHandle() { std::free(p); }
};
78 static std::string demangleKernelName(std::string Name) {
79  int Status = -1; // some arbitrary value to eliminate the compiler warning
80  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
81  return (Status == 0) ? result.p : Name;
82 }
83 #else
static std::string demangleKernelName(std::string Name) {
  // No demangler is available on this toolchain: hand the name back as-is.
  return Name;
}
85 #endif
86 
87 static std::string deviceToString(device Device) {
88  if (getSyclObjImpl(Device)->is_host())
89  return "HOST";
90  else if (Device.is_cpu())
91  return "CPU";
92  else if (Device.is_gpu())
93  return "GPU";
94  else if (Device.is_accelerator())
95  return "ACCELERATOR";
96  else
97  return "UNKNOWN";
98 }
99 
101  const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
102  std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
103  if (!EliminatedArgMask) {
104  for (ArgDesc &Arg : Args) {
105  Func(Arg, Arg.MIndex);
106  }
107  } else {
108  // TODO this is not necessary as long as we can guarantee that the
109  // arguments are already sorted (e. g. handle the sorting in handler
110  // if necessary due to set_arg(...) usage).
111  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
112  return A.MIndex < B.MIndex;
113  });
114  int LastIndex = -1;
115  size_t NextTrueIndex = 0;
116 
117  for (ArgDesc &Arg : Args) {
118  // Handle potential gaps in set arguments (e. g. if some of them are
119  // set on the user side).
120  for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
121  if (!(*EliminatedArgMask)[Idx])
122  ++NextTrueIndex;
123  LastIndex = Arg.MIndex;
124 
125  if ((*EliminatedArgMask)[Arg.MIndex])
126  continue;
127 
128  Func(Arg, NextTrueIndex);
129  ++NextTrueIndex;
130  }
131  }
132 }
133 
134 #ifdef XPTI_ENABLE_INSTRUMENTATION
135 static size_t deviceToID(const device &Device) {
136  if (getSyclObjImpl(Device)->is_host())
137  return 0;
138  else
139  return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
140 }
141 #endif
142 
143 static std::string accessModeToString(access::mode Mode) {
144  switch (Mode) {
145  case access::mode::read:
146  return "read";
147  case access::mode::write:
148  return "write";
150  return "read_write";
152  return "discard_write";
154  return "discard_read_write";
155  default:
156  return "unknown";
157  }
158 }
159 
160 #ifdef XPTI_ENABLE_INSTRUMENTATION
161 // Using the command group type to create node types for the asynchronous task
162 // graph modeling
163 static std::string commandToNodeType(Command::CommandType Type) {
164  switch (Type) {
165  case Command::CommandType::RUN_CG:
166  return "command_group_node";
167  case Command::CommandType::COPY_MEMORY:
168  return "memory_transfer_node";
169  case Command::CommandType::ALLOCA:
170  return "memory_allocation_node";
171  case Command::CommandType::ALLOCA_SUB_BUF:
172  return "sub_buffer_creation_node";
173  case Command::CommandType::RELEASE:
174  return "memory_deallocation_node";
175  case Command::CommandType::MAP_MEM_OBJ:
176  return "memory_transfer_node";
177  case Command::CommandType::UNMAP_MEM_OBJ:
178  return "memory_transfer_node";
179  case Command::CommandType::UPDATE_REQUIREMENT:
180  return "host_acc_create_buffer_lock_node";
181  case Command::CommandType::EMPTY_TASK:
182  return "host_acc_destroy_buffer_release_node";
183  case Command::CommandType::FUSION:
184  return "kernel_fusion_placeholder_node";
185  default:
186  return "unknown_node";
187  }
188 }
189 
190 // Using the names being generated and the string are subject to change to
191 // something more meaningful to end-users as this will be visible in analysis
192 // tools that subscribe to this data
193 static std::string commandToName(Command::CommandType Type) {
194  switch (Type) {
195  case Command::CommandType::RUN_CG:
196  return "Command Group Action";
197  case Command::CommandType::COPY_MEMORY:
198  return "Memory Transfer (Copy)";
199  case Command::CommandType::ALLOCA:
200  return "Memory Allocation";
201  case Command::CommandType::ALLOCA_SUB_BUF:
202  return "Sub Buffer Creation";
203  case Command::CommandType::RELEASE:
204  return "Memory Deallocation";
205  case Command::CommandType::MAP_MEM_OBJ:
206  return "Memory Transfer (Map)";
207  case Command::CommandType::UNMAP_MEM_OBJ:
208  return "Memory Transfer (Unmap)";
209  case Command::CommandType::UPDATE_REQUIREMENT:
210  return "Host Accessor Creation/Buffer Lock";
211  case Command::CommandType::EMPTY_TASK:
212  return "Host Accessor Destruction/Buffer Lock Release";
213  case Command::CommandType::FUSION:
214  return "Kernel Fusion Placeholder";
215  default:
216  return "Unknown Action";
217  }
218 }
219 #endif
220 
221 std::vector<sycl::detail::pi::PiEvent>
222 Command::getPiEvents(const std::vector<EventImplPtr> &EventImpls) const {
223  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
224  for (auto &EventImpl : EventImpls) {
225  if (EventImpl->getHandleRef() == nullptr)
226  continue;
227 
228  // Do not add redundant event dependencies for in-order queues.
229  // At this stage dependency is definitely pi task and need to check if
230  // current one is a host task. In this case we should not skip pi event due
231  // to different sync mechanisms for different task types on in-order queue.
232  const QueueImplPtr &WorkerQueue = getWorkerQueue();
233  // MWorkerQueue in command is always not null. So check if
234  // EventImpl->getWorkerQueue != nullptr is implicit.
235  if (EventImpl->getWorkerQueue() == WorkerQueue &&
236  WorkerQueue->isInOrder() && !isHostTask())
237  continue;
238 
239  RetPiEvents.push_back(EventImpl->getHandleRef());
240  }
241 
242  return RetPiEvents;
243 }
244 
245 // This function is implemented (duplicating getPiEvents a lot) as short term
246 // solution for the issue that barrier with wait list could not
247 // handle empty pi event handles when kernel is enqueued on host task
248 // completion.
std::vector<sycl::detail::pi::PiEvent> Command::getPiEventsBlocking(
 const std::vector<EventImplPtr> &EventImpls) const {
 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
 for (auto &EventImpl : EventImpls) {
 // Throwaway events created with empty constructor will not have a context
 // (which is set lazily) calling getContextImpl() would set that
 // context, which we wish to avoid as it is expensive.
 // Skip host task also.
 if (!EventImpl->isContextInitialized() || EventImpl->is_host())
 continue;
 // In this path nullptr native event means that the command has not been
 // enqueued. It may happen if async enqueue in a host task is involved.
 // Unlike getPiEvents(), we cannot simply skip such a dependency here:
 // instead, force a blocking enqueue of the producing command so that a
 // native event exists (only for commands that actually produce PI events).
 if (EventImpl->getHandleRef() == nullptr) {
 if (!EventImpl->getCommand() ||
 !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
 continue;
 std::vector<Command *> AuxCmds;
 Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
 BLOCKING);
 }
 // Do not add redundant event dependencies for in-order queues.
 // At this stage dependency is definitely pi task and need to check if
 // current one is a host task. In this case we should not skip pi event due
 // to different sync mechanisms for different task types on in-order queue.
 const QueueImplPtr &WorkerQueue = getWorkerQueue();
 // MWorkerQueue in command is always not null. So check if
 // EventImpl->getWorkerQueue != nullptr is implicit.
 if (EventImpl->getWorkerQueue() == WorkerQueue &&
 WorkerQueue->isInOrder() && !isHostTask())
 continue;

 RetPiEvents.push_back(EventImpl->getHandleRef());
 }

 return RetPiEvents;
}
285 
286 bool Command::isHostTask() const {
287  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
288  ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
289  CG::CGTYPE::CodeplayHostTask);
290 }
291 
292 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
293  const QueueImplPtr &Queue) {
294  for (auto &EventImpl : EventImpls) {
295  EventImpl->flushIfNeeded(Queue);
296  }
297 }
298 
300  ExecCGCommand *MThisCmd;
301  std::vector<interop_handle::ReqToMem> MReqToMem;
302 
 // Blocks until every dependency of the wrapped host-task command is
 // complete: device (PI) events are grouped per plugin and waited on with
 // piEventsWait; host events are waited on directly. Returns PI_SUCCESS, or
 // the error code of a failed wait (after reporting it as an async
 // exception on the host task's queue).
 pi_result waitForEvents() const {
 // piEventsWait accepts events of one plugin at a time, so bucket the
 // dependency events by their owning plugin first.
 std::map<const PluginPtr, std::vector<EventImplPtr>>
 RequiredEventsPerPlugin;

 for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
 const PluginPtr &Plugin = Event->getPlugin();
 RequiredEventsPerPlugin[Plugin].push_back(Event);
 }

 // wait for dependency device events
 // FIXME Current implementation of waiting for events will make the thread
 // 'sleep' until all of dependency events are complete. We need a bit more
 // sophisticated waiting mechanism to allow to utilize this thread for any
 // other available job and resume once all required events are ready.
 for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
 std::vector<sycl::detail::pi::PiEvent> RawEvents =
 MThisCmd->getPiEvents(PluginWithEvents.second);
 try {
 PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
 RawEvents.data());
 } catch (const sycl::exception &E) {
 // Report the failure asynchronously on the host task's queue and hand
 // the error code back to the caller (operator()).
 CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
 HostTask.MQueue->reportAsyncException(std::current_exception());
 return (pi_result)E.get_cl_code();
 } catch (...) {
 CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
 HostTask.MQueue->reportAsyncException(std::current_exception());
 return PI_ERROR_UNKNOWN;
 }
 }

 // Wait for dependency host events.
 // Host events can't throw exceptions so don't try to catch it.
 for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
 Event->waitInternal();
 }

 return PI_SUCCESS;
 }
342 
343 public:
345  std::vector<interop_handle::ReqToMem> ReqToMem)
346  : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}
347 
348  void operator()() const {
349  assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);
350 
351  CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
352 
353 #ifdef XPTI_ENABLE_INSTRUMENTATION
354  // Host task is executed async and in a separate thread that do not allow to
355  // use code location data stored in TLS. So we keep submission code location
356  // as Command field and put it here to TLS so that thrown exception could
357  // query and report it.
358  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
359  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
360  AsyncCodeLocationPtr.reset(
362  }
363 #endif
364 
365  pi_result WaitResult = waitForEvents();
366  if (WaitResult != PI_SUCCESS) {
367  std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
368  std::string("Couldn't wait for host-task's dependencies"),
369  WaitResult));
370  HostTask.MQueue->reportAsyncException(EPtr);
371  // reset host-task's lambda and quit
372  HostTask.MHostTask.reset();
374  return;
375  }
376 
377  try {
378  // we're ready to call the user-defined lambda now
379  if (HostTask.MHostTask->isInteropTask()) {
380  interop_handle IH{MReqToMem, HostTask.MQueue,
381  HostTask.MQueue->getDeviceImplPtr(),
382  HostTask.MQueue->getContextImplPtr()};
383 
384  HostTask.MHostTask->call(IH);
385  } else
386  HostTask.MHostTask->call();
387  } catch (...) {
388  auto CurrentException = std::current_exception();
389 #ifdef XPTI_ENABLE_INSTRUMENTATION
390  // sycl::exception emit tracing of message with code location if
391  // available. For other types of exception we need to explicitly trigger
392  // tracing by calling TraceEventXPTI.
393  if (xptiTraceEnabled()) {
394  try {
395  rethrow_exception(CurrentException);
396  } catch (const sycl::exception &) {
397  // it is already traced, nothing to care about
398  } catch (const std::exception &StdException) {
399  GlobalHandler::instance().TraceEventXPTI(StdException.what());
400  } catch (...) {
402  "Host task lambda thrown non standard exception");
403  }
404  }
405 #endif
406  HostTask.MQueue->reportAsyncException(CurrentException);
407  }
408 
409  HostTask.MHostTask.reset();
410 
411 #ifdef XPTI_ENABLE_INSTRUMENTATION
412  // Host Task is done, clear its submittion location to not interfere with
413  // following dependent kernels submission.
414  AsyncCodeLocationPtr.reset();
415 #endif
416 
417  try {
418  // If we enqueue blocked users - pi level could throw exception that
419  // should be treated as async now.
421  } catch (...) {
422  auto CurrentException = std::current_exception();
423  HostTask.MQueue->reportAsyncException(CurrentException);
424  }
425  }
426 };
427 
429  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
430  HostEvent->waitInternal();
431 }
432 
434  std::vector<EventImplPtr> &EventImpls,
435  sycl::detail::pi::PiEvent &Event) {
436  if (!EventImpls.empty()) {
437  if (Queue->is_host()) {
438  // Host queue can wait for events from different contexts, i.e. it may
439  // contain events with different contexts in its MPreparedDepsEvents.
440  // OpenCL 2.1 spec says that clWaitForEvents will return
441  // CL_INVALID_CONTEXT if events specified in the list do not belong to
442  // the same context. Thus we split all the events into per-context map.
443  // An example. We have two queues for the same CPU device: Q1, Q2. Thus
444  // we will have two different contexts for the same CPU device: C1, C2.
445  // Also we have default host queue. This queue is accessible via
446  // Scheduler. Now, let's assume we have three different events: E1(C1),
447  // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
448  // three events (E1, E2, E3). Now, if piEventsWait is called for all
449  // three events we'll experience failure with CL_INVALID_CONTEXT 'cause
450  // these events refer to different contexts.
451  std::map<context_impl *, std::vector<EventImplPtr>>
452  RequiredEventsPerContext;
453 
454  for (const EventImplPtr &Event : EventImpls) {
455  ContextImplPtr Context = Event->getContextImpl();
456  assert(Context.get() &&
457  "Only non-host events are expected to be waited for here");
458  RequiredEventsPerContext[Context.get()].push_back(Event);
459  }
460 
461  for (auto &CtxWithEvents : RequiredEventsPerContext) {
462  std::vector<sycl::detail::pi::PiEvent> RawEvents =
463  getPiEvents(CtxWithEvents.second);
464  CtxWithEvents.first->getPlugin()->call<PiApiKind::piEventsWait>(
465  RawEvents.size(), RawEvents.data());
466  }
467  } else {
468 #ifndef NDEBUG
469  for (const EventImplPtr &Event : EventImpls)
470  assert(Event->getContextImpl().get() &&
471  "Only non-host events are expected to be waited for here");
472 #endif
473 
474  std::vector<sycl::detail::pi::PiEvent> RawEvents =
475  getPiEvents(EventImpls);
476  flushCrossQueueDeps(EventImpls, getWorkerQueue());
477  const PluginPtr &Plugin = Queue->getPlugin();
478 
479  if (MEvent != nullptr)
480  MEvent->setHostEnqueueTime();
481  Plugin->call<PiApiKind::piEnqueueEventsWait>(
482  Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
483  }
484  }
485 }
486 
491  CommandType Type, QueueImplPtr Queue,
493  const std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints)
494  : MQueue(std::move(Queue)),
495  MEvent(std::make_shared<detail::event_impl>(MQueue)),
496  MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
497  MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
498  MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
500  MEvent->setWorkerQueue(MWorkerQueue);
501  MEvent->setSubmittedQueue(MWorkerQueue);
502  MEvent->setCommand(this);
503  MEvent->setContextImpl(MQueue->getContextImplPtr());
504  MEvent->setStateIncomplete();
506 
507 #ifdef XPTI_ENABLE_INSTRUMENTATION
508  if (!xptiTraceEnabled())
509  return;
510  // Obtain the stream ID so all commands can emit traces to that stream
511  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
512 #endif
513 }
514 
516 #ifdef XPTI_ENABLE_INSTRUMENTATION
518 #endif
519 }
520 
532  Command *Cmd, void *ObjAddr, bool IsCommand,
533  std::optional<access::mode> AccMode) {
534 #ifdef XPTI_ENABLE_INSTRUMENTATION
535  // Bail early if either the source or the target node for the given
536  // dependency is undefined or NULL
537  constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
538  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
539  MTraceEvent && Cmd && Cmd->MTraceEvent))
540  return;
541 
542  // If all the information we need for creating an edge event is available,
543  // then go ahead with creating it; if not, bail early!
544  xpti::utils::StringHelper SH;
545  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
546  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
547  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
548  // Create an edge with the dependent buffer address for which a command
549  // object has been created as one of the properties of the edge
550  xpti::payload_t Payload(TypeString.c_str(), MAddress);
551  uint64_t EdgeInstanceNo;
552  xpti_td *EdgeEvent =
553  xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
554  xpti_at::active, &EdgeInstanceNo);
555  if (EdgeEvent) {
556  xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
557  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
558  EdgeEvent->source_id = SrcEvent->unique_id;
559  EdgeEvent->target_id = TgtEvent->unique_id;
560  if (IsCommand) {
561  xpti::addMetadata(EdgeEvent, "access_mode",
562  static_cast<int>(AccMode.value()));
563  xpti::addMetadata(EdgeEvent, "memory_object",
564  reinterpret_cast<size_t>(ObjAddr));
565  } else {
566  xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
567  }
568  xptiNotifySubscribers(MStreamID, NotificationTraceType,
569  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
570  nullptr);
571  }
572  // General comment - None of these are serious errors as the instrumentation
573  // layer MUST be tolerant of errors. If we need to let the end user know, we
574  // throw exceptions in the future
575 #endif
576 }
577 
583  Command *Cmd, sycl::detail::pi::PiEvent &PiEventAddr) {
584 #ifdef XPTI_ENABLE_INSTRUMENTATION
585  // If we have failed to create an event to represent the Command, then we
586  // cannot emit an edge event. Bail early!
587  if (!(xptiCheckTraceEnabled(MStreamID) && MTraceEvent))
588  return;
589 
590  if (Cmd && Cmd->MTraceEvent) {
591  // If the event is associated with a command, we use this command's trace
592  // event as the source of edge, hence modeling the control flow
593  emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, false);
594  return;
595  }
596  if (PiEventAddr) {
597  xpti::utils::StringHelper SH;
598  std::string AddressStr =
599  SH.addressAsString<sycl::detail::pi::PiEvent>(PiEventAddr);
600  // This is the case when it is a OCL event enqueued by the user or another
601  // event is registered by the runtime as a dependency The dependency on
602  // this occasion is an OCL event; so we build a virtual node in the graph
603  // with the event as the metadata for the node
604  std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
605  // Node name is "virtual_node[<event_addr>]"
606  xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
607  uint64_t VNodeInstanceNo;
608  xpti_td *NodeEvent =
609  xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
610  xpti_at::active, &VNodeInstanceNo);
611  // Emit the virtual node first
612  xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
613  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
614  detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
615  nullptr);
616  // Create a new event for the edge
617  std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
618  xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
619  uint64_t EdgeInstanceNo;
620  xpti_td *EdgeEvent =
621  xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
622  xpti_at::active, &EdgeInstanceNo);
623  if (EdgeEvent && NodeEvent) {
624  // Source node represents the event and this event needs to be completed
625  // before target node can execute
626  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
627  EdgeEvent->source_id = NodeEvent->unique_id;
628  EdgeEvent->target_id = TgtEvent->unique_id;
629  xpti::addMetadata(EdgeEvent, "event",
630  reinterpret_cast<size_t>(PiEventAddr));
631  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
632  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
633  nullptr);
634  }
635  return;
636  }
637 #endif
638 }
639 
640 uint64_t Command::makeTraceEventProlog(void *MAddress) {
641  uint64_t CommandInstanceNo = 0;
642 #ifdef XPTI_ENABLE_INSTRUMENTATION
643  if (!xptiCheckTraceEnabled(MStreamID, xpti::trace_node_create))
644  return CommandInstanceNo;
645 
647  // Setup the member variables with information needed for event notification
648  MCommandNodeType = commandToNodeType(MType);
649  MCommandName = commandToName(MType);
650  xpti::utils::StringHelper SH;
651  MAddressString = SH.addressAsString<void *>(MAddress);
652  std::string CommandString =
653  SH.nameWithAddressString(MCommandName, MAddressString);
654 
655  xpti::payload_t p(CommandString.c_str(), MAddress);
656  xpti_td *CmdTraceEvent =
657  xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
658  xpti_at::active, &CommandInstanceNo);
659  MInstanceID = CommandInstanceNo;
660  if (CmdTraceEvent) {
661  MTraceEvent = (void *)CmdTraceEvent;
662  // If we are seeing this event again, then the instance ID will be greater
663  // than 1; in this case, we must skip sending a notification to create a
664  // node as this node has already been created. We return this value so the
665  // epilog method can be called selectively.
666  MFirstInstance = (CommandInstanceNo == 1);
667  }
668 #endif
669  return CommandInstanceNo;
670 }
671 
673 #ifdef XPTI_ENABLE_INSTRUMENTATION
674  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
675  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) && MTraceEvent))
676  return;
678  xptiNotifySubscribers(MStreamID, NotificationTraceType,
679  detail::GSYCLGraphEvent,
680  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
681  static_cast<const void *>(MCommandNodeType.c_str()));
682 #endif
683 }
684 
686  std::vector<Command *> &ToCleanUp) {
687  const QueueImplPtr &WorkerQueue = getWorkerQueue();
688  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
689 
690  // 1. Async work is not supported for host device.
691  // 2. Non-host events can be ignored if they are not fully initialized.
692  // 3. Some types of commands do not produce PI events after they are
693  // enqueued
694  // (e.g. alloca). Note that we can't check the pi event to make that
695  // distinction since the command might still be unenqueued at this point.
696  bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
697  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
698  PiEventExpected &= DepCmd->producesPiEvent();
699 
700  if (!PiEventExpected) {
701  // call to waitInternal() is in waitForPreparedHostEvents() as it's called
702  // from enqueue process functions
703  MPreparedHostDepsEvents.push_back(DepEvent);
704  return nullptr;
705  }
706 
707  Command *ConnectionCmd = nullptr;
708 
709  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
710  // If contexts don't match we'll connect them using host task
711  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
713  ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
714  } else
715  MPreparedDepsEvents.push_back(std::move(DepEvent));
716 
717  return ConnectionCmd;
718 }
719 
721  return MQueue->getContextImplPtr();
722 }
723 
725  assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
726  return MWorkerQueue;
727 }
728 
// Default: a command produces a native PI event once enqueued; overridden by
// command types that do not (e.g. alloca — see the comment in
// processDepEvent above).
bool Command::producesPiEvent() const { return true; }
730 
// Default: a command may be cleaned up right after successful enqueue;
// subclasses that must outlive enqueue override this (overrides not visible
// in this file chunk).
bool Command::supportsPostEnqueueCleanup() const { return true; }
732 
734  return MLeafCounter == 0 &&
736 }
737 
738 Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
739  Command *ConnectionCmd = nullptr;
740 
741  if (NewDep.MDepCommand) {
742  ConnectionCmd =
743  processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
744  }
745  // ConnectionCmd insertion builds the following dependency structure:
746  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep
747  // that means that this and NewDep are already dependent
748  if (!ConnectionCmd) {
749  MDeps.push_back(NewDep);
750  if (NewDep.MDepCommand)
751  NewDep.MDepCommand->addUser(this);
752  }
753 
754 #ifdef XPTI_ENABLE_INSTRUMENTATION
756  (void *)NewDep.MDepRequirement->MSYCLMemObj,
757  true, NewDep.MDepRequirement->MAccessMode);
758 #endif
759 
760  return ConnectionCmd;
761 }
762 
764  std::vector<Command *> &ToCleanUp) {
765 #ifdef XPTI_ENABLE_INSTRUMENTATION
766  // We need this for just the instrumentation, so guarding it will prevent
767  // unused variable warnings when instrumentation is turned off
768  Command *Cmd = (Command *)Event->getCommand();
769  sycl::detail::pi::PiEvent &PiEventAddr = Event->getHandleRef();
770  // Now make an edge for the dependent event
771  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
772 #endif
773 
774  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
775  ToCleanUp);
776 }
777 
779 #ifdef XPTI_ENABLE_INSTRUMENTATION
780  constexpr uint16_t NotificationTraceType = xpti::trace_signal;
781  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
782  MTraceEvent && PiEventAddr))
783  return;
784  // Asynchronous call, so send a signal with the event information as
785  // user_data
786  xptiNotifySubscribers(
787  MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
788  static_cast<xpti_td *>(MTraceEvent), MInstanceID, (void *)PiEventAddr);
789 #endif
790 }
791 
792 void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
793 #ifdef XPTI_ENABLE_INSTRUMENTATION
794  if (!(xptiCheckTraceEnabled(MStreamID, Type) && MTraceEvent))
795  return;
796  // Trace event notifier that emits a Type event
797  xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
798  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
799  static_cast<const void *>(Txt));
800 #endif
801 }
802 
803 bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
804  std::vector<Command *> &ToCleanUp) {
805 #ifdef XPTI_ENABLE_INSTRUMENTATION
806  // If command is enqueued from host task thread - it will not have valid
807  // submission code location set. So we set it manually to properly trace
808  // failures if pi level report any.
809  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
810  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
811  AsyncCodeLocationPtr.reset(
813  }
814 #endif
815  // Exit if already enqueued
817  return true;
818 
819  // If the command is blocked from enqueueing
821  // Exit if enqueue type is not blocking
822  if (!Blocking) {
823  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
824  return false;
825  }
826 
827 #ifdef XPTI_ENABLE_INSTRUMENTATION
828  // Scoped trace event notifier that emits a barrier begin and barrier end
829  // event, which models the barrier while enqueuing along with the blocked
830  // reason, as determined by the scheduler
831  std::string Info = "enqueue.barrier[";
832  Info += std::string(getBlockReason()) + "]";
833  emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
834 #endif
835 
836  // Wait if blocking
838  ;
839 #ifdef XPTI_ENABLE_INSTRUMENTATION
840  emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
841 #endif
842  }
843 
844  std::lock_guard<std::mutex> Lock(MEnqueueMtx);
845 
846  // Exit if the command is already enqueued
848  return true;
849 
850 #ifdef XPTI_ENABLE_INSTRUMENTATION
851  emitInstrumentation(xpti::trace_task_begin, nullptr);
852 #endif
853 
855  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
856  return false;
857  }
858 
859  // Command status set to "failed" beforehand, so this command
860  // has already been marked as "failed" if enqueueImp throws an exception.
861  // This will avoid execution of the same failed command twice.
864  pi_int32 Res = enqueueImp();
865 
866  if (PI_SUCCESS != Res)
867  EnqueueResult =
869  else {
871  (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
872  MEvent->setComplete();
873 
874  // Consider the command is successfully enqueued if return code is
875  // PI_SUCCESS
880  assert(!MMarkedForCleanup);
881  MMarkedForCleanup = true;
882  ToCleanUp.push_back(this);
883  }
884  }
885 
886  // Emit this correlation signal before the task end
887  emitEnqueuedEventSignal(MEvent->getHandleRef());
888 #ifdef XPTI_ENABLE_INSTRUMENTATION
889  emitInstrumentation(xpti::trace_task_end, nullptr);
890 #endif
892 }
893 
// Builds XPTI edge events from each command in \p DepList that shares this
// RELEASE command's memory-object address (MAddress) to this command, so
// analysis tools see the ordering constraint on deallocation. No-op when no
// trace event was created for this command.
void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
 assert(MType == CommandType::RELEASE && "Expected release command");
 if (!MTraceEvent)
 return;
 // The current command is the target node for all dependencies as the source
 // nodes have to be completed first before the current node can begin to
 // execute; these edges model control flow
 xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
 // We have all the Commands that must be completed before the release
 // command can be enqueued; here we'll find the command that is an Alloca
 // with the same SYCLMemObject address and create a dependency line (edge)
 // between them in our sematic modeling
 for (auto &Item : DepList) {
 if (Item->MTraceEvent && Item->MAddress == MAddress) {
 xpti::utils::StringHelper SH;
 std::string AddressStr = SH.addressAsString<void *>(MAddress);
 std::string TypeString =
 "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);

 // Create an edge with the dependent buffer address being one of the
 // properties of the edge
 xpti::payload_t p(TypeString.c_str(), MAddress);
 uint64_t EdgeInstanceNo;
 xpti_td *EdgeEvent =
 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
 xpti_at::active, &EdgeInstanceNo);
 if (EdgeEvent) {
 xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
 EdgeEvent->target_id = TgtTraceEvent->unique_id;
 EdgeEvent->source_id = SrcTraceEvent->unique_id;
 xpti::addMetadata(EdgeEvent, "memory_object",
 reinterpret_cast<size_t>(MAddress));
 xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
 detail::GSYCLGraphEvent, EdgeEvent,
 EdgeInstanceNo, nullptr);
 }
 }
 }
#endif
}
935 
936 const char *Command::getBlockReason() const {
937  switch (MBlockReason) {
939  return "A Buffer is locked by the host accessor";
941  return "Blocked by host task";
942  }
943 
944  return "Unknown block reason";
945 }
946 
948 #ifdef XPTI_ENABLE_INSTRUMENTATION
949  if (!xptiTraceEnabled())
950  return;
951 
953  auto TData = Tls.query();
954  if (TData.fileName())
955  MSubmissionFileName = TData.fileName();
956  if (TData.functionName())
957  MSubmissionFunctionName = TData.functionName();
958  if (MSubmissionFileName.size() || MSubmissionFunctionName.size())
961  (int)TData.lineNumber(), (int)TData.columnNumber()};
962 #endif
963 }
964 
966  Requirement Req,
967  AllocaCommandBase *LinkedAllocaCmd,
968  bool IsConst)
969  : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
970  MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
971  MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
974 }
975 
977 #ifdef XPTI_ENABLE_INSTRUMENTATION
978  if (!xptiCheckTraceEnabled(MStreamID))
979  return;
980  // Create a payload with the command name and an event using this payload to
981  // emit a node_create
984  // Set the relevant meta data properties for this command
985  if (MTraceEvent && MFirstInstance) {
986  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
987  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
988  xpti::addMetadata(TE, "sycl_device_type",
989  deviceToString(MQueue->get_device()));
990  xpti::addMetadata(TE, "sycl_device_name",
991  getSyclObjImpl(MQueue->get_device())->getDeviceName());
992  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
993  }
994 #endif
995 }
996 
997 bool AllocaCommandBase::producesPiEvent() const { return false; }
998 
999 bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }
1000 
1001 bool AllocaCommandBase::readyForCleanup() const { return false; }
1002 
1004  bool InitFromUserData,
1005  AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
1006  : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
1007  LinkedAllocaCmd, IsConst),
1008  MInitFromUserData(InitFromUserData) {
1009  // Node event must be created before the dependent edge is added to this
1010  // node, so this call must be before the addDep() call.
1012  // "Nothing to depend on"
1013  std::vector<Command *> ToCleanUp;
1014  Command *ConnectionCmd =
1015  addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
1016  assert(ConnectionCmd == nullptr);
1017  assert(ToCleanUp.empty());
1018  (void)ConnectionCmd;
1019 }
1020 
1022 #ifdef XPTI_ENABLE_INSTRUMENTATION
1023  if (!xptiCheckTraceEnabled(MStreamID))
1024  return;
1025 
1026  // Only if it is the first event, we emit a node create event
1027  if (MFirstInstance) {
1029  }
1030 #endif
1031 }
1032 
1033 pi_int32 AllocaCommand::enqueueImp() {
1035  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1036 
1037  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1038 
1039  void *HostPtr = nullptr;
1040  if (!MIsLeaderAlloca) {
1041 
1042  if (MQueue->is_host()) {
1043  // Do not need to make allocation if we have a linked device allocation
1044  Command::waitForEvents(MQueue, EventImpls, Event);
1045 
1046  return PI_SUCCESS;
1047  }
1048  HostPtr = MLinkedAllocaCmd->getMemAllocation();
1049  }
1050  // TODO: Check if it is correct to use std::move on stack variable and
1051  // delete it RawEvents below.
1053  MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr,
1054  std::move(EventImpls), Event);
1055 
1056  return PI_SUCCESS;
1057 }
1058 
1059 void AllocaCommand::printDot(std::ostream &Stream) const {
1060  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1061 
1062  Stream << "ID = " << this << "\\n";
1063  Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n";
1064  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1065  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
1066  Stream << "\"];" << std::endl;
1067 
1068  for (const auto &Dep : MDeps) {
1069  if (Dep.MDepCommand == nullptr)
1070  continue;
1071  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1072  << " [ label = \"Access mode: "
1073  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1074  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1075  << std::endl;
1076  }
1077 }
1078 
1080  AllocaCommandBase *ParentAlloca,
1081  std::vector<Command *> &ToEnqueue,
1082  std::vector<Command *> &ToCleanUp)
1083  : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
1084  std::move(Req),
1085  /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
1086  MParentAlloca(ParentAlloca) {
1087  // Node event must be created before the dependent edge
1088  // is added to this node, so this call must be before
1089  // the addDep() call.
1091  Command *ConnectionCmd = addDep(
1092  DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
1093  if (ConnectionCmd)
1094  ToEnqueue.push_back(ConnectionCmd);
1095 }
1096 
1098 #ifdef XPTI_ENABLE_INSTRUMENTATION
1099  if (!xptiCheckTraceEnabled(MStreamID))
1100  return;
1101 
1102  // Only if it is the first event, we emit a node create event and any meta
1103  // data that is available for the command
1104  if (MFirstInstance) {
1105  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1106  xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
1107  xpti::addMetadata(TE, "access_range_start",
1108  this->MRequirement.MAccessRange[0]);
1109  xpti::addMetadata(TE, "access_range_end",
1110  this->MRequirement.MAccessRange[1]);
1112  }
1113 #endif
1114 }
1115 
1117  // In some cases parent`s memory allocation might change (e.g., after
1118  // map/unmap operations). If parent`s memory allocation changes, sub-buffer
1119  // memory allocation should be changed as well.
1120  if (MQueue->is_host()) {
1121  return static_cast<void *>(
1122  static_cast<char *>(MParentAlloca->getMemAllocation()) +
1124  }
1125  return MMemAllocation;
1126 }
1127 
1128 pi_int32 AllocaSubBufCommand::enqueueImp() {
1130  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1131  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1132 
1134  MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(),
1136  MRequirement.MAccessRange, std::move(EventImpls), Event);
1137 
1139  MMemAllocation);
1140  return PI_SUCCESS;
1141 }
1142 
1143 void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
1144  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1145 
1146  Stream << "ID = " << this << "\\n";
1147  Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device())
1148  << "\\n";
1149  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1150  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
1151  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
1152  Stream << "\"];" << std::endl;
1153 
1154  for (const auto &Dep : MDeps) {
1155  if (Dep.MDepCommand == nullptr)
1156  continue;
1157  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1158  << " [ label = \"Access mode: "
1159  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1160  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1161  << std::endl;
1162  }
1163 }
1164 
1166  : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
1168 }
1169 
1171 #ifdef XPTI_ENABLE_INSTRUMENTATION
1172  if (!xptiCheckTraceEnabled(MStreamID))
1173  return;
1174  // Create a payload with the command name and an event using this payload to
1175  // emit a node_create
1176  MAddress = MAllocaCmd->getSYCLMemObj();
1178 
1179  if (MFirstInstance) {
1180  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1181  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1182  xpti::addMetadata(TE, "sycl_device_type",
1183  deviceToString(MQueue->get_device()));
1184  xpti::addMetadata(TE, "sycl_device_name",
1185  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1186  xpti::addMetadata(TE, "allocation_type",
1187  commandToName(MAllocaCmd->getType()));
1189  }
1190 #endif
1191 }
1192 
1193 pi_int32 ReleaseCommand::enqueueImp() {
1195  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1196  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1197  bool SkipRelease = false;
1198 
1199  // On host side we only allocate memory for full buffers.
1200  // Thus, deallocating sub buffers leads to double memory freeing.
1201  SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF;
1202 
1203  const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
1204  bool NeedUnmap = false;
1205  if (MAllocaCmd->MLinkedAllocaCmd) {
1206 
1207  // When releasing one of the "linked" allocations special rules take
1208  // place:
1209  // 1. Device allocation should always be released.
1210  // 2. Host allocation should be released if host allocation is "leader".
1211  // 3. Device alloca in the pair should be in active state in order to be
1212  // correctly released.
1213 
1214  // There is no actual memory allocation if a host alloca command is
1215  // created being linked to a device allocation.
1216  SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;
1217 
1218  NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
1219  }
1220 
1221  if (NeedUnmap) {
1222  const QueueImplPtr &Queue = CurAllocaIsHost
1223  ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
1224  : MAllocaCmd->getQueue();
1225 
1226  EventImplPtr UnmapEventImpl(new event_impl(Queue));
1227  UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
1228  UnmapEventImpl->setStateIncomplete();
1229  sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();
1230 
1231  void *Src = CurAllocaIsHost
1232  ? MAllocaCmd->getMemAllocation()
1233  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1234 
1235  void *Dst = !CurAllocaIsHost
1236  ? MAllocaCmd->getMemAllocation()
1237  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1238 
1239  MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
1240  RawEvents, UnmapEvent);
1241 
1242  std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
1243  EventImpls.clear();
1244  EventImpls.push_back(UnmapEventImpl);
1245  }
1246  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1247  if (SkipRelease)
1248  Command::waitForEvents(MQueue, EventImpls, Event);
1249  else {
1251  MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(),
1252  MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event);
1253  }
1254  return PI_SUCCESS;
1255 }
1256 
1257 void ReleaseCommand::printDot(std::ostream &Stream) const {
1258  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1259 
1260  Stream << "ID = " << this << " ; ";
1261  Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n";
1262  Stream << " Alloca : " << MAllocaCmd << "\\n";
1263  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1264  Stream << "\"];" << std::endl;
1265 
1266  for (const auto &Dep : MDeps) {
1267  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1268  << " [ label = \"Access mode: "
1269  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1270  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1271  << std::endl;
1272  }
1273 }
1274 
1275 bool ReleaseCommand::producesPiEvent() const { return false; }
1276 
1277 bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
1278 
1279 bool ReleaseCommand::readyForCleanup() const { return false; }
1280 
1282  void **DstPtr, QueueImplPtr Queue,
1283  access::mode MapMode)
1284  : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
1285  MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
1286  MMapMode(MapMode) {
1288 }
1289 
1291 #ifdef XPTI_ENABLE_INSTRUMENTATION
1292  if (!xptiCheckTraceEnabled(MStreamID))
1293  return;
1294  // Create a payload with the command name and an event using this payload to
1295  // emit a node_create
1296  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1298 
1299  if (MFirstInstance) {
1300  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1301  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1302  xpti::addMetadata(TE, "sycl_device_type",
1303  deviceToString(MQueue->get_device()));
1304  xpti::addMetadata(TE, "sycl_device_name",
1305  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1306  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1308  }
1309 #endif
1310 }
1311 
1312 pi_int32 MapMemObject::enqueueImp() {
1314  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1315  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1316  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1317 
1318  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1319  *MDstPtr = MemoryManager::map(
1320  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
1321  MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1322  MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);
1323 
1324  return PI_SUCCESS;
1325 }
1326 
1327 void MapMemObject::printDot(std::ostream &Stream) const {
1328  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1329 
1330  Stream << "ID = " << this << " ; ";
1331  Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n";
1332 
1333  Stream << "\"];" << std::endl;
1334 
1335  for (const auto &Dep : MDeps) {
1336  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1337  << " [ label = \"Access mode: "
1338  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1339  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1340  << std::endl;
1341  }
1342 }
1343 
1345  void **SrcPtr, QueueImplPtr Queue)
1346  : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
1347  MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
1349 }
1350 
1352 #ifdef XPTI_ENABLE_INSTRUMENTATION
1353  if (!xptiCheckTraceEnabled(MStreamID))
1354  return;
1355  // Create a payload with the command name and an event using this payload to
1356  // emit a node_create
1357  MAddress = MDstAllocaCmd->getSYCLMemObj();
1359 
1360  if (MFirstInstance) {
1361  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1362  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1363  xpti::addMetadata(TE, "sycl_device_type",
1364  deviceToString(MQueue->get_device()));
1365  xpti::addMetadata(TE, "sycl_device_name",
1366  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1367  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1369  }
1370 #endif
1371 }
1372 
1374  // TODO remove this workaround once the batching issue is addressed in Level
1375  // Zero plugin.
1376  // Consider the following scenario on Level Zero:
1377  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1378  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1379  // 3. queueA.wait().
1380  // 4. queueB.wait().
1381  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1382  // dependencies (i.e. they were waited for prior to enqueueing any command
1383  // that's dependent on them). This allowed Level Zero plugin to detect that
1384  // each queue is idle on steps 1/2 and submit the command list right away.
1385  // This is no longer the case since we started passing these dependencies in
1386  // an event waitlist and Level Zero plugin attempts to batch these commands,
1387  // so the execution of kernel B starts only on step 4. This workaround
1388  // restores the old behavior in this case until this is resolved.
1389  return MQueue->getDeviceImplPtr()->getBackend() !=
1391  MEvent->getHandleRef() != nullptr;
1392 }
1393 
1394 pi_int32 UnMapMemObject::enqueueImp() {
1396  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1397  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1398  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1399 
1400  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1401  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
1402  MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
1403  std::move(RawEvents), Event);
1404 
1405  return PI_SUCCESS;
1406 }
1407 
1408 void UnMapMemObject::printDot(std::ostream &Stream) const {
1409  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1410 
1411  Stream << "ID = " << this << " ; ";
1412  Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n";
1413 
1414  Stream << "\"];" << std::endl;
1415 
1416  for (const auto &Dep : MDeps) {
1417  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1418  << " [ label = \"Access mode: "
1419  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1420  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1421  << std::endl;
1422  }
1423 }
1424 
1426  AllocaCommandBase *SrcAllocaCmd,
1427  Requirement DstReq,
1428  AllocaCommandBase *DstAllocaCmd,
1429  QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
1430  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1431  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1432  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
1433  MDstAllocaCmd(DstAllocaCmd) {
1434  if (!MSrcQueue->is_host()) {
1435  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1436  }
1437 
1438  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
1439  MEvent->setWorkerQueue(MWorkerQueue);
1440 
1442 }
1443 
1445 #ifdef XPTI_ENABLE_INSTRUMENTATION
1446  if (!xptiCheckTraceEnabled(MStreamID))
1447  return;
1448  // Create a payload with the command name and an event using this payload to
1449  // emit a node_create
1450  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1452 
1453  if (MFirstInstance) {
1454  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1455  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1456  deviceToID(MQueue->get_device()));
1457  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1458  deviceToString(MQueue->get_device()));
1459  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1460  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1461  xpti::addMetadata(CmdTraceEvent, "memory_object",
1462  reinterpret_cast<size_t>(MAddress));
1463  xpti::addMetadata(CmdTraceEvent, "copy_from",
1464  reinterpret_cast<size_t>(
1465  getSyclObjImpl(MSrcQueue->get_device()).get()));
1466  xpti::addMetadata(
1467  CmdTraceEvent, "copy_to",
1468  reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
1470  }
1471 #endif
1472 }
1473 
1475  return getWorkerQueue()->getContextImplPtr();
1476 }
1477 
1479  // TODO remove this workaround once the batching issue is addressed in Level
1480  // Zero plugin.
1481  // Consider the following scenario on Level Zero:
1482  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1483  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1484  // 3. queueA.wait().
1485  // 4. queueB.wait().
1486  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1487  // dependencies (i.e. they were waited for prior to enqueueing any command
1488  // that's dependent on them). This allowed Level Zero plugin to detect that
1489  // each queue is idle on steps 1/2 and submit the command list right away.
1490  // This is no longer the case since we started passing these dependencies in
1491  // an event waitlist and Level Zero plugin attempts to batch these commands,
1492  // so the execution of kernel B starts only on step 4. This workaround
1493  // restores the old behavior in this case until this is resolved.
1494  return MQueue->is_host() ||
1495  MQueue->getDeviceImplPtr()->getBackend() !=
1497  MEvent->getHandleRef() != nullptr;
1498 }
1499 
1500 pi_int32 MemCpyCommand::enqueueImp() {
1502  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1503 
1504  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1505 
1506  auto RawEvents = getPiEvents(EventImpls);
1507  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1508 
1510  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1511  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1512  MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
1513  MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
1514  MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event, MEvent);
1515 
1516  return PI_SUCCESS;
1517 }
1518 
1519 void MemCpyCommand::printDot(std::ostream &Stream) const {
1520  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1521 
1522  Stream << "ID = " << this << " ; ";
1523  Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n";
1524  Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
1525  << "\\n";
1526  Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
1527  << "\\n";
1528 
1529  Stream << "\"];" << std::endl;
1530 
1531  for (const auto &Dep : MDeps) {
1532  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1533  << " [ label = \"Access mode: "
1534  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1535  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1536  << std::endl;
1537  }
1538 }
1539 
1540 AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
1541  for (const DepDesc &Dep : MDeps) {
1542  if (Dep.MDepRequirement == Req)
1543  return Dep.MAllocaCmd;
1544  }
1545  // Default constructed accessors do not add dependencies, but they can be
1546  // passed to commands. Simply return nullptr, since they are empty and don't
1547  // really require any memory.
1548  return nullptr;
1549 }
1550 
1551 std::vector<std::shared_ptr<const void>>
1553  if (MCommandGroup->getType() == CG::Kernel)
1554  return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1555  return {};
1556 }
1557 
1559  if (MCommandGroup->getType() == CG::Kernel)
1560  ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
1561 }
1562 
1563 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1565  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1566  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1567  Command::waitForEvents(MQueue, EventImpls, Event);
1568 
1569  assert(MSrcAllocaCmd && "Expected valid alloca command");
1570  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1571  assert(MDstPtr && "Expected valid target pointer");
1572  *MDstPtr = MSrcAllocaCmd->getMemAllocation();
1573 
1574  return PI_SUCCESS;
1575 }
1576 
1577 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1578  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1579 
1580  Stream << "ID = " << this << "\\n";
1581  Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n";
1582  bool IsReqOnBuffer =
1583  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1584  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1585  if (IsReqOnBuffer)
1586  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1587  << "\\n";
1588 
1589  Stream << "\"];" << std::endl;
1590 
1591  for (const auto &Dep : MDeps) {
1592  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1593  << " [ label = \"Access mode: "
1594  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1595  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1596  << std::endl;
1597  }
1598 }
1599 
1601  AllocaCommandBase *SrcAllocaCmd,
1602  Requirement DstReq, void **DstPtr,
1603  QueueImplPtr SrcQueue,
1604  QueueImplPtr DstQueue)
1605  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1606  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1607  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1608  if (!MSrcQueue->is_host()) {
1609  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1610  }
1611 
1612  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
1613  MEvent->setWorkerQueue(MWorkerQueue);
1614 
1616 }
1617 
1619 #ifdef XPTI_ENABLE_INSTRUMENTATION
1620  if (!xptiCheckTraceEnabled(MStreamID))
1621  return;
1622  // Create a payload with the command name and an event using this payload to
1623  // emit a node_create
1624  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1626 
1627  if (MFirstInstance) {
1628  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1629  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1630  deviceToID(MQueue->get_device()));
1631  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1632  deviceToString(MQueue->get_device()));
1633  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1634  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1635  xpti::addMetadata(CmdTraceEvent, "memory_object",
1636  reinterpret_cast<size_t>(MAddress));
1637  xpti::addMetadata(CmdTraceEvent, "copy_from",
1638  reinterpret_cast<size_t>(
1639  getSyclObjImpl(MSrcQueue->get_device()).get()));
1640  xpti::addMetadata(
1641  CmdTraceEvent, "copy_to",
1642  reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
1644  }
1645 #endif
1646 }
1647 
1649  return getWorkerQueue()->getContextImplPtr();
1650 }
1651 
1652 pi_int32 MemCpyCommandHost::enqueueImp() {
1653  const QueueImplPtr &Queue = getWorkerQueue();
1655  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1656  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1657 
1658  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1659  // Omit copying if mode is discard one.
1660  // TODO: Handle this at the graph building time by, for example, creating
1661  // empty node instead of memcpy.
1664  Command::waitForEvents(Queue, EventImpls, Event);
1665 
1666  return PI_SUCCESS;
1667  }
1668 
1669  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1671  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1672  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1673  MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
1674  MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
1675  MDstReq.MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
1676 
1677  return PI_SUCCESS;
1678 }
1679 
1681  : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
1683 }
1684 
1685 pi_int32 EmptyCommand::enqueueImp() {
1687  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
1688 
1689  return PI_SUCCESS;
1690 }
1691 
1693  const Requirement *Req) {
1694  const Requirement &ReqRef = *Req;
1695  MRequirements.emplace_back(ReqRef);
1696  const Requirement *const StoredReq = &MRequirements.back();
1697 
1698  // EmptyCommand is always host one, so we believe that result of addDep is
1699  // nil
1700  std::vector<Command *> ToCleanUp;
1701  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1702  assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
1703  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1704  "there's a connection command");
1705  (void)Cmd;
1706 }
1707 
1709 #ifdef XPTI_ENABLE_INSTRUMENTATION
1710  if (!xptiCheckTraceEnabled(MStreamID))
1711  return;
1712  // Create a payload with the command name and an event using this payload to
1713  // emit a node_create
1714  if (MRequirements.empty())
1715  return;
1716 
1717  Requirement &Req = *MRequirements.begin();
1718 
1719  MAddress = Req.MSYCLMemObj;
1721 
1722  if (MFirstInstance) {
1723  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1724  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1725  deviceToID(MQueue->get_device()));
1726  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1727  deviceToString(MQueue->get_device()));
1728  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1729  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1730  xpti::addMetadata(CmdTraceEvent, "memory_object",
1731  reinterpret_cast<size_t>(MAddress));
1733  }
1734 #endif
1735 }
1736 
1737 void EmptyCommand::printDot(std::ostream &Stream) const {
1738  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1739 
1740  Stream << "ID = " << this << "\\n";
1741  Stream << "EMPTY NODE"
1742  << "\\n";
1743 
1744  Stream << "\"];" << std::endl;
1745 
1746  for (const auto &Dep : MDeps) {
1747  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1748  << " [ label = \"Access mode: "
1749  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1750  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1751  << std::endl;
1752  }
1753 }
1754 
1755 bool EmptyCommand::producesPiEvent() const { return false; }
1756 
1757 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1758  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1759 
1760  Stream << "ID = " << this << "\\n";
1761  Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n";
1762 
1763  Stream << "\"];" << std::endl;
1764 
1765  for (const auto &Dep : MDeps) {
1766  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1767  << " [ label = \"Access mode: "
1768  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1769  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1770  << std::endl;
1771  }
1772 }
1773 
1775  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1776  void **DstPtr)
1777  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1778  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1779 
1781 }
1782 
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // NOTE(review): the listing dropped this member function's signature line
  // (it reads MSrcAllocaCmd, so it belongs to UpdateHostRequirementCommand)
  // — confirm against the original file.
  // Skip all work when no tool subscribes to this trace stream.
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  // NOTE(review): a line is missing here in this listing (presumably the
  // trace-event prolog call) — verify against the original file.

  if (MFirstInstance) {
    // Device and memory-object metadata is attached only the first time this
    // event instance is seen.
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    // NOTE(review): a trailing line is missing here in this listing
    // (presumably the trace-event epilog call) — verify.
  }
#endif
}
1806 
1807 static std::string cgTypeToString(detail::CG::CGTYPE Type) {
1808  switch (Type) {
1809  case detail::CG::Kernel:
1810  return "Kernel";
1811  break;
1813  return "update_host";
1814  break;
1815  case detail::CG::Fill:
1816  return "fill";
1817  break;
1819  return "copy acc to acc";
1820  break;
1822  return "copy acc to ptr";
1823  break;
1825  return "copy ptr to acc";
1826  break;
1827  case detail::CG::CopyUSM:
1828  return "copy usm";
1829  break;
1830  case detail::CG::FillUSM:
1831  return "fill usm";
1832  break;
1834  return "prefetch usm";
1835  break;
1837  return "host task";
1838  break;
1839  case detail::CG::Copy2DUSM:
1840  return "copy 2d usm";
1841  break;
1842  case detail::CG::Fill2DUSM:
1843  return "fill 2d usm";
1844  break;
1846  return "memset 2d usm";
1847  break;
1849  return "copy to device_global";
1850  break;
1852  return "copy from device_global";
1853  break;
1854  default:
1855  return "unknown";
1856  break;
1857  }
1858 }
1859 
// NOTE(review): the listing dropped the line naming this constructor
// (ExecCGCommand per the RUN_CG command type) and one parameter line (the
// command buffer forwarded to the base Command) — verify against the
// original file.
    std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
    const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies)
    : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer,
              Dependencies),
      MCommandGroup(std::move(CommandGroup)) {
  // Host tasks carry their own submission queue inside the CG; reflect it on
  // the event so waiters observe the right queue.
  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
    MEvent->setSubmittedQueue(
        static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
  }
  // NOTE(review): a trailing body line is missing from this listing —
  // verify against the original file.
}
1874 
1875 #ifdef XPTI_ENABLE_INSTRUMENTATION
1876 std::string instrumentationGetKernelName(
1877  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1878  const std::string &FunctionName, const std::string &SyclKernelName,
1879  void *&Address, std::optional<bool> &FromSource) {
1880  std::string KernelName;
1881  if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1882  FromSource = true;
1883  pi_kernel KernelHandle = SyclKernel->getHandleRef();
1884  Address = KernelHandle;
1885  KernelName = FunctionName;
1886  } else {
1887  FromSource = false;
1888  KernelName = demangleKernelName(SyclKernelName);
1889  }
1890  return KernelName;
1891 }
1892 
// Attaches kernel-launch metadata (ND-range geometry and the filtered
// argument list) to an XPTI trace event for an ExecCG node.
void instrumentationAddExtraKernelMetadata(
    xpti_td *&CmdTraceEvent, const NDRDescT &NDRDesc,
    const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
    const std::string &KernelName,
    const std::shared_ptr<detail::kernel_impl> &SyclKernel,
    const QueueImplPtr &Queue,
    std::vector<ArgDesc> &CGArgs) // CGArgs are not const since they could be
                                  // sorted in this function
{
  std::vector<ArgDesc> Args;

  // Collects only the arguments that survive dead-argument elimination,
  // renumbered to their true kernel indices.
  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
    Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
  };
  sycl::detail::pi::PiProgram Program = nullptr;
  sycl::detail::pi::PiKernel Kernel = nullptr;
  std::mutex *KernelMutex = nullptr;
  const KernelArgMask *EliminatedArgMask = nullptr;

  std::shared_ptr<kernel_impl> SyclKernelImpl;
  std::shared_ptr<device_image_impl> DeviceImageImpl;

  // Use kernel_bundle if available unless it is interop.
  // Interop bundles can't be used in the first branch, because the
  // kernels in interop kernel bundles (if any) do not have kernel_id and
  // can therefore not be looked up, but since they are self-contained
  // they can simply be launched directly.
  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
    kernel_id KernelID =
    // NOTE(review): the initializer line is missing from this listing
    // (presumably a kernel-id lookup by KernelName) — verify against the
    // original file.
    kernel SyclKernel =
        KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
    std::shared_ptr<kernel_impl> KernelImpl =
        detail::getSyclObjImpl(SyclKernel);

    EliminatedArgMask = KernelImpl->getKernelArgMask();
    Program = KernelImpl->getDeviceImage()->get_program_ref();
  } else if (nullptr != SyclKernel) {
    auto SyclProg = SyclKernel->getProgramImpl();
    Program = SyclProg->getHandleRef();
    if (!SyclKernel->isCreatedFromSource())
      EliminatedArgMask = SyclKernel->getKernelArgMask();
  } else {
    std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
    // NOTE(review): the callee line is missing from this listing (a kernel
    // lookup/creation helper) — verify against the original file.
            Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName,
            nullptr);
  }

  applyFuncOnFilteredArgs(EliminatedArgMask, CGArgs, FilterArgs);

  // Pack global/local/offset geometry plus the surviving argument count.
  xpti::offload_kernel_enqueue_data_t KernelData{
      {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
      {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
      {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
       NDRDesc.GlobalOffset[2]},
      Args.size()};
  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
  // One metadata entry per surviving argument: "arg0", "arg1", ...
  for (size_t i = 0; i < Args.size(); i++) {
    std::string Prefix("arg");
    xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
                                        Args[i].MSize, Args[i].MIndex};
    xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
  }
}
1958 
1959 void instrumentationFillCommonData(const std::string &KernelName,
1960  const std::string &FileName, uint64_t Line,
1961  uint64_t Column, const void *const Address,
1962  const QueueImplPtr &Queue,
1963  std::optional<bool> &FromSource,
1964  uint64_t &OutInstanceID,
1965  xpti_td *&OutTraceEvent) {
1966  // Get source file, line number information from the CommandGroup object
1967  // and create payload using name, address, and source info
1968  //
1969  // On Windows, since the support for builtin functions is not available in
1970  // MSVC, the MFileName, MLine will be set to nullptr and "0" respectively.
1971  // Handle this condition explicitly here.
1972  bool HasSourceInfo = false;
1973  xpti::payload_t Payload;
1974  if (!FileName.empty()) {
1975  // File name has a valid string
1976  Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
1977  Column, Address);
1978  HasSourceInfo = true;
1979  } else if (Address) {
1980  // We have a valid function name and an address
1981  Payload = xpti::payload_t(KernelName.c_str(), Address);
1982  } else {
1983  // In any case, we will have a valid function name and we'll use that to
1984  // create the hash
1985  Payload = xpti::payload_t(KernelName.c_str());
1986  }
1987 
1988  uint64_t CGKernelInstanceNo;
1989  // Create event using the payload
1990  xpti_td *CmdTraceEvent =
1991  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
1992  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
1993 
1994  if (CmdTraceEvent) {
1995  OutInstanceID = CGKernelInstanceNo;
1996  OutTraceEvent = CmdTraceEvent;
1997  // If we are seeing this event again, then the instance ID will be greater
1998  // than 1; in this case, we will skip sending a notification to create a
1999  // node as this node has already been created.
2000  if (CGKernelInstanceNo > 1)
2001  return;
2002 
2003  xpti::addMetadata(CmdTraceEvent, "sycl_device",
2004  deviceToID(Queue->get_device()));
2005  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2006  deviceToString(Queue->get_device()));
2007  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2008  getSyclObjImpl(Queue->get_device())->getDeviceName());
2009  if (!KernelName.empty()) {
2010  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
2011  }
2012  if (FromSource.has_value()) {
2013  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
2014  }
2015  if (HasSourceInfo) {
2016  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
2017  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName);
2018  xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast<int>(Line));
2019  xpti::addMetadata(CmdTraceEvent, "sym_column_no",
2020  static_cast<int>(Column));
2021  }
2022  }
2023 }
2024 #endif
2025 
// NOTE(review): the listing dropped the line with this function's name (a
// free function emitting kernel instrumentation data, per the parameter list
// and the RUN_CG node type below) — verify against the original file.
    const std::shared_ptr<detail::kernel_impl> &SyclKernel,
    const detail::code_location &CodeLoc, const std::string &SyclKernelName,
    const QueueImplPtr &Queue, const NDRDescT &NDRDesc,
    const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
    std::vector<ArgDesc> &CGArgs) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
  int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
  // Nothing to do when no tool listens for node-create events.
  if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType))
    return;

  void *Address = nullptr;
  std::optional<bool> FromSource;
  std::string KernelName = instrumentationGetKernelName(
      SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
      FromSource);

  xpti_td *CmdTraceEvent = nullptr;
  uint64_t InstanceID = -1;
  std::string FileName =
      CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
  instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
                                CodeLoc.columnNumber(), Address, Queue,
                                FromSource, InstanceID, CmdTraceEvent);

  if (CmdTraceEvent) {
    instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
                                          KernelBundleImplPtr, SyclKernelName,
                                          SyclKernel, Queue, CGArgs);

    // Announce the new graph node to subscribers.
    xptiNotifySubscribers(
        StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
        InstanceID,
        static_cast<const void *>(
            commandToNodeType(Command::CommandType::RUN_CG).c_str()));
  }
#else
  // Silence unused-parameter warnings in non-instrumented builds.
  std::ignore = SyclKernel;
  std::ignore = CodeLoc;
  std::ignore = SyclKernelName;
  std::ignore = Queue;
  std::ignore = NDRDesc;
  std::ignore = KernelBundleImplPtr;
  std::ignore = CGArgs;
#endif
}
2073 
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // NOTE(review): the listing dropped this member function's signature line
  // (an ExecCGCommand instrumentation emitter, given the MCommandGroup and
  // MTraceEvent usage) — verify against the original file.
  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
  if (!xptiCheckTraceEnabled(MStreamID, NotificationTraceType))
    return;

  // Derive a display name: the resolved kernel name for kernel CGs, the
  // CG-type string otherwise.
  std::string KernelName;
  std::optional<bool> FromSource;
  switch (MCommandGroup->getType()) {
  case detail::CG::Kernel: {
    auto KernelCG =
        reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
    KernelName = instrumentationGetKernelName(
        KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
        KernelCG->getKernelName(), MAddress, FromSource);
  } break;
  default:
    KernelName = cgTypeToString(MCommandGroup->getType());
    break;
  }

  xpti_td *CmdTraceEvent = nullptr;
  instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
                                MCommandGroup->MLine, MCommandGroup->MColumn,
                                MAddress, MQueue, FromSource, MInstanceID,
                                CmdTraceEvent);

  if (CmdTraceEvent) {
    MTraceEvent = static_cast<void *>(CmdTraceEvent);
    // Kernel CGs additionally get ND-range and argument metadata.
    if (MCommandGroup->getType() == detail::CG::Kernel) {
      auto KernelCG =
          reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
      instrumentationAddExtraKernelMetadata(
          CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
          KernelCG->MKernelName, KernelCG->MSyclKernel, MQueue,
          KernelCG->MArgs);
    }

    xptiNotifySubscribers(
        MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
        CmdTraceEvent, MInstanceID,
        static_cast<const void *>(commandToNodeType(MType).c_str()));
  }
#endif
}
2119 
2120 void ExecCGCommand::printDot(std::ostream &Stream) const {
2121  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2122 
2123  Stream << "ID = " << this << "\\n";
2124  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
2125 
2126  switch (MCommandGroup->getType()) {
2127  case detail::CG::Kernel: {
2128  auto KernelCG =
2129  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2130  Stream << "Kernel name: ";
2131  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2132  Stream << "created from source";
2133  else
2134  Stream << demangleKernelName(KernelCG->getKernelName());
2135  Stream << "\\n";
2136  break;
2137  }
2138  default:
2139  Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
2140  break;
2141  }
2142 
2143  Stream << "\"];" << std::endl;
2144 
2145  for (const auto &Dep : MDeps) {
2146  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2147  << " [ label = \"Access mode: "
2148  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2149  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2150  << std::endl;
2151  }
2152 }
2153 
2154 // SYCL has a parallel_for_work_group variant where the only NDRange
2155 // characteristics set by a user is the number of work groups. This does not
2156 // map to the OpenCL clEnqueueNDRangeAPI, which requires global work size to
2157 // be set as well. This function determines local work size based on the
2158 // device characteristics and the number of work groups requested by the user,
2159 // then calculates the global work size. SYCL specification (from 4.8.5.3):
2160 // The member function handler::parallel_for_work_group is parameterized by
2161 // the number of work - groups, such that the size of each group is chosen by
2162 // the runtime, or by the number of work - groups and number of work - items
2163 // for users who need more control.
// NOTE(review): the listing dropped this function's opening signature lines
// (adjustNDRangePerKernel taking the ND-range descriptor, a PI kernel, and
// the device, per the call in SetKernelParamsAndLaunch) — verify against
// the original file.
    const device_impl &DeviceImpl) {
  if (NDR.GlobalSize[0] != 0)
    return; // GlobalSize is set - no need to adjust
  // check the prerequisites:
  assert(NDR.LocalSize[0] == 0);
  // TODO might be good to cache this info together with the kernel info to
  // avoid get_kernel_work_group_info on every kernel run
  // NOTE(review): the line opening this statement (declaring WGSize via a
  // compile_work_group_size query) is missing from the listing — verify.
      sycl::info::kernel_device_specific::compile_work_group_size>(
      Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());

  // A zero first component means no compile-time WG size was specified;
  // fall back to a 1x1x1 group.
  if (WGSize[0] == 0) {
    WGSize = {1, 1, 1};
  }
  NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
}
2182 
2183 // We have the following mapping between dimensions with SPIR-V builtins:
2184 // 1D: id[0] -> x
2185 // 2D: id[0] -> y, id[1] -> x
2186 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
2187 // So in order to ensure the correctness we update all the kernel
2188 // parameters accordingly.
2189 // Initially we keep the order of NDRDescT as it provided by the user, this
2190 // simplifies overall handling and do the reverse only when
2191 // the kernel is enqueued.
// NOTE(review): the listing dropped the preceding line with this function's
// signature (ReverseRangeDimensionsForKernel(NDRDescT &) per the comment
// above) — verify against the original file.
  if (NDR.Dims > 1) {
    // Exchange the outermost and innermost used dimensions to match the
    // SPIR-V builtin ordering described in the comment above this function.
    std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
    std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
    std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
  }
}
2199 
// NOTE(review): the listing dropped this function's signature line (a
// mapping from an accessor access::mode to a PI access qualifier, per the
// call site in SetArgBasedOnType) and one case label below — verify
// against the original file.
  switch (AccessorMode) {
  case access::mode::read:
    return PI_ACCESS_READ_ONLY;
  case access::mode::write:
  // NOTE(review): a second case label is missing here in the listing; it
  // falls through to the write-only mapping below — verify.
    return PI_ACCESS_WRITE_ONLY;
  default:
    // All remaining modes need both read and write access.
    return PI_ACCESS_READ_WRITE;
  }
}
2211 
// NOTE(review): the rendered listing dropped this function's signature line
// and every `case` label of the switch below; the code is kept byte-for-byte
// with comments marking each gap — verify against the original file.
    const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel,
    const std::shared_ptr<device_image_impl> &DeviceImageImpl,
    const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
    const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg,
    size_t NextTrueIndex) {
  switch (Arg.MType) {
  // NOTE(review): missing case label (a no-op argument kind).
    break;
  // NOTE(review): missing case label; the body below binds an
  // accessor/requirement argument as a PI memory object.
    Requirement *Req = (Requirement *)(Arg.MPtr);

    // getMemAllocationFunc is nullptr when there are no requirements. However,
    // we may pass default constructed accessors to a command, which don't add
    // requirements. In such case, getMemAllocationFunc is nullptr, but it's a
    // valid case, so we need to properly handle it.
    sycl::detail::pi::PiMem MemArg =
        getMemAllocationFunc
            ? (sycl::detail::pi::PiMem)getMemAllocationFunc(Req)
            : nullptr;
    if (Context.get_backend() == backend::opencl) {
      // clSetKernelArg (corresponding to piKernelSetArg) returns an error
      // when MemArg is null, which is the case when zero-sized buffers are
      // handled. Below assignment provides later call to clSetKernelArg with
      // acceptable arguments.
      if (!MemArg)
        MemArg = sycl::detail::pi::PiMem();

      Plugin->call<PiApiKind::piKernelSetArg>(
          Kernel, NextTrueIndex, sizeof(sycl::detail::pi::PiMem), &MemArg);
    } else {
      // Non-OpenCL backends take the access qualifier alongside the handle.
      pi_mem_obj_property MemObjData{};
      MemObjData.mem_access = AccessModeToPi(Req->MAccessMode);
      MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
      Plugin->call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
                                                       &MemObjData, &MemArg);
    }
    break;
  }
  // NOTE(review): missing case label; plain by-value (std-layout) argument.
    Plugin->call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
                                            Arg.MPtr);
    break;
  }
  // NOTE(review): missing case label; sampler argument.
    sampler *SamplerPtr = (sampler *)Arg.MPtr;
    sycl::detail::pi::PiSampler Sampler =
        detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context);
    Plugin->call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
                                                      &Sampler);
    break;
  }
  // NOTE(review): missing case label; USM/raw pointer argument.
    Plugin->call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
                                                      Arg.MSize, Arg.MPtr);
    break;
  }
  // NOTE(review): missing case label; specialization-constants buffer.
    if (IsHost) {
      throw sycl::exception(
          sycl::make_error_code(sycl::errc::feature_not_supported),
          "SYCL2020 specialization constants are not yet supported on host "
          "device " +
              codeToString(PI_ERROR_INVALID_OPERATION));
    }
    assert(DeviceImageImpl != nullptr);
    sycl::detail::pi::PiMem SpecConstsBuffer =
        DeviceImageImpl->get_spec_const_buffer_ref();
    // Avoid taking an address of nullptr
    sycl::detail::pi::PiMem *SpecConstsBufferArg =
        SpecConstsBuffer ? &SpecConstsBuffer : nullptr;

    pi_mem_obj_property MemObjData{};
    MemObjData.mem_access = PI_ACCESS_READ_ONLY;
    MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
    Plugin->call<PiApiKind::piextKernelSetArgMemObj>(
        Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg);
    break;
  }
  // NOTE(review): missing case label; invalid argument kind.
    throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
                          "Invalid kernel param kind " +
                              codeToString(PI_ERROR_INVALID_VALUE));
    break;
  }
}
2298 
// NOTE(review): the listing dropped this function's signature line
// (SetKernelParamsAndLaunch, per the call in enqueueImpKernel below) and one
// interior line marked below — verify against the original file.
    const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
    const std::shared_ptr<device_image_impl> &DeviceImageImpl,
    sycl::detail::pi::PiKernel Kernel, NDRDescT &NDRDesc,
    std::vector<sycl::detail::pi::PiEvent> &RawEvents,
    const detail::EventImplPtr &OutEventImpl,
    const KernelArgMask *EliminatedArgMask,
    const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
  const PluginPtr &Plugin = Queue->getPlugin();

  // Binds each surviving (non-eliminated) argument to the PI kernel.
  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
                  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
    SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
                      Queue->get_context(), Queue->is_host(), Arg,
                      NextTrueIndex);
  };

  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);

  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));

  // Remember this information before the range dimensions are reversed
  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
  // NOTE(review): a line is missing here in the listing (presumably the
  // dimension-reversal call) — verify against the original file.

  size_t RequiredWGSize[3] = {0, 0, 0};
  size_t *LocalSize = nullptr;

  if (HasLocalSize)
    LocalSize = &NDRDesc.LocalSize[0];
  else {
    // No user-provided local size: honor a compile-time required WG size if
    // the kernel declares one; otherwise let the backend pick.
    Plugin->call<PiApiKind::piKernelGetGroupInfo>(
        Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
        PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
        RequiredWGSize, /* param_value_size_ret = */ nullptr);

    const bool EnforcedLocalSize =
        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
         RequiredWGSize[2] != 0);
    if (EnforcedLocalSize)
      LocalSize = RequiredWGSize;
  }
  // Record the host-side enqueue timestamp for profiling, if requested.
  if (OutEventImpl != nullptr)
    OutEventImpl->setHostEnqueueTime();
  pi_result Error = Plugin->call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
      Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
      &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
      RawEvents.empty() ? nullptr : &RawEvents[0],
      OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
  return Error;
}
2351 
2352 // The function initialize accessors and calls lambda.
2353 void DispatchNativeKernel(void *Blob) {
2354  void **CastedBlob = (void **)Blob;
2355 
2356  std::vector<Requirement *> *Reqs =
2357  static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2358 
2359  std::shared_ptr<HostKernelBase> *HostKernel =
2360  static_cast<std::shared_ptr<HostKernelBase> *>(CastedBlob[1]);
2361 
2362  NDRDescT *NDRDesc = static_cast<NDRDescT *>(CastedBlob[2]);
2363 
2364  // Other value are pointer to the buffers.
2365  void **NextArg = CastedBlob + 3;
2366  for (detail::Requirement *Req : *Reqs)
2367  Req->MData = *(NextArg++);
2368 
2369  (*HostKernel)->call(*NDRDesc, nullptr);
2370 
2371  // The ownership of these objects have been passed to us, need to cleanup
2372  delete Reqs;
2373  delete HostKernel;
2374  delete NDRDesc;
2375 }
2376 
// NOTE(review): the listing dropped this function's signature line
// (enqueueImpCommandBufferKernel per the call in enqueueImpCommandBuffer)
// plus one parameter line (the target command buffer, used as CommandBuffer
// below) and several interior lines marked inline — verify against the
// original file.
    context Ctx, DeviceImplPtr DeviceImpl,
    const CGExecKernel &CommandGroup,
    std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints,
    sycl::detail::pi::PiExtSyncPoint *OutSyncPoint,
    const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
  auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
  pi_kernel PiKernel = nullptr;
  std::mutex *KernelMutex = nullptr;
  pi_program PiProgram = nullptr;

  // Prefer a pre-created kernel handle; otherwise look the kernel up (or
  // create it) by name.
  auto Kernel = CommandGroup.MSyclKernel;
  const KernelArgMask *EliminatedArgMask = nullptr;
  if (Kernel != nullptr) {
    PiKernel = Kernel->getHandleRef();
  } else {
    std::tie(PiKernel, KernelMutex, EliminatedArgMask, PiProgram) =
        sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
            ContextImpl, DeviceImpl, CommandGroup.MKernelName, nullptr);
  }

  auto SetFunc = [&Plugin, &PiKernel, &Ctx, &getMemAllocationFunc](
                     sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) {
    // NOTE(review): the line naming the callee is missing from this listing
    // (an argument-setting helper taking the arguments below) — verify.
        Plugin, PiKernel,
        nullptr /* TODO: Handle spec constants and pass device image here */
        ,
        getMemAllocationFunc, Ctx, false, Arg, NextTrueIndex);
  };
  // Copy args for modification
  auto Args = CommandGroup.MArgs;
  sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc);

  // Remember this information before the range dimensions are reversed
  const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0);

  // Copy NDRDesc for modification
  auto NDRDesc = CommandGroup.MNDRDesc;
  // Reverse kernel dims
  // NOTE(review): the line with the reversal call is missing from this
  // listing — verify against the original file.

  size_t RequiredWGSize[3] = {0, 0, 0};
  size_t *LocalSize = nullptr;

  if (HasLocalSize)
    LocalSize = &NDRDesc.LocalSize[0];
  else {
    // NOTE(review): the line naming the group-info query call is missing
    // from this listing — verify.
        PiKernel, DeviceImpl->getHandleRef(),
        PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
        RequiredWGSize,
        /* param_value_size_ret = */ nullptr);

    const bool EnforcedLocalSize =
        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
         RequiredWGSize[2] != 0);
    if (EnforcedLocalSize)
      LocalSize = RequiredWGSize;
  }

  pi_result Res = Plugin->call_nocheck<
  // NOTE(review): the template-argument line (the command-buffer NDRange
  // enqueue entry point) is missing from this listing — verify.
      CommandBuffer, PiKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
      &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(),
      SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint);

  if (Res != pi_result::PI_SUCCESS) {
    // NOTE(review): the line opening this throw statement is missing from
    // this listing — verify.
        "Failed to add kernel to PI command-buffer");
  }

  return Res;
}
2452 
// NOTE(review): the listing dropped this function's signature line
// (enqueueImpKernel, given the parameter list and call pattern) and several
// interior lines marked inline — verify against the original file.
    const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
    const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
    const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
    const std::string &KernelName,
    std::vector<sycl::detail::pi::PiEvent> &RawEvents,
    const detail::EventImplPtr &OutEventImpl,
    const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
    sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig) {

  // Run OpenCL kernel
  auto ContextImpl = Queue->getContextImplPtr();
  auto DeviceImpl = Queue->getDeviceImplPtr();
  sycl::detail::pi::PiKernel Kernel = nullptr;
  std::mutex *KernelMutex = nullptr;
  sycl::detail::pi::PiProgram Program = nullptr;
  const KernelArgMask *EliminatedArgMask;

  std::shared_ptr<kernel_impl> SyclKernelImpl;
  std::shared_ptr<device_image_impl> DeviceImageImpl;

  // Use kernel_bundle if available unless it is interop.
  // Interop bundles can't be used in the first branch, because the kernels
  // in interop kernel bundles (if any) do not have kernel_id
  // and can therefore not be looked up, but since they are self-contained
  // they can simply be launched directly.
  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
    kernel_id KernelID =
    // NOTE(review): the initializer line is missing from this listing
    // (presumably a kernel-id lookup by KernelName) — verify.
    kernel SyclKernel =
        KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);

    SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);

    Kernel = SyclKernelImpl->getHandleRef();
    DeviceImageImpl = SyclKernelImpl->getDeviceImage();

    Program = DeviceImageImpl->get_program_ref();

    std::tie(Kernel, KernelMutex, EliminatedArgMask) =
    // NOTE(review): the callee line is missing from this listing (a kernel
    // cache lookup/creation helper) — verify.
            KernelBundleImplPtr->get_context(), KernelName,
            /*PropList=*/{}, Program);
  } else if (nullptr != MSyclKernel) {
    assert(MSyclKernel->get_info<info::kernel::context>() ==
           Queue->get_context());
    Kernel = MSyclKernel->getHandleRef();
    auto SyclProg = MSyclKernel->getProgramImpl();
    Program = SyclProg->getHandleRef();
    if (SyclProg->is_cacheable()) {
      sycl::detail::pi::PiKernel FoundKernel = nullptr;
      std::tie(FoundKernel, KernelMutex, EliminatedArgMask, std::ignore) =
      // NOTE(review): the callee line is missing from this listing — verify.
              ContextImpl, DeviceImpl, KernelName, SyclProg.get());
      assert(FoundKernel == Kernel);
    } else {
      // Non-cacheable kernels use mutexes from kernel_impls.
      // TODO this can still result in a race condition if multiple SYCL
      // kernels are created with the same native handle. To address this,
      // we need to either store and use a pi_native_handle -> mutex map or
      // reuse and return existing SYCL kernels from make_native to avoid
      // their duplication in such cases.
      KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
      EliminatedArgMask = MSyclKernel->getKernelArgMask();
    }
  } else {
    std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
    // NOTE(review): the callee line is missing from this listing — verify.
            ContextImpl, DeviceImpl, KernelName, nullptr);
  }

  // We may need more events for the launch, so we make another reference.
  std::vector<sycl::detail::pi::PiEvent> &EventsWaitList = RawEvents;

  // Initialize device globals associated with this.
  std::vector<sycl::detail::pi::PiEvent> DeviceGlobalInitEvents =
      ContextImpl->initializeDeviceGlobals(Program, Queue);
  std::vector<sycl::detail::pi::PiEvent> EventsWithDeviceGlobalInits;
  if (!DeviceGlobalInitEvents.empty()) {
    EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
                                        DeviceGlobalInitEvents.size());
    EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
                                       RawEvents.begin(), RawEvents.end());
    EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
                                       DeviceGlobalInitEvents.begin(),
                                       DeviceGlobalInitEvents.end());
    // NOTE(review): EventsWaitList is a reference bound to RawEvents, so
    // this assignment copies the merged list back into the caller's
    // RawEvents vector — presumably intentional; verify against the
    // original file.
    EventsWaitList = EventsWithDeviceGlobalInits;
  }

  pi_result Error = PI_SUCCESS;
  {
    // Serialize argument setting + launch on the kernel's mutex: PI kernel
    // argument state is per-handle, not per-enqueue.
    assert(KernelMutex);
    std::lock_guard<std::mutex> Lock(*KernelMutex);

    // Set SLM/Cache configuration for the kernel if non-default value is
    // provided.
    if (KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM ||
        KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) {
      const PluginPtr &Plugin = Queue->getPlugin();
      Plugin->call<PiApiKind::piKernelSetExecInfo>(
      // NOTE(review): the first-argument line of this call is missing from
      // this listing — verify.
          sizeof(sycl::detail::pi::PiKernelCacheConfig), &KernelCacheConfig);
    }

    Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
                                     NDRDesc, EventsWaitList, OutEventImpl,
                                     EliminatedArgMask, getMemAllocationFunc);
  }
  if (PI_SUCCESS != Error) {
    // If we have got non-success error code, let's analyze it to emit nice
    // exception explaining what was wrong
    const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
    // NOTE(review): the line opening the error-analysis call is missing
    // from this listing — verify.
        Kernel, NDRDesc);
  }

  return PI_SUCCESS;
}
2571 
// Enqueues a blocking or non-blocking host-pipe read/write through the PI
// plugin; `read` selects the direction. Returns the raw PI result code.
// NOTE(review): four interior lines are missing from this listing (marked
// inline) — verify against the original file.
pi_int32
enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName,
                         bool blocking, void *ptr, size_t size,
                         std::vector<sycl::detail::pi::PiEvent> &RawEvents,
                         const detail::EventImplPtr &OutEventImpl, bool read) {
  detail::HostPipeMapEntry *hostPipeEntry =
  // NOTE(review): the initializer line is missing from this listing (a
  // host-pipe registry lookup by PipeName) — verify.

  sycl::detail::pi::PiProgram Program = nullptr;
  device Device = Queue->get_device();
  ContextImplPtr ContextImpl = Queue->getContextImplPtr();
  // Reuse a cached program containing this pipe if one exists for the
  // device; otherwise build it now.
  std::optional<sycl::detail::pi::PiProgram> CachedProgram =
      ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
  if (CachedProgram)
    Program = *CachedProgram;
  else {
    // If there was no cached program, build one.
    device_image_plain devImgPlain =
    // NOTE(review): the callee line is missing from this listing (a device
    // image lookup from the pipe's binary image) — verify.
            hostPipeEntry->getDevBinImage(), Queue->get_context(),
            Queue->get_device());
    device_image_plain BuiltImage =
        ProgramManager::getInstance().build(devImgPlain, {Device}, {});
    Program = getSyclObjImpl(BuiltImage)->get_program_ref();
  }
  assert(Program && "Program for this hostpipe is not compiled.");

  // Get plugin for calling opencl functions
  const PluginPtr &Plugin = Queue->getPlugin();

  pi_queue pi_q = Queue->getHandleRef();
  pi_result Error;

  auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr;
  // Record the host-side enqueue timestamp for profiling, if requested.
  if (OutEventImpl != nullptr)
    OutEventImpl->setHostEnqueueTime();
  if (read) {
    Error =
    // NOTE(review): the call line is missing from this listing (the PI
    // host-pipe read entry point) — verify.
            pi_q, Program, PipeName.c_str(), blocking, ptr, size,
            RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
            OutEvent);
  } else {
    Error =
        Plugin
    // NOTE(review): the call line is missing from this listing (the PI
    // host-pipe write entry point) — verify.
                pi_q, Program, PipeName.c_str(), blocking, ptr, size,
                RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
                OutEvent);
  }
  return Error;
}
2624 
// Enqueues this command group into the associated command buffer (graph
// recording path) instead of submitting it directly to a queue. Supported
// CG types here: Kernel, CopyUSM, CopyAccToAcc, CopyAccToPtr, CopyPtrToAcc;
// any other type throws. On success the resulting sync point is stored on
// MEvent so later nodes can depend on it.
// NOTE(review): several callee names in this listing were elided by the
// extraction (original lines 2649, 2663, 2677, 2693, 2708); verify the call
// targets against the upstream source before modifying this function.
2625 pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
2626  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2627  auto RawEvents = getPiEvents(EventImpls);
2628  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2629 
// Discard the output event only when the queue supports it and the CG has
// no requirements; otherwise point at this command's event handle.
2630  sycl::detail::pi::PiEvent *Event =
2631  (MQueue->has_discard_events_support() &&
2632  MCommandGroup->getRequirements().size() == 0)
2633  ? nullptr
2634  : &MEvent->getHandleRef();
2635  sycl::detail::pi::PiExtSyncPoint OutSyncPoint;
2636  switch (MCommandGroup->getType()) {
2637  case CG::CGTYPE::Kernel: {
2638  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2639 
// Resolves an accessor requirement to its device allocation.
2640  auto getMemAllocationFunc = [this](Requirement *Req) {
2641  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2642  return AllocaCmd->getMemAllocation();
2643  };
2644 
// Even with discarded events, a kernel that uses the assert extension
// needs a real event so the fallback assert handler can be run.
2645  if (!Event) {
2646  // Kernel only uses assert if it's non interop one
2647  bool KernelUsesAssert =
2648  !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2650  ExecKernel->MKernelName);
2651  if (KernelUsesAssert) {
2652  Event = &MEvent->getHandleRef();
2653  }
2654  }
2655  auto result = enqueueImpCommandBufferKernel(
2656  MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
2657  *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc);
2658  MEvent->setSyncPoint(OutSyncPoint);
2659  return result;
2660  }
2661  case CG::CGTYPE::CopyUSM: {
2662  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2664  MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
2665  Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2666  MEvent->setSyncPoint(OutSyncPoint);
2667  return PI_SUCCESS;
2668  }
2669  case CG::CGTYPE::CopyAccToAcc: {
2670  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2671  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2672  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2673 
2674  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2675  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2676 
2678  MQueue->getContextImplPtr(), MCommandBuffer,
2679  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2680  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2681  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2682  ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2683  ReqDst->MOffset, ReqDst->MElemSize, std::move(MSyncPointDeps),
2684  &OutSyncPoint);
2685  MEvent->setSyncPoint(OutSyncPoint);
2686  return PI_SUCCESS;
2687  }
2688  case CG::CGTYPE::CopyAccToPtr: {
2689  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2690  Requirement *Req = (Requirement *)Copy->getSrc();
2691  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2692 
2694  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2695  AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2696  Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(),
2697  Req->MDims, Req->MAccessRange,
2698  /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(MSyncPointDeps),
2699  &OutSyncPoint);
2700 
// NOTE(review): unlike the Kernel/CopyUSM/CopyAccToAcc cases above,
// OutSyncPoint is NOT recorded via MEvent->setSyncPoint in this case or
// the CopyPtrToAcc case below — confirm whether that is intentional.
2701  return PI_SUCCESS;
2702  }
2703  case CG::CGTYPE::CopyPtrToAcc: {
2704  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2705  Requirement *Req = (Requirement *)(Copy->getDst());
2706  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2707 
2709  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2710  (char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2711  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2712  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2713  Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2714 
2715  return PI_SUCCESS;
2716  }
2717  default:
2718  throw runtime_error("CG type not implemented for command buffers.",
2719  PI_ERROR_INVALID_OPERATION);
2720  }
2721 }
2722 
2723 pi_int32 ExecCGCommand::enqueueImp() {
2724  if (MCommandBuffer) {
2725  return enqueueImpCommandBuffer();
2726  } else {
2727  return enqueueImpQueue();
2728  }
2729 }
2730 
// Enqueues this command group directly to the underlying queue (the
// non-command-buffer path). Dispatches on the CG type: accessor/USM copies
// and fills, kernel launch, host task dispatch, barriers, device-global
// copies, host pipes, command-buffer execution, image copy, and external
// semaphore wait/signal. Returns PI_SUCCESS (or CL_SUCCESS — both 0) on
// success, or the result of the underlying enqueue call.
// NOTE(review): many callee names in this listing were elided by the
// extraction (e.g. original lines 2733, 2756, 2773, 2791, 2806, 2866, 2938,
// 2988, 2993, 3005, 3023, 3030, 3040, 3068, 3078, 3093, 3107); verify every
// call target against the upstream source before modifying this function.
2731 pi_int32 ExecCGCommand::enqueueImpQueue() {
// For non-host-task CGs, something is done here before gathering events;
// the statement on original line 2733 is elided in this view — confirm.
2732  if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2734  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2735  auto RawEvents = getPiEvents(EventImpls);
2736  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2737 
// Skip producing an output event when the queue supports discarding events
// and the CG has no accessor requirements.
2738  bool DiscardEvent = (MQueue->has_discard_events_support() &&
2739  MCommandGroup->getRequirements().size() == 0);
2740  sycl::detail::pi::PiEvent *Event =
2741  DiscardEvent ? nullptr : &MEvent->getHandleRef();
2742  detail::EventImplPtr EventImpl = DiscardEvent ? nullptr : MEvent;
2743 
2744  switch (MCommandGroup->getType()) {
2745 
2746  case CG::CGTYPE::UpdateHost: {
2747  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2748  "Update host should be handled by the Scheduler. " +
2749  codeToString(PI_ERROR_INVALID_VALUE));
2750  }
2751  case CG::CGTYPE::CopyAccToPtr: {
2752  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2753  Requirement *Req = (Requirement *)Copy->getSrc();
2754  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2755 
2757  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2758  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2759  Req->MElemSize, Copy->getDst(),
2761  Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
2762  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2763 
2764  return PI_SUCCESS;
2765  }
2766  case CG::CGTYPE::CopyPtrToAcc: {
2767  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2768  Requirement *Req = (Requirement *)(Copy->getDst());
2769  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2770 
2772 
2774  AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2776  Req->MAccessRange, Req->MAccessRange,
2777  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2778  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2779  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2780 
2781  return PI_SUCCESS;
2782  }
2783  case CG::CGTYPE::CopyAccToAcc: {
2784  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2785  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2786  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2787 
2788  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2789  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2790 
2792  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2793  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2794  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2795  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2796  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2797  MEvent->getHandleRef(), MEvent);
2798 
2799  return PI_SUCCESS;
2800  }
2801  case CG::CGTYPE::Fill: {
2802  CGFill *Fill = (CGFill *)MCommandGroup.get();
2803  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2804  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2805 
2807  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2808  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2809  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2810  std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2811 
2812  return PI_SUCCESS;
2813  }
2814  case CG::CGTYPE::Kernel: {
2815  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2816 
2817  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2818  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2819 
// Host device and the ESIMD emulator backend bypass the regular kernel
// enqueue: accessor args are patched to point at their allocations, all
// dependency events are waited on eagerly, then the host kernel is run.
2820  if (MQueue->is_host() || (MQueue->getDeviceImplPtr()->getBackend() ==
2821  backend::ext_intel_esimd_emulator)) {
2822  for (ArgDesc &Arg : Args)
2823  if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2824  Requirement *Req = (Requirement *)(Arg.MPtr);
2825  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2826  Req->MData = AllocaCmd->getMemAllocation();
2827  }
2828  if (!RawEvents.empty()) {
2829  // Assuming that the events are for devices to the same Plugin.
2830  const PluginPtr &Plugin = EventImpls[0]->getPlugin();
2831  Plugin->call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2832  }
2833 
2834  if (MQueue->is_host()) {
2835  ExecKernel->MHostKernel->call(NDRDesc,
2836  getEvent()->getHostProfilingInfo());
2837  } else {
2838  assert(MQueue->getDeviceImplPtr()->getBackend() ==
2839  backend::ext_intel_esimd_emulator);
2840  if (MEvent != nullptr)
2841  MEvent->setHostEnqueueTime();
2842  MQueue->getPlugin()->call<PiApiKind::piEnqueueKernelLaunch>(
2843  nullptr,
2844  reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2845  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2846  &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2847  }
2848  return PI_SUCCESS;
2849  }
2850 
2851  auto getMemAllocationFunc = [this](Requirement *Req) {
2852  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2853  // getAllocaForReq may return nullptr if Req is a default constructed
2854  // accessor. Simply return nullptr in such a case.
2855  return AllocaCmd ? AllocaCmd->getMemAllocation() : nullptr;
2856  };
2857 
2858  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2859  ExecKernel->MSyclKernel;
2860  const std::string &KernelName = ExecKernel->MKernelName;
2861 
// Even with discarded events, kernels using the assert extension need a
// real event (the check on elided line 2866 presumably queries the
// program manager — confirm against upstream).
2862  if (!EventImpl) {
2863  // Kernel only uses assert if it's non interop one
2864  bool KernelUsesAssert =
2865  !(SyclKernel && SyclKernel->isInterop()) &&
2867  if (KernelUsesAssert) {
2868  EventImpl = MEvent;
2869  }
2870  }
2871 
2872  return enqueueImpKernel(
2873  MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
2874  KernelName, RawEvents, EventImpl, getMemAllocationFunc,
2875  ExecKernel->MKernelCacheConfig);
2876  }
2877  case CG::CGTYPE::CopyUSM: {
2878  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2879  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
2880  Copy->getDst(), std::move(RawEvents), Event,
2881  MEvent);
2882 
2883  return PI_SUCCESS;
2884  }
2885  case CG::CGTYPE::FillUSM: {
2886  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2887  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
2888  Fill->getFill(), std::move(RawEvents), Event,
2889  MEvent);
2890 
2891  return PI_SUCCESS;
2892  }
2893  case CG::CGTYPE::PrefetchUSM: {
2894  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2895  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
2896  Prefetch->getLength(), std::move(RawEvents),
2897  Event, MEvent);
2898 
2899  return PI_SUCCESS;
2900  }
2901  case CG::CGTYPE::AdviseUSM: {
2902  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2903  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
2904  Advise->getAdvice(), std::move(RawEvents), Event,
2905  MEvent);
2906 
2907  return PI_SUCCESS;
2908  }
2909  case CG::CGTYPE::Copy2DUSM: {
2910  CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
2911  MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
2912  Copy->getDst(), Copy->getDstPitch(),
2913  Copy->getWidth(), Copy->getHeight(),
2914  std::move(RawEvents), Event, MEvent);
2915  return PI_SUCCESS;
2916  }
2917  case CG::CGTYPE::Fill2DUSM: {
2918  CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
2919  MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
2920  Fill->getWidth(), Fill->getHeight(),
2921  Fill->getPattern(), std::move(RawEvents), Event,
2922  MEvent);
2923  return PI_SUCCESS;
2924  }
2925  case CG::CGTYPE::Memset2DUSM: {
2926  CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
2927  MemoryManager::memset_2d_usm(Memset->getDst(), MQueue, Memset->getPitch(),
2928  Memset->getWidth(), Memset->getHeight(),
2929  Memset->getValue(), std::move(RawEvents),
2930  Event, MEvent);
2931  return PI_SUCCESS;
2932  }
2933  case CG::CGTYPE::CodeplayHostTask: {
2934  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
2935 
// Patch accessor args so the host task sees live allocations; the case
// label on elided line 2938 presumably matches kind_accessor — confirm.
2936  for (ArgDesc &Arg : HostTask->MArgs) {
2937  switch (Arg.MType) {
2939  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
2940  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2941 
2942  if (AllocaCmd)
2943  Req->MData = AllocaCmd->getMemAllocation();
2944  break;
2945  }
2946  default:
2947  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2948  "Unsupported arg type " +
2949  codeToString(PI_ERROR_INVALID_VALUE));
2950  }
2951  }
2952 
2953  std::vector<interop_handle::ReqToMem> ReqToMem;
2954 
2955  if (HostTask->MHostTask->isInteropTask()) {
2956  // Extract the Mem Objects for all Requirements, to ensure they are
2957  // available if a user asks for them inside the interop task scope
2958  const std::vector<Requirement *> &HandlerReq =
2959  HostTask->getRequirements();
// Map each requirement to the allocation belonging to the host task's
// own context; assert (and throw in release builds) if none exists.
2960  auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
2961  const std::vector<AllocaCommandBase *> &AllocaCmds =
2962  Req->MSYCLMemObj->MRecord->MAllocaCommands;
2963 
2964  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
2965  if (HostTask->MQueue->getContextImplPtr() ==
2966  AllocaCmd->getQueue()->getContextImplPtr()) {
2967  auto MemArg =
2968  reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2969  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
2970 
2971  return;
2972  }
2973 
2974  assert(false &&
2975  "Can't get memory object due to no allocation available");
2976 
2977  throw sycl::exception(
2978  sycl::make_error_code(sycl::errc::runtime),
2979  "Can't get memory object due to no allocation available " +
2980  codeToString(PI_ERROR_INVALID_MEM_OBJECT));
2981  };
2982  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
2983  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
2984  }
2985 
2986  // Host task is executed asynchronously so we should record where it was
2987  // submitted to report exception origin properly.
2989 
2990  MQueue->getThreadPool().submit<DispatchHostTask>(
2991  DispatchHostTask(this, std::move(ReqToMem)))
2992 
2994 
2995  return PI_SUCCESS;
2996  }
2997  case CG::CGTYPE::Barrier: {
2998  if (MQueue->getDeviceImplPtr()->is_host()) {
2999  // NOP for host device.
3000  return PI_SUCCESS;
3001  }
3002  const PluginPtr &Plugin = MQueue->getPlugin();
3003  if (MEvent != nullptr)
3004  MEvent->setHostEnqueueTime();
3006  MQueue->getHandleRef(), 0, nullptr, Event);
3007 
3008  return PI_SUCCESS;
3009  }
3010  case CG::CGTYPE::BarrierWaitlist: {
3011  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
3012  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3013  std::vector<sycl::detail::pi::PiEvent> PiEvents =
3014  getPiEventsBlocking(Events);
3015  if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
3016  // NOP for host device.
3017  // If Events is empty, then the barrier has no effect.
3018  return PI_SUCCESS;
3019  }
3020  const PluginPtr &Plugin = MQueue->getPlugin();
3021  if (MEvent != nullptr)
3022  MEvent->setHostEnqueueTime();
3024  MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
3025 
3026  return PI_SUCCESS;
3027  }
3028  case CG::CGTYPE::CopyToDeviceGlobal: {
3029  CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3031  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3032  Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3033  std::move(RawEvents), Event, MEvent);
3034 
// NOTE(review): this case (and CopyFromDeviceGlobal below) returns
// CL_SUCCESS while siblings return PI_SUCCESS — both are 0, but consider
// normalizing for consistency.
3035  return CL_SUCCESS;
3036  }
3037  case CG::CGTYPE::CopyFromDeviceGlobal: {
3038  CGCopyFromDeviceGlobal *Copy =
3039  (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3041  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3042  Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3043  std::move(RawEvents), Event, MEvent);
3044 
3045  return CL_SUCCESS;
3046  }
3047  case CG::CGTYPE::ReadWriteHostPipe: {
3048  CGReadWriteHostPipe *ExecReadWriteHostPipe =
3049  (CGReadWriteHostPipe *)MCommandGroup.get();
3050  std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3051  void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3052  size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3053  bool blocking = ExecReadWriteHostPipe->isBlocking();
3054  bool read = ExecReadWriteHostPipe->isReadHostPipe();
3055 
3056  if (!EventImpl) {
3057  EventImpl = MEvent;
3058  }
3059  return enqueueReadWriteHostPipe(MQueue, pipeName, blocking, hostPtr,
3060  typeSize, RawEvents, EventImpl, read);
3061  }
3062  case CG::CGTYPE::ExecCommandBuffer: {
3063  CGExecCommandBuffer *CmdBufferCG =
3064  static_cast<CGExecCommandBuffer *>(MCommandGroup.get());
3065  if (MEvent != nullptr)
3066  MEvent->setHostEnqueueTime();
3067  return MQueue->getPlugin()
3069  CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(),
3070  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3071  Event);
3072  }
3073  case CG::CGTYPE::CopyImage: {
3074  CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3075 
3076  sycl::detail::pi::PiMemImageDesc Desc = Copy->getDesc();
3077 
3079  Copy->getSrc(), MQueue, Copy->getDst(), Desc, Copy->getFormat(),
3080  Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3081  Copy->getHostExtent(), Copy->getCopyExtent(), std::move(RawEvents),
3082  Event);
3083  return PI_SUCCESS;
3084  }
3085  case CG::CGTYPE::SemaphoreWait: {
3086  CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3087  if (MQueue->getDeviceImplPtr()->is_host()) {
3088  // NOP for host device.
3089  return PI_SUCCESS;
3090  }
3091 
3092  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3094  MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
3095  nullptr, nullptr);
3096 
3097  return PI_SUCCESS;
3098  }
3099  case CG::CGTYPE::SemaphoreSignal: {
3100  CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3101  if (MQueue->getDeviceImplPtr()->is_host()) {
3102  // NOP for host device.
3103  return PI_SUCCESS;
3104  }
3105 
3106  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3108  MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
3109  nullptr, nullptr);
3110 
3111  return PI_SUCCESS;
3112  }
3113  case CG::CGTYPE::None:
3114  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3115  "CG type not implemented. " +
3116  codeToString(PI_ERROR_INVALID_OPERATION));
3117  }
3118  return PI_ERROR_INVALID_OPERATION;
3119 }
3120 
// ExecCGCommand::producesPiEvent (signature elided in this view, original
// line 3121): a PI event is produced only when the command is enqueued
// directly to a queue (not recorded into a command buffer) and the CG is
// not a host task.
3122  return !MCommandBuffer &&
3123  MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
3124 }
3125 
// ExecCGCommand::supportsPostEnqueueCleanup (signature and the lead-in of
// the return expression elided in this view, original lines 3126/3128):
// host tasks are excluded from post-enqueue cleanup.
3127  // Host tasks are cleaned up upon completion instead.
3129  (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
3130 }
3131 
// ExecCGCommand::readyForCleanup (signature elided in this view, original
// line 3132): a host task may be cleaned up only once it has no leaves
// left and its event has completed; all other CG types defer to the base
// Command::readyForCleanup().
3133  if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
3134  return MLeafCounter == 0 && MEvent->isCompleted();
3135  return Command::readyForCleanup();
3136 }
3137 
// KernelFusionCommand constructor (signature elided in this view, original
// line 3138): constructs a FUSION-type command bound to the given queue,
// starting in the ACTIVE fusion state.
// NOTE(review): original line 3141 is elided here — given the
// emitInstrumentationData definition below, it presumably emits the
// instrumentation node; confirm against upstream.
3139  : Command(Command::CommandType::FUSION, Queue),
3140  MStatus(FusionStatus::ACTIVE) {
3142 }
3143 
/// Returns a mutable reference to the auxiliary commands attached to this
/// fusion command, allowing the scheduler to add or inspect them.
3144 std::vector<Command *> &KernelFusionCommand::auxiliaryCommands() {
3145  return MAuxiliaryCommands;
3146 }
3147 
// KernelFusionCommand::addToFusionList (signature elided in this view,
// original line 3148): appends a kernel-execution command to the list of
// commands queued for fusion.
3149  MFusionList.push_back(Kernel);
3150 }
3151 
/// Returns a mutable reference to the list of kernel-execution commands
/// currently queued for fusion, in the order they were added.
3152 std::vector<ExecCGCommand *> &KernelFusionCommand::getFusionList() {
3153  return MFusionList;
3154 }
3155 
3156 bool KernelFusionCommand::producesPiEvent() const { return false; }
3157 
// Enqueue implementation for the fusion placeholder command: waits on the
// prepared dependency events, then releases the queue references.
// NOTE(review): original line 3159 is elided in this view; confirm its
// contents against upstream before editing.
3158 pi_int32 KernelFusionCommand::enqueueImp() {
3160  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
3161 
3162  // We need to release the queue here because KernelFusionCommands are
3163  // held back by the scheduler thus prevent the deallocation of the queue.
3164  resetQueue();
3165  return PI_SUCCESS;
3166 }
3167 
// KernelFusionCommand::setFusionStatus (signature elided in this view,
// original line 3168): records the new fusion status for this command.
3169  MStatus = Status;
3170 }
3171 
// KernelFusionCommand::resetQueue (signature elided in this view, original
// line 3172): drops the references to the attached queues so they can be
// deallocated. Only legal once fusion is no longer ACTIVE (asserted below).
3173  assert(MStatus != FusionStatus::ACTIVE &&
3174  "Cannot release the queue attached to the KernelFusionCommand if it "
3175  "is active.");
3176  MQueue.reset();
3177  MWorkerQueue.reset();
3178 }
3179 
// KernelFusionCommand::emitInstrumentationData (signature elided in this
// view, original line 3180): emits an XPTI node_create notification for
// this fusion command. The entire body is compiled out unless XPTI
// instrumentation is enabled, and returns early if tracing is not active
// for this stream.
3181 #ifdef XPTI_ENABLE_INSTRUMENTATION
3182  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3183  if (!xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) {
3184  return;
3185  }
3186  // Create a payload with the command name and an event using this payload to
3187  // emit a node_create
3188  MCommandNodeType = commandToNodeType(MType);
3189  MCommandName = commandToName(MType);
3190 
// Monotonic counter gives each fusion node a unique payload name.
3191  static unsigned FusionNodeCount = 0;
3192  std::stringstream PayloadStr;
3193  PayloadStr << "Fusion command #" << FusionNodeCount++;
3194  xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3195 
3196  uint64_t CommandInstanceNo = 0;
3197  xpti_td *CmdTraceEvent =
3198  xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3199  xpti_at::active, &CommandInstanceNo);
3200 
3201  MInstanceID = CommandInstanceNo;
3202  if (CmdTraceEvent) {
3203  MTraceEvent = static_cast<void *>(CmdTraceEvent);
3204  // If we are seeing this event again, then the instance ID
3205  // will be greater
3206  // than 1; in this case, we must skip sending a
3207  // notification to create a node as this node has already
3208  // been created. We return this value so the epilog method
3209  // can be called selectively.
3210  // See makeTraceEventProlog.
3211  MFirstInstance = (CommandInstanceNo == 1);
3212  }
3213 
3214  // This function is called in the constructor of the command. At this point
3215  // the kernel fusion list is still empty, so we don't have a terrible lot of
3216  // information we could attach to this node here.
3217  if (MFirstInstance && CmdTraceEvent) {
3218  xpti::addMetadata(CmdTraceEvent, "sycl_device",
3219  deviceToID(MQueue->get_device()));
3220  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
3221  deviceToString(MQueue->get_device()));
3222  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
3223  getSyclObjImpl(MQueue->get_device())->getDeviceName());
3224  }
3225 
3226  if (MFirstInstance) {
3227  xptiNotifySubscribers(MStreamID, NotificationTraceType,
3228  detail::GSYCLGraphEvent,
3229  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
3230  static_cast<const void *>(MCommandNodeType.c_str()));
3231  }
3232 
3233 #endif
3234 }
3235 
3236 void KernelFusionCommand::printDot(std::ostream &Stream) const {
3237  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3238 
3239  Stream << "ID = " << this << "\\n";
3240  Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n"
3241  << "FUSION LIST: {";
3242  bool Initial = true;
3243  for (auto *Cmd : MFusionList) {
3244  if (!Initial) {
3245  Stream << ",\\n";
3246  }
3247  Initial = false;
3248  auto *KernelCG = static_cast<detail::CGExecKernel *>(&Cmd->getCG());
3249  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3250  Stream << "created from source";
3251  } else {
3252  Stream << demangleKernelName(KernelCG->getKernelName());
3253  }
3254  }
3255  Stream << "}\\n";
3256 
3257  Stream << "\"];" << std::endl;
3258 
3259  for (const auto &Dep : MDeps) {
3260  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3261  << " [ label = \"Access mode: "
3262  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3263  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3264  << std::endl;
3265  }
3266 }
3267 
3268 } // namespace detail
3269 } // namespace _V1
3270 } // namespace sycl
sycl::_V1::detail::ProgramManager::getHostPipeEntry
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
Definition: program_manager.cpp:1714
sycl::_V1::detail::Command::isHostTask
bool isHostTask() const
Definition: commands.cpp:286
sycl::_V1::detail::Command::copySubmissionCodeLocation
void copySubmissionCodeLocation()
Definition: commands.cpp:947
sycl::_V1::detail::NDRDescT::NumWorkGroups
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
Definition: cg_types.hpp:134
sycl::_V1::detail::tls_code_loc_t::query
const detail::code_location & query()
Query the information in the TLS slot.
Definition: common.cpp:55
sycl::_V1::detail::ExecCGCommand::ExecCGCommand
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
Definition: commands.cpp:1860
sycl::_V1::detail::SetKernelParamsAndLaunch
static pi_result SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, sycl::detail::pi::PiKernel Kernel, NDRDescT &NDRDesc, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2299
sycl::_V1::detail::Scheduler::MGraphBuilder
GraphBuilder MGraphBuilder
Definition: scheduler.hpp:902
_pi_mem
Definition: pi_cuda.hpp:56
sycl::_V1::detail::Command::MEvent
EventImplPtr MEvent
Definition: commands.hpp:246
sycl::_V1::detail::Command::MQueue
QueueImplPtr MQueue
Definition: commands.hpp:245
sycl::_V1::detail::Command
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:99
sycl::_V1::operator=
multi_ptr & operator=(const multi_ptr &)=default
sycl::_V1::detail::Command::readyForCleanup
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:733
piEnqueueKernelLaunch
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:531
sycl::_V1::detail::kernel_param_kind_t::kind_stream
@ kind_stream
sycl::_V1::access::mode::discard_read_write
@ discard_read_write
event_impl.hpp
sycl::_V1::detail::EmptyCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1708
sycl::_V1::detail::AllocaSubBufCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1143
sycl::_V1::detail::Command::emitInstrumentationData
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
sycl::_V1::detail::MapMemObject::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1327
sycl::_V1::detail::AccessorImplHost::MOffset
id< 3 > & MOffset
Definition: accessor_impl.hpp:125
sycl::_V1::access::mode
mode
Definition: access.hpp:34
sycl::_V1::detail::CG::Fill2DUSM
@ Fill2DUSM
Definition: cg.hpp:68
PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
Definition: pi.h:748
sycl::_V1::detail::SYCL_STREAM_NAME
constexpr const char * SYCL_STREAM_NAME
Definition: xpti_registry.hpp:29
sycl::_V1::detail::AllocaCommandBase::MLinkedAllocaCmd
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:475
sycl::_V1::detail::KernelFusionCommand::setFusionStatus
void setFusionStatus(FusionStatus Status)
Set the status of this fusion command to Status.
Definition: commands.cpp:3168
piKernelSetArg
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_cuda.cpp:341
_pi_mem_obj_property::mem_access
pi_mem_access_flag mem_access
Definition: pi.h:1831
sycl::_V1::detail::Command::MShouldCompleteEventIfPossible
bool MShouldCompleteEventIfPossible
Definition: commands.hpp:375
sycl::_V1::detail::MemoryManager::prefetch_usm
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1010
sycl::_V1::detail::CGExecKernel::MSyclKernel
std::shared_ptr< detail::kernel_impl > MSyclKernel
Definition: cg.hpp:168
sycl::_V1::detail::MemoryManager::ext_oneapi_copyH2D_cmd_buffer
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
Definition: memory_manager.cpp:1549
context_impl.hpp
sycl::_V1::detail::SYCLMemObjI::getType
virtual MemObjType getType() const =0
sycl::_V1::detail::CG::CopyToDeviceGlobal
@ CopyToDeviceGlobal
Definition: cg.hpp:70
sycl::_V1::detail::MemCpyCommandHost::MemCpyCommandHost
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1600
sycl::_V1::detail::NDRDescT::GlobalSize
sycl::range< 3 > GlobalSize
Definition: cg_types.hpp:128
sycl::_V1::detail::kernel_param_kind_t::kind_std_layout
@ kind_std_layout
sycl::_V1::detail::MemoryManager::unmap
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
Definition: memory_manager.cpp:916
sycl::_V1::detail::ContextImplPtr
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:33
sycl::_V1::detail::Command::getPiEventsBlocking
std::vector< sycl::detail::pi::PiEvent > getPiEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:249
sycl::_V1::detail::ExecCGCommand::readyForCleanup
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:3132
sycl::_V1::make_error_code
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:94
sycl::_V1::detail::Command::resolveReleaseDependencies
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
Definition: commands.cpp:894
sycl::_V1::detail::MemCpyCommand::MemCpyCommand
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1425
sycl::_V1::detail::UpdateHostRequirementCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1783
sycl::_V1::detail::ProgramManager::build
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
Definition: program_manager.cpp:2190
sycl::_V1::backend::ext_oneapi_level_zero
@ ext_oneapi_level_zero
sycl::_V1::detail::ReleaseCommand::readyForCleanup
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1279
sycl::_V1::detail::MemoryManager::fill
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:810
sycl::_V1::detail::AccessorImplHost::MIsSubBuffer
bool MIsSubBuffer
Definition: accessor_impl.hpp:137
sycl::_V1::detail::AllocaCommand::AllocaCommand
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
Definition: commands.cpp:1003
sycl::_V1::detail::Command::MSubmissionFunctionName
std::string MSubmissionFunctionName
Definition: commands.hpp:367
sycl::_V1::detail::ProgramManager::getOrCreateKernel
std::tuple< sycl::detail::pi::PiKernel, std::mutex *, const KernelArgMask *, sycl::detail::pi::PiProgram > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const program_impl *Prg)
Definition: program_manager.cpp:689
sycl::_V1::detail::UpdateHostRequirementCommand::UpdateHostRequirementCommand
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
Definition: commands.cpp:1774
cg_types.hpp
sycl::_V1::detail::AccessorImplHost
Definition: accessor_impl.hpp:42
sycl::_V1::detail::AllocaSubBufCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1097
sycl::_V1::detail::HostPipeMapEntry::getDevBinImage
RTDeviceBinaryImage * getDevBinImage()
Definition: host_pipe_map_entry.hpp:54
sycl::_V1::detail::Command::MTraceEventPrologComplete
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
Definition: commands.hpp:355
sycl::_V1::detail::MemoryManager::copy_from_device_global
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1391
sycl::_V1::detail::demangleKernelName
static std::string demangleKernelName(std::string Name)
Definition: commands.cpp:84
xpti_registry.hpp
sycl::_V1::detail::AccessModeToPi
pi_mem_obj_access AccessModeToPi(access::mode AccessorMode)
Definition: commands.cpp:2200
ur_event_handle_t_::get
native_type get() const noexcept
Definition: event.hpp:27
sycl::_V1::detail::pi::PiKernelCacheConfig
::pi_kernel_cache_config PiKernelCacheConfig
Definition: pi.hpp:155
sycl::_V1::detail::AllocaCommandBase::readyForCleanup
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1001
sycl::_V1::detail::Command::emitEnqueuedEventSignal
void emitEnqueuedEventSignal(sycl::detail::pi::PiEvent &PiEventAddr)
Creates a signal event with the enqueued kernel event handle.
Definition: commands.cpp:778
sycl::_V1::detail::AccessorImplHost::MAccessMode
access::mode MAccessMode
Definition: accessor_impl.hpp:130
PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
Definition: pi.h:750
sycl::_V1::detail::Command::MInstanceID
uint64_t MInstanceID
Instance ID tracked for the command.
Definition: commands.hpp:359
sycl::_V1::detail::CG::PrefetchUSM
@ PrefetchUSM
Definition: cg.hpp:64
sycl::_V1::detail::DeviceImplPtr
std::shared_ptr< device_impl > DeviceImplPtr
Definition: program_manager.hpp:65
sycl::_V1::detail::Command::emitEdgeEventForCommandDependence
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
Definition: commands.cpp:531
sycl::_V1::detail::AccessorImplHost::MElemSize
unsigned int MElemSize
Definition: accessor_impl.hpp:135
sycl::_V1::detail::AllocaCommandBase::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:976
_pi_result
_pi_result
Definition: pi.h:205
sycl::_V1::detail::EnqueueResultT
Result of command enqueueing.
Definition: commands.hpp:54
sycl::_V1::detail::GlobalHandler::instance
static GlobalHandler & instance()
Definition: global_handler.cpp:125
sycl::_V1::detail::DispatchHostTask
Definition: commands.cpp:299
sycl::_V1::detail::Command::supportsPostEnqueueCleanup
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:731
sycl::_V1::detail::MemoryManager::ext_oneapi_copyD2D_cmd_buffer
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
Definition: memory_manager.cpp:1425
sycl::_V1::detail::Command::BlockReason::HostTask
@ HostTask
sycl::_V1::detail::MemoryManager::allocateMemSubBuffer
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
Definition: memory_manager.cpp:420
sycl::_V1::detail::EmptyCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1737
sycl::_V1::detail::kernel_param_kind_t::kind_accessor
@ kind_accessor
sycl::_V1::detail::MemoryManager::map
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
Definition: memory_manager.cpp:870
sycl::_V1::detail::for_each
Function for_each(Group g, Ptr first, Ptr last, Function f)
Definition: group_algorithm.hpp:200
sycl::_V1::detail::tie
auto tie(Ts &...Args)
Definition: tuple.hpp:39
sycl::_V1::detail::MemoryManager::advise_usm
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, pi_mem_advice Advice, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1034
sycl::_V1::detail::Command::getQueue
const QueueImplPtr & getQueue() const
Definition: commands.hpp:169
sycl::_V1::detail::SYCLConfig
Definition: config.hpp:111
sycl::_V1::detail::Command::emitInstrumentation
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
Definition: commands.cpp:792
sycl::_V1::detail::accessModeToString
static std::string accessModeToString(access::mode Mode)
Definition: commands.cpp:143
sycl::_V1::detail::MapMemObject::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1290
sycl::_V1::detail::Command::makeTraceEventEpilog
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:672
sycl::_V1::detail::ExecCGCommand::getAuxiliaryResources
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
Definition: commands.cpp:1552
sycl::_V1::detail::Command::MIsBlockable
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
Definition: commands.hpp:313
detail
---— Error handling, matching OpenCL plugin semantics.
Definition: common.hpp:44
sycl::_V1::detail::Command::processDepEvent
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
Definition: commands.cpp:685
PI_ACCESS_READ_ONLY
@ PI_ACCESS_READ_ONLY
Definition: pi.h:1817
sycl::_V1::detail::EmptyCommand::addRequirement
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
Definition: commands.cpp:1692
sycl
Definition: access.hpp:18
sycl::_V1::detail::DispatchHostTask::DispatchHostTask
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem)
Definition: commands.cpp:344
sycl::_V1::detail::MemoryManager::memset_2d_usm
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1187
piextWaitExternalSemaphore
pi_result piextWaitExternalSemaphore(pi_queue command_queue, pi_interop_semaphore_handle sem_handle, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to instruct the queue with a non-blocking wait on an external semaphore.
Definition: pi_cuda.cpp:486
sycl::_V1::detail::HostKernel
Definition: cg_types.hpp:249
sycl::_V1::detail::EnqueueResultT::SyclEnqueueBlocked
@ SyclEnqueueBlocked
Definition: commands.hpp:58
sycl::_V1::detail::DispatchNativeKernel
void DispatchNativeKernel(void *Blob)
Definition: commands.cpp:2353
sycl::_V1::detail::MemoryManager::allocate
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
Definition: memory_manager.cpp:265
sycl::_V1::detail::KernelBundleImplPtr
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
Definition: kernel_bundle.hpp:145
access.hpp
sycl::_V1::detail::CG::Kernel
@ Kernel
Definition: cg.hpp:54
queue_impl.hpp
sycl::_V1::detail::Scheduler::GraphBuilder
Graph builder class.
Definition: scheduler.hpp:556
sycl::_V1::detail::ExecCGCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:2074
sycl::_V1::detail::MemoryManager::ext_oneapi_copy_usm_cmd_buffer
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
Definition: memory_manager.cpp:1607
sycl::_V1::detail::Command::getPiEvents
std::vector< sycl::detail::pi::PiEvent > getPiEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:222
sycl::_V1::detail::Command::addUser
void addUser(Command *NewUser)
Definition: commands.hpp:132
sycl::_V1::detail::Command::MSyncPointDeps
std::vector< sycl::detail::pi::PiExtSyncPoint > MSyncPointDeps
List of sync points for submissions to a command buffer.
Definition: commands.hpp:400
sycl::_V1::detail::ExecCGCommand::getCG
detail::CG & getCG() const
Definition: commands.hpp:649
sycl::_V1::detail::Command::MLeafCounter
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:315
sycl::_V1::detail::device_impl::getPlugin
const PluginPtr & getPlugin() const
Definition: device_impl.hpp:127
sycl::_V1::detail::Requirement
AccessorImplHost Requirement
Definition: accessor_impl.hpp:210
sycl::_V1::detail::ArgDesc::MType
sycl::detail::kernel_param_kind_t MType
Definition: cg_types.hpp:45
scheduler.hpp
sycl::_V1::detail::AccessorImplHost::MMemoryRange
range< 3 > & MMemoryRange
Definition: accessor_impl.hpp:129
sycl::_V1::detail::CGExecKernel::MNDRDesc
NDRDescT MNDRDesc
Stores ND-range description.
Definition: cg.hpp:166
sycl::_V1::detail::Command::getBlockReason
const char * getBlockReason() const
Definition: commands.cpp:936
_pi_kernel
Definition: pi_cuda.hpp:72
sycl::_V1::exception::get_cl_code
cl_int get_cl_code() const
Definition: exception.cpp:87
sycl::_V1::detail::MemCpyCommandHost::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1618
sycl::_V1::detail::CG::CopyFromDeviceGlobal
@ CopyFromDeviceGlobal
Definition: cg.hpp:71
sycl::_V1::detail::EmptyCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1755
piextEnqueueWriteHostPipe
pi_result piextEnqueueWriteHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Write to pipe of a given name.
Definition: pi_cuda.cpp:1001
sycl::_V1::detail::CG::CopyAccToPtr
@ CopyAccToPtr
Definition: cg.hpp:55
sycl::_V1::kernel
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:74
_pi_ext_command_buffer
Definition: pi_cuda.hpp:80
sycl::_V1::detail::Command::emitInstrumentationDataProxy
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
Definition: commands.cpp:515
sycl::_V1::detail::flushCrossQueueDeps
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
Definition: commands.cpp:292
sycl::_V1::detail::EnqueueResultT::SyclEnqueueReady
@ SyclEnqueueReady
Definition: commands.hpp:56
piextSignalExternalSemaphore
pi_result piextSignalExternalSemaphore(pi_queue command_queue, pi_interop_semaphore_handle sem_handle, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to instruct the queue to signal the external semaphore handle once all previous commands have com...
Definition: pi_cuda.cpp:494
sycl::_V1::detail::Command::MDeps
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:309
sycl::_V1::detail::ProgramManager::getInstance
static ProgramManager & getInstance()
Definition: program_manager.cpp:67
sycl::_V1::range< 3 >
sycl::_V1::detail::Command::MFirstInstance
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
Definition: commands.hpp:357
sycl::_V1::detail::AllocaCommandBase::getRequirement
const Requirement * getRequirement() const final
Definition: commands.hpp:458
sycl::_V1::detail::AllocaSubBufCommand::getMemAllocation
void * getMemAllocation() const final
Definition: commands.cpp:1116
sycl::_V1::detail::ProgramManager::getSYCLKernelID
kernel_id getSYCLKernelID(const std::string &KernelName)
Definition: program_manager.cpp:1595
sycl::_V1::detail::CGHostTask::MQueue
std::shared_ptr< detail::queue_impl > MQueue
Definition: cg.hpp:334
sycl::_V1::detail::BlockingT
BlockingT
Definition: commands.hpp:51
piEventsWait
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
Definition: pi_cuda.cpp:575
_pi_queue
Definition: pi_cuda.hpp:60
sycl::_V1::detail::cgTypeToString
static std::string cgTypeToString(detail::CG::CGTYPE Type)
Definition: commands.cpp:1807
sycl::_V1::detail::Command::emitEdgeEventForEventDependence
void emitEdgeEventForEventDependence(Command *Cmd, sycl::detail::pi::PiEvent &EventAddr)
Creates an edge event when the dependency is an event.
Definition: commands.cpp:582
PI_ACCESS_READ_WRITE
@ PI_ACCESS_READ_WRITE
Definition: pi.h:1815
sycl::_V1::detail::DepDesc::MDepRequirement
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:86
sycl::_V1::detail::AllocaSubBufCommand::AllocaSubBufCommand
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:1079
sycl::_V1::detail::DepDesc::MDepCommand
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:84
program_impl.hpp
sycl::_V1::detail::MemoryManager::copy_to_device_global
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1358
sycl::_V1::detail::CGExecKernel::MKernelName
std::string MKernelName
Definition: cg.hpp:171
sycl::_V1::detail::ReleaseCommand::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1277
sycl::_V1::detail::AccessorImplHost::MSYCLMemObj
detail::SYCLMemObjI * MSYCLMemObj
Definition: accessor_impl.hpp:132
sycl::_V1::detail::event_impl
Definition: event_impl.hpp:39
sycl::_V1::detail::UnMapMemObject::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1408
sycl::_V1::detail::UnMapMemObject::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1351
sycl::_V1::detail::Command::getType
CommandType getType() const
Definition: commands.hpp:135
sycl::_V1::detail::GlobalHandler::TraceEventXPTI
void TraceEventXPTI(const char *Message)
Definition: global_handler.cpp:87
PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG
provides the preferred cache configuration (large slm or large data)
Definition: pi.h:1536
sycl::_V1::device::is_gpu
bool is_gpu() const
Check if device is a GPU device.
Definition: device.cpp:87
host_pipe_map_entry.hpp
piKernelSetExecInfo
pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, size_t param_value_size, const void *param_value)
API to set attributes controlling kernel execution.
Definition: pi_cuda.cpp:1021
sycl::_V1::detail::Command::waitForEvents
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, sycl::detail::pi::PiEvent &Event)
Definition: commands.cpp:433
piKernelGetGroupInfo
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:502
sycl::_V1::detail::Command::MSubmissionCodeLocation
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
Definition: commands.hpp:363
sycl::_V1::detail::tls_code_loc_t
Data type that manages the code_location information in TLS.
Definition: common.hpp:129
sycl::_V1::detail::deviceToString
static std::string deviceToString(device Device)
Definition: commands.cpp:87
sycl::_V1::detail::Command::MBlockReason
BlockReason MBlockReason
Definition: commands.hpp:329
sycl::_V1::detail::MemCpyCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1478
sycl::_V1::detail::XPTIRegistry::bufferAssociateNotification
static void bufferAssociateNotification(const void *, const void *)
Definition: xpti_registry.cpp:71
sycl::_V1::detail::code_location::functionName
constexpr const char * functionName() const noexcept
Definition: common.hpp:90
sycl::_V1::device::is_accelerator
bool is_accelerator() const
Check if device is an accelerator device.
Definition: device.cpp:89
sycl::_V1::detail::KernelArgMask
std::vector< bool > KernelArgMask
Definition: kernel_arg_mask.hpp:16
PI_KERNEL_ARG_MEM_OBJ_ACCESS
@ PI_KERNEL_ARG_MEM_OBJ_ACCESS
Definition: pi.h:1823
sycl::_V1::detail::CG::Copy2DUSM
@ Copy2DUSM
Definition: cg.hpp:67
sycl::_V1::detail::Command::MCommandNodeType
std::string MCommandNodeType
Buffer to build the command node type.
Definition: commands.hpp:351
commands.hpp
sycl::_V1::detail::AllocaCommandBase::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:997
sycl::_V1::detail::DepDesc
Dependency between two commands.
Definition: commands.hpp:73
sycl::_V1::detail::UnMapMemObject::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1373
sycl::_V1::detail::Command::MEnqueueMtx
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
Definition: commands.hpp:280
sycl::_V1::detail::Command::getEvent
const EventImplPtr & getEvent() const
Definition: commands.hpp:171
sycl::_V1::detail::HostTask
Definition: cg_types.hpp:232
sycl::_V1::detail::CG::CopyPtrToAcc
@ CopyPtrToAcc
Definition: cg.hpp:56
piEnqueueEventsWaitWithBarrier
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:642
sycl::_V1::detail::Command::MSubmissionFileName
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
Definition: commands.hpp:366
sycl::_V1::kernel_id
Objects of the class identify kernel is some kernel_bundle related APIs.
Definition: kernel_bundle.hpp:51
sycl::_V1::exception
Definition: exception.hpp:68
sycl::_V1::detail::pi::PiExtSyncPoint
::pi_ext_sync_point PiExtSyncPoint
Definition: pi.hpp:156
sycl::_V1::detail::pi::PiProgram
::pi_program PiProgram
Definition: pi.hpp:137
sycl::_V1::detail::ReleaseCommand::ReleaseCommand
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
Definition: commands.cpp:1165
sycl::_V1::detail::KernelFusionCommand::getFusionList
std::vector< ExecCGCommand * > & getFusionList()
Definition: commands.cpp:3152
sycl::_V1::detail::kernel_param_kind_t::kind_invalid
@ kind_invalid
sycl::_V1::detail::ProgramManager::getDeviceImageFromBinaryImage
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
Definition: program_manager.cpp:1728
error_handling.hpp
sycl::_V1::detail::code_location::lineNumber
constexpr unsigned long lineNumber() const noexcept
Definition: common.hpp:87
sycl::_V1::detail::ReverseRangeDimensionsForKernel
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
Definition: commands.cpp:2192
sycl::_V1::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: uniform.hpp:30
sycl::_V1::detail::codeToString
std::string codeToString(pi_int32 code)
Definition: common.hpp:153
sycl::_V1::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:59
sycl::_V1::detail::pi::PiMem
::pi_mem PiMem
Definition: pi.hpp:141
sycl::_V1::detail::KernelFusionCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:3156
sycl::_V1::detail::adjustNDRangePerKernel
static void adjustNDRangePerKernel(NDRDescT &NDR, sycl::detail::pi::PiKernel Kernel, const device_impl &DeviceImpl)
Definition: commands.cpp:2164
sycl::_V1::detail::MemoryManager::fill_usm
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, int Pattern, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:972
sycl::_V1::detail::CG::Fill
@ Fill
Definition: cg.hpp:60
sycl::_V1::detail::device_impl
Definition: device_impl.hpp:35
_pi_program
Definition: pi_cuda.hpp:68
sycl::_V1::context::get_backend
backend get_backend() const noexcept
Returns the backend associated with this context.
Definition: context.cpp:133
_pi_sampler
Definition: pi_cuda.hpp:76
sycl::_V1::detail::UpdateHostRequirementCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1577
sycl::_V1::detail::EventImplPtr
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:39
sycl::_V1::detail::ProgramManager::kernelUsesAssert
bool kernelUsesAssert(const std::string &KernelName) const
Definition: program_manager.cpp:1277
sycl::_V1::detail::Scheduler::enqueueCommandForCG
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
Definition: scheduler.cpp:165
sycl::_V1::detail::AllocaCommandBase::MMemAllocation
void * MMemAllocation
Definition: commands.hpp:468
sycl::_V1::detail::MemCpyCommandHost::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1757
sycl::_V1::detail::KernelFusionCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:3236
program_manager.hpp
sycl::_V1::detail::kernel_param_kind_t::kind_pointer
@ kind_pointer
sycl::_V1::errc::invalid
@ invalid
sycl::_V1::detail::KernelFusionCommand::FusionStatus::ACTIVE
@ ACTIVE
sycl::_V1::detail::CG::getType
CGTYPE getType()
Definition: cg.hpp:124
sycl::_V1::detail::ArgDesc::MSize
int MSize
Definition: cg_types.hpp:47
sycl::_V1::detail::pi::PiKernel
::pi_kernel PiKernel
Definition: pi.hpp:138
sycl::_V1::detail::CGExecKernel
"Execute kernel" command group class.
Definition: cg.hpp:163
piextKernelSetArgSampler
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
Definition: pi_cuda.cpp:354
sycl::_V1::detail::EnqueueResultT::SyclEnqueueSuccess
@ SyclEnqueueSuccess
Definition: commands.hpp:57
sycl::_V1::detail::Command::waitForPreparedHostEvents
void waitForPreparedHostEvents() const
Definition: commands.cpp:428
sycl::_V1::detail::MemoryManager::fill_2d_usm
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< char > &Pattern, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1146
sycl::_V1::detail::KernelFusionCommand::auxiliaryCommands
std::vector< Command * > & auxiliaryCommands()
Definition: commands.cpp:3144
sycl::_V1::detail::MemCpyCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1444
sycl::_V1::detail::AllocaCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1059
sycl::_V1::detail::CG::Memset2DUSM
@ Memset2DUSM
Definition: cg.hpp:69
sampler.hpp
sycl::_V1::detail::ExecCGCommand
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:635
sycl::_V1::detail::code_location::columnNumber
constexpr unsigned long columnNumber() const noexcept
Definition: common.hpp:88
sycl::_V1::detail::kernel_param_kind_t::kind_specialization_constants_buffer
@ kind_specialization_constants_buffer
sycl::_V1::detail::NDRDescT
Definition: cg_types.hpp:53
sycl::_V1::detail::CGHostTask
Definition: cg.hpp:330
sycl::_V1::detail::CG::CGTYPE
CGTYPE
Type of the command group.
Definition: cg.hpp:52
piextKernelSetArgMemObj
pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem_obj_property *arg_properties, const pi_mem *arg_value)
Definition: pi_cuda.cpp:347
sycl::_V1::detail::AllocaCommandBase::MIsLeaderAlloca
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
Definition: commands.hpp:481
sycl::_V1::detail::Scheduler::NotifyHostTaskCompletion
void NotifyHostTaskCompletion(Command *Cmd)
Definition: scheduler.cpp:463
sycl::_V1::detail::Command::CommandType
CommandType
Definition: commands.hpp:101
sycl::_V1::detail::ReleaseCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1170
sycl::_V1::detail::EnqueueResultT::SyclEnqueueFailed
@ SyclEnqueueFailed
Definition: commands.hpp:59
sycl::_V1::detail::Command::BlockReason::HostAccessor
@ HostAccessor
kernel_info.hpp
sycl::_V1::detail::device_impl::getHandleRef
sycl::detail::pi::PiDevice & getHandleRef()
Get reference to PI device.
Definition: device_impl.hpp:66
sycl::_V1::detail::Command::MAddress
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
Definition: commands.hpp:347
sycl::_V1::backend::opencl
@ opencl
sycl::_V1::detail::AllocaCommandBase::MIsActive
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:477
sycl::_V1::detail::NDRDescT::LocalSize
sycl::range< 3 > LocalSize
Definition: cg_types.hpp:129
sycl::_V1::detail::device_image_plain
Definition: kernel_bundle.hpp:83
sycl::_V1::detail::ExecCGCommand::clearAuxiliaryResources
void clearAuxiliaryResources()
Definition: commands.cpp:1558
sycl::_V1::detail::code_location
Definition: common.hpp:66
piEnqueueEventsWait
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:634
backend_types.hpp
sycl::_V1::detail::Command::MTraceEvent
void * MTraceEvent
The event for node_create and task_begin.
Definition: commands.hpp:340
sycl::_V1::detail::Command::MCommandName
std::string MCommandName
Buffer to build the command end-user understandable name.
Definition: commands.hpp:353
sycl::_V1::detail::DispatchHostTask::operator()
void operator()() const
Definition: commands.cpp:348
sycl::_V1::detail::NDRDescT::GlobalOffset
sycl::id< 3 > GlobalOffset
Definition: cg_types.hpp:130
piextEnqueueReadHostPipe
pi_result piextEnqueueReadHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Plugin.
Definition: pi_cuda.cpp:981
sycl::_V1::detail::enqueueImpCommandBufferKernel
pi_int32 enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const CGExecKernel &CommandGroup, std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2377
sycl::_V1::detail::Command::DispatchHostTask
friend class DispatchHostTask
Definition: commands.hpp:282
_pi_image_desc
Definition: pi.h:1101
sycl::_V1::detail::AllocaCommandBase::AllocaCommandBase
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
Definition: commands.cpp:965
sycl::_V1::detail::AllocaCommandBase::getMemAllocation
virtual void * getMemAllocation() const =0
sycl::_V1::detail::HostPipeMapEntry
Definition: host_pipe_map_entry.hpp:19
std
Definition: accessor.hpp:4139
sycl::_V1::detail::kernel_param_kind_t::kind_sampler
@ kind_sampler
kernel_desc.hpp
piextEnqueueCommandBuffer
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to submit the command-buffer to queue for execution, returns an error if the command-buffer is no...
Definition: pi_cuda.cpp:1140
sycl::_V1::detail::MemoryManager::release
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
Definition: memory_manager.cpp:236
sycl::_V1::detail::BLOCKING
@ BLOCKING
Definition: commands.hpp:51
sycl::_V1::detail::ExecCGCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:2120
_pi_event
Definition: pi_cuda.hpp:64
sycl::_V1::detail::Command::MWorkerQueue
QueueImplPtr MWorkerQueue
Definition: commands.hpp:247
sycl::_V1::detail::Command::MMarkedForCleanup
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
Definition: commands.hpp:380
sycl::_V1::detail::ReleaseCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1275
sycl::_V1::detail::ExecCGCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:3121
sycl::_V1::detail::Scheduler::getDefaultHostQueue
QueueImplPtr getDefaultHostQueue()
Definition: scheduler.hpp:444
sycl::_V1::detail::Command::makeTraceEventProlog
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:640
sycl::_V1::device::is_cpu
bool is_cpu() const
Check if device is a CPU device.
Definition: device.cpp:85
sycl::_V1::detail::code_location::fileName
constexpr const char * fileName() const noexcept
Definition: common.hpp:89
sycl::_V1::detail::ArgDesc
Definition: cg_types.hpp:39
sycl::_V1::detail::CG::CopyUSM
@ CopyUSM
Definition: cg.hpp:62
sycl::_V1::detail::QueueImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: event_impl.hpp:35
sycl::_V1::detail::EmptyCommand::EmptyCommand
EmptyCommand(QueueImplPtr Queue)
Definition: commands.cpp:1680
sycl::_V1::detail::Scheduler::GraphBuilder::connectDepEvent
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Definition: graph_builder.cpp:1307
sycl::_V1::detail::KernelFusionCommand::addToFusionList
void addToFusionList(ExecCGCommand *Kernel)
Definition: commands.cpp:3148
sycl::_V1::detail::ExecCGCommand::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:3126
_pi_mem_obj_access
_pi_mem_obj_access
Definition: pi.h:1814
sycl::_V1::detail::CG::FillUSM
@ FillUSM
Definition: cg.hpp:63
sycl::_V1::detail::MemoryManager::ext_oneapi_copyD2H_cmd_buffer
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
Definition: memory_manager.cpp:1491
sycl::_V1::detail::Scheduler::getInstance
static Scheduler & getInstance()
Definition: scheduler.cpp:260
kernel_bundle_impl.hpp
sycl::_V1::detail::Command::addDep
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:738
sycl::_V1::detail::KernelFusionCommand::KernelFusionCommand
KernelFusionCommand(QueueImplPtr Queue)
Definition: commands.cpp:3138
sycl::_V1::detail::Command::MAddressString
std::string MAddressString
Buffer to build the address string.
Definition: commands.hpp:349
sycl::_V1::detail::MapMemObject::MapMemObject
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
Definition: commands.cpp:1281
piextKernelSetArgPointer
pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Sets up pointer arguments for CL kernels.
Definition: pi_cuda.cpp:868
sycl::_V1::detail::AccessorImplHost::MOffsetInBytes
size_t MOffsetInBytes
Definition: accessor_impl.hpp:136
sycl::_V1::detail::AllocaCommandBase::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:999
_pi_mem_obj_property
Definition: pi.h:1828
sycl::_V1::detail::CG::CopyAccToAcc
@ CopyAccToAcc
Definition: cg.hpp:57
sycl::_V1::detail::AccessorImplHost::MData
void *& MData
Definition: accessor_impl.hpp:139
sycl::_V1::detail::MemCpyCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1519
sycl::_V1::detail::AllocaCommandBase::getSYCLMemObj
SYCLMemObjI * getSYCLMemObj() const
Definition: commands.hpp:454
kernel_impl.hpp
sycl::_V1::detail::NDRDescT::set
void set(sycl::range< Dims_ > NumWorkItems)
Definition: cg_types.hpp:69
sycl::_V1::detail::Command::MPreparedHostDepsEvents
std::vector< EventImplPtr > & MPreparedHostDepsEvents
Definition: commands.hpp:252
sycl::_V1::detail::AccessorImplHost::MAccessRange
range< 3 > & MAccessRange
Definition: accessor_impl.hpp:127
sycl::_V1::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:278
sycl::_V1::detail::Command::Command
Command(CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
Definition: commands.cpp:490
sycl::_V1::detail::AccessorImplHost::MDims
unsigned int MDims
Definition: accessor_impl.hpp:134
sycl::_V1::detail::AllocaCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1021
sycl::_V1::detail::CG::CodeplayHostTask
@ CodeplayHostTask
Definition: cg.hpp:65
sycl::_V1::detail::Command::enqueue
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
Definition: commands.cpp:803
sycl::_V1::detail::MemoryManager::copy
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:753
sycl::_V1::detail::Command::MPreparedDepsEvents
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Definition: commands.hpp:251
sycl::_V1::detail::Command::MStreamID
int32_t MStreamID
The stream under which the traces are emitted.
Definition: commands.hpp:344
sycl::_V1::detail::ReleaseCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1257
sycl::_V1::detail::AllocaCommandBase::MRequirement
Requirement MRequirement
Definition: commands.hpp:486
sycl::_V1::detail::enqueueReadWriteHostPipe
pi_int32 enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
Definition: commands.cpp:2573
sycl::_V1::detail::AllocaCommandBase
Base class for memory allocation commands.
Definition: commands.hpp:447
sycl::_V1::detail::ArgDesc::MPtr
void * MPtr
Definition: cg_types.hpp:46
PI_ACCESS_WRITE_ONLY
@ PI_ACCESS_WRITE_ONLY
Definition: pi.h:1816
sycl::_V1::interop_handle
Definition: interop_handle.hpp:48
sycl::_V1::detail::Command::getWorkerQueue
const QueueImplPtr & getWorkerQueue() const
Get the queue this command will be submitted to.
Definition: commands.cpp:724
sycl::_V1::access::mode::read_write
@ read_write
sycl::_V1::detail::KernelFusionCommand::FusionStatus
FusionStatus
Definition: commands.hpp:711
_pi_kernel_cache_config
_pi_kernel_cache_config
Definition: pi.h:744
sycl::_V1::access::mode::write
@ write
sycl::_V1::detail::MemCpyCommand::getWorkerContext
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1474
sycl::_V1::detail::MemoryManager::copy_2d_usm
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:1058
sycl::_V1::detail::Command::MType
CommandType MType
The type of the command.
Definition: commands.hpp:278
sycl::_V1::detail::KernelFusionCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:3180
sycl::_V1::detail::UnMapMemObject::UnMapMemObject
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
Definition: commands.cpp:1344
sycl::_V1::detail::enqueue_kernel_launch::handleErrorOrWarning
void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of piEnqueueKernelLaunch to emit user-friendly exception describing...
Definition: error_handling.cpp:350
sycl::_V1::detail::Command::MCommandBuffer
sycl::detail::pi::PiExtCommandBuffer MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
Definition: commands.hpp:398
sycl::_V1::detail::SetArgBasedOnType
void SetArgBasedOnType(const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, size_t NextTrueIndex)
Definition: commands.cpp:2212
sycl::_V1::errc::kernel
@ kernel
sycl::_V1::detail::MemCpyCommandHost::getWorkerContext
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1648
piextCommandBufferNDRangeKernel
pi_result piextCommandBufferNDRangeKernel(pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point)
API to append a kernel execution command to the command-buffer.
Definition: pi_cuda.cpp:1055
sycl::_V1::detail::NDRDescT::Dims
size_t Dims
Definition: cg_types.hpp:135
sycl::_V1::detail::Command::producesPiEvent
virtual bool producesPiEvent() const
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:729
sycl::_V1::detail::CGExecKernel::MArgs
std::vector< ArgDesc > MArgs
Definition: cg.hpp:170
sycl::_V1::access::mode::read
@ read
stream_impl.hpp
sycl::_V1::detail::get_kernel_device_specific_info
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(sycl::detail::pi::PiKernel Kernel, sycl::detail::pi::PiDevice Device, const PluginPtr &Plugin)
Definition: kernel_info.hpp:89
sycl::_V1::detail::Command::MEnqueueStatus
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
Definition: commands.hpp:332
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:489
sycl::_V1::detail::CG::UpdateHost
@ UpdateHost
Definition: cg.hpp:61
pi_int32
int32_t pi_int32
Definition: pi.h:193
sycl::_V1::detail::emitKernelInstrumentationData
void emitKernelInstrumentationData(const std::shared_ptr< detail::kernel_impl > &SyclKernel, const detail::code_location &CodeLoc, const std::string &SyclKernelName, const QueueImplPtr &Queue, const NDRDescT &NDRDesc, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, std::vector< ArgDesc > &CGArgs)
Definition: commands.cpp:2026
sycl::_V1::access::mode::discard_write
@ discard_write
sycl::_V1::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
sycl::_V1::detail::enqueueImpKernel
pi_int32 enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig)
Definition: commands.cpp:2453
sycl::_V1::detail::applyFuncOnFilteredArgs
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
Definition: commands.cpp:100
sampler_impl.hpp
sycl::_V1::detail::KernelFusionCommand::resetQueue
void resetQueue()
Reset the queue.
Definition: commands.cpp:3172
sycl::_V1::detail::Command::ALLOCA_SUB_BUF
@ ALLOCA_SUB_BUF
Definition: commands.hpp:105
sycl::_V1::detail::ArgDesc::MIndex
int MIndex
Definition: cg_types.hpp:48
sycl::_V1::detail::Command::getWorkerContext
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:720
sycl::_V1::context
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:51
sycl::_V1::detail::MemoryManager::copy_usm
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
Definition: memory_manager.cpp:931
sycl::_V1::detail::PluginPtr
std::shared_ptr< plugin > PluginPtr
Definition: pi.hpp:48
sycl::_V1::detail::MemoryManager::copy_image_bindless
static void copy_image_bindless(void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, const sycl::detail::pi::PiMemImageFormat &Format, const sycl::detail::pi::PiImageCopyFlags Flags, sycl::detail::pi::PiImageOffset SrcOffset, sycl::detail::pi::PiImageOffset DstOffset, sycl::detail::pi::PiImageRegion CopyExtent, sycl::detail::pi::PiImageRegion HostExtent, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent)
Definition: memory_manager.cpp:1622
memory_manager.hpp
sycl::_V1::detail::Command::enqueueImp
virtual pi_int32 enqueueImp()=0
Private interface. Derived classes should implement this method.