DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
Go to the documentation of this file.
1 //===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 
11 #include <detail/context_impl.hpp>
12 #include <detail/event_impl.hpp>
14 #include <detail/kernel_impl.hpp>
15 #include <detail/kernel_info.hpp>
17 #include <detail/program_impl.hpp>
19 #include <detail/queue_impl.hpp>
20 #include <detail/sampler_impl.hpp>
23 #include <detail/stream_impl.hpp>
24 #include <detail/xpti_registry.hpp>
25 #include <sycl/access/access.hpp>
26 #include <sycl/backend_types.hpp>
27 #include <sycl/detail/cg_types.hpp>
29 #include <sycl/sampler.hpp>
30 
31 #include <cassert>
32 #include <optional>
33 #include <string>
34 #include <vector>
35 
36 #ifdef __has_include
37 #if __has_include(<cxxabi.h>)
38 #define __SYCL_ENABLE_GNU_DEMANGLING
39 #include <cstdlib>
40 #include <cxxabi.h>
41 #include <memory>
42 #endif
43 #endif
44 
45 #ifdef XPTI_ENABLE_INSTRUMENTATION
46 #include "xpti/xpti_trace_framework.hpp"
47 #include <detail/xpti_registry.hpp>
48 #endif
49 
50 namespace sycl {
52 namespace detail {
53 
54 #ifdef XPTI_ENABLE_INSTRUMENTATION
55 // Global graph for the application
56 extern xpti::trace_event_data_t *GSYCLGraphEvent;
57 
58 bool CurrentCodeLocationValid() {
59  detail::tls_code_loc_t Tls;
60  auto CodeLoc = Tls.query();
61  auto FileName = CodeLoc.fileName();
62  auto FunctionName = CodeLoc.functionName();
63  return (FileName && FileName[0] != '\0') ||
64  (FunctionName && FunctionName[0] != '\0');
65 }
66 #endif
67 
68 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
69 struct DemangleHandle {
70  char *p;
71  DemangleHandle(char *ptr) : p(ptr) {}
72  ~DemangleHandle() { std::free(p); }
73 };
74 static std::string demangleKernelName(std::string Name) {
75  int Status = -1; // some arbitrary value to eliminate the compiler warning
76  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
77  return (Status == 0) ? result.p : Name;
78 }
79 #else
80 static std::string demangleKernelName(std::string Name) { return Name; }
81 #endif
82 
83 static std::string deviceToString(device Device) {
84  if (getSyclObjImpl(Device)->is_host())
85  return "HOST";
86  else if (Device.is_cpu())
87  return "CPU";
88  else if (Device.is_gpu())
89  return "GPU";
90  else if (Device.is_accelerator())
91  return "ACCELERATOR";
92  else
93  return "UNKNOWN";
94 }
95 
97  const ProgramManager::KernelArgMask &EliminatedArgMask,
98  std::vector<ArgDesc> &Args,
99  std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
100  if (EliminatedArgMask.empty()) {
101  for (ArgDesc &Arg : Args) {
102  Func(Arg, Arg.MIndex);
103  }
104  } else {
105  // TODO this is not necessary as long as we can guarantee that the
106  // arguments are already sorted (e. g. handle the sorting in handler
107  // if necessary due to set_arg(...) usage).
108  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
109  return A.MIndex < B.MIndex;
110  });
111  int LastIndex = -1;
112  size_t NextTrueIndex = 0;
113 
114  for (ArgDesc &Arg : Args) {
115  // Handle potential gaps in set arguments (e. g. if some of them are
116  // set on the user side).
117  for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
118  if (!EliminatedArgMask[Idx])
119  ++NextTrueIndex;
120  LastIndex = Arg.MIndex;
121 
122  if (EliminatedArgMask[Arg.MIndex])
123  continue;
124 
125  Func(Arg, NextTrueIndex);
126  ++NextTrueIndex;
127  }
128  }
129 }
130 
131 #ifdef XPTI_ENABLE_INSTRUMENTATION
132 static size_t deviceToID(const device &Device) {
133  if (getSyclObjImpl(Device)->is_host())
134  return 0;
135  else
136  return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
137 }
138 #endif
139 
140 static std::string accessModeToString(access::mode Mode) {
141  switch (Mode) {
142  case access::mode::read:
143  return "read";
144  case access::mode::write:
145  return "write";
147  return "read_write";
148  case access::mode::discard_write:
149  return "discard_write";
150  case access::mode::discard_read_write:
151  return "discard_read_write";
152  default:
153  return "unknown";
154  }
155 }
156 
157 #ifdef XPTI_ENABLE_INSTRUMENTATION
158 // Using the command group type to create node types for the asynchronous task
159 // graph modeling
160 static std::string commandToNodeType(Command::CommandType Type) {
161  switch (Type) {
162  case Command::CommandType::RUN_CG:
163  return "command_group_node";
164  case Command::CommandType::COPY_MEMORY:
165  return "memory_transfer_node";
166  case Command::CommandType::ALLOCA:
167  return "memory_allocation_node";
168  case Command::CommandType::ALLOCA_SUB_BUF:
169  return "sub_buffer_creation_node";
170  case Command::CommandType::RELEASE:
171  return "memory_deallocation_node";
172  case Command::CommandType::MAP_MEM_OBJ:
173  return "memory_transfer_node";
174  case Command::CommandType::UNMAP_MEM_OBJ:
175  return "memory_transfer_node";
176  case Command::CommandType::UPDATE_REQUIREMENT:
177  return "host_acc_create_buffer_lock_node";
178  case Command::CommandType::EMPTY_TASK:
179  return "host_acc_destroy_buffer_release_node";
180  case Command::CommandType::FUSION:
181  return "kernel_fusion_placeholder_node";
182  default:
183  return "unknown_node";
184  }
185 }
186 
187 // Using the names being generated and the string are subject to change to
188 // something more meaningful to end-users as this will be visible in analysis
189 // tools that subscribe to this data
190 static std::string commandToName(Command::CommandType Type) {
191  switch (Type) {
192  case Command::CommandType::RUN_CG:
193  return "Command Group Action";
194  case Command::CommandType::COPY_MEMORY:
195  return "Memory Transfer (Copy)";
196  case Command::CommandType::ALLOCA:
197  return "Memory Allocation";
198  case Command::CommandType::ALLOCA_SUB_BUF:
199  return "Sub Buffer Creation";
200  case Command::CommandType::RELEASE:
201  return "Memory Deallocation";
202  case Command::CommandType::MAP_MEM_OBJ:
203  return "Memory Transfer (Map)";
204  case Command::CommandType::UNMAP_MEM_OBJ:
205  return "Memory Transfer (Unmap)";
206  case Command::CommandType::UPDATE_REQUIREMENT:
207  return "Host Accessor Creation/Buffer Lock";
208  case Command::CommandType::EMPTY_TASK:
209  return "Host Accessor Destruction/Buffer Lock Release";
210  case Command::CommandType::FUSION:
211  return "Kernel Fusion Placeholder";
212  default:
213  return "Unknown Action";
214  }
215 }
216 #endif
217 
218 std::vector<RT::PiEvent>
219 Command::getPiEvents(const std::vector<EventImplPtr> &EventImpls) const {
220  std::vector<RT::PiEvent> RetPiEvents;
221  for (auto &EventImpl : EventImpls) {
222  if (EventImpl->getHandleRef() == nullptr)
223  continue;
224 
225  // Do not add redundant event dependencies for in-order queues.
226  // At this stage dependency is definitely pi task and need to check if
227  // current one is a host task. In this case we should not skip pi event due
228  // to different sync mechanisms for different task types on in-order queue.
229  const QueueImplPtr &WorkerQueue = getWorkerQueue();
230  // MWorkerQueue in command is always not null. So check if
231  // EventImpl->getWorkerQueue != nullptr is implicit.
232  if (EventImpl->getWorkerQueue() == WorkerQueue &&
233  WorkerQueue->isInOrder() && !isHostTask())
234  continue;
235 
236  RetPiEvents.push_back(EventImpl->getHandleRef());
237  }
238 
239  return RetPiEvents;
240 }
241 
242 // This function is implemented (duplicating getPiEvents a lot) as short term
243 // solution for the issue that barrier with wait list could not
244 // handle empty pi event handles when kernel is enqueued on host task
245 // completion.
246 std::vector<RT::PiEvent> Command::getPiEventsBlocking(
247  const std::vector<EventImplPtr> &EventImpls) const {
248  std::vector<RT::PiEvent> RetPiEvents;
249  for (auto &EventImpl : EventImpls) {
250  // Throwaway events created with empty constructor will not have a context
251  // (which is set lazily) calling getContextImpl() would set that
252  // context, which we wish to avoid as it is expensive.
253  // Skip host task also.
254  if (!EventImpl->isContextInitialized() || EventImpl->is_host())
255  continue;
256  // In this path nullptr native event means that the command has not been
257  // enqueued. It may happen if async enqueue in a host task is involved.
258  if (EventImpl->getHandleRef() == nullptr) {
259  if (!EventImpl->getCommand() ||
260  !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
261  continue;
262  std::vector<Command *> AuxCmds;
263  Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
264  BLOCKING);
265  }
266  // Do not add redundant event dependencies for in-order queues.
267  // At this stage dependency is definitely pi task and need to check if
268  // current one is a host task. In this case we should not skip pi event due
269  // to different sync mechanisms for different task types on in-order queue.
270  const QueueImplPtr &WorkerQueue = getWorkerQueue();
271  // MWorkerQueue in command is always not null. So check if
272  // EventImpl->getWorkerQueue != nullptr is implicit.
273  if (EventImpl->getWorkerQueue() == WorkerQueue &&
274  WorkerQueue->isInOrder() && !isHostTask())
275  continue;
276 
277  RetPiEvents.push_back(EventImpl->getHandleRef());
278  }
279 
280  return RetPiEvents;
281 }
282 
283 bool Command::isHostTask() const {
284  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
285  ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
286  CG::CGTYPE::CodeplayHostTask);
287 }
288 
289 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
290  const QueueImplPtr &Queue) {
291  for (auto &EventImpl : EventImpls) {
292  EventImpl->flushIfNeeded(Queue);
293  }
294 }
295 
297  ExecCGCommand *MThisCmd;
298  std::vector<interop_handle::ReqToMem> MReqToMem;
299 
300  pi_result waitForEvents() const {
301  std::map<const detail::plugin *, std::vector<EventImplPtr>>
302  RequiredEventsPerPlugin;
303 
304  for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
305  const detail::plugin &Plugin = Event->getPlugin();
306  RequiredEventsPerPlugin[&Plugin].push_back(Event);
307  }
308 
309  // wait for dependency device events
310  // FIXME Current implementation of waiting for events will make the thread
311  // 'sleep' until all of dependency events are complete. We need a bit more
312  // sophisticated waiting mechanism to allow to utilize this thread for any
313  // other available job and resume once all required events are ready.
314  for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
315  std::vector<RT::PiEvent> RawEvents =
316  MThisCmd->getPiEvents(PluginWithEvents.second);
317  try {
318  PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
319  RawEvents.data());
320  } catch (const sycl::exception &E) {
321  CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
322  HostTask.MQueue->reportAsyncException(std::current_exception());
323  return (pi_result)E.get_cl_code();
324  } catch (...) {
325  CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
326  HostTask.MQueue->reportAsyncException(std::current_exception());
327  return PI_ERROR_UNKNOWN;
328  }
329  }
330 
331  // Wait for dependency host events.
332  // Host events can't throw exceptions so don't try to catch it.
333  for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
334  Event->waitInternal();
335  }
336 
337  return PI_SUCCESS;
338  }
339 
340 public:
342  std::vector<interop_handle::ReqToMem> ReqToMem)
343  : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}
344 
345  void operator()() const {
346  assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);
347 
348  CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
349 
350 #ifdef XPTI_ENABLE_INSTRUMENTATION
351  // Host task is executed async and in a separate thread that do not allow to
352  // use code location data stored in TLS. So we keep submission code location
353  // as Command field and put it here to TLS so that thrown exception could
354  // query and report it.
355  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
356  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
357  AsyncCodeLocationPtr.reset(
359  }
360 #endif
361 
362  pi_result WaitResult = waitForEvents();
363  if (WaitResult != PI_SUCCESS) {
364  std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
365  std::string("Couldn't wait for host-task's dependencies"),
366  WaitResult));
367  HostTask.MQueue->reportAsyncException(EPtr);
368  // reset host-task's lambda and quit
369  HostTask.MHostTask.reset();
370  Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
371  return;
372  }
373 
374  try {
375  // we're ready to call the user-defined lambda now
376  if (HostTask.MHostTask->isInteropTask()) {
377  interop_handle IH{MReqToMem, HostTask.MQueue,
378  HostTask.MQueue->getDeviceImplPtr(),
379  HostTask.MQueue->getContextImplPtr()};
380 
381  HostTask.MHostTask->call(IH);
382  } else
383  HostTask.MHostTask->call();
384  } catch (...) {
385  auto CurrentException = std::current_exception();
386 #ifdef XPTI_ENABLE_INSTRUMENTATION
387  // sycl::exception emit tracing of message with code location if
388  // available. For other types of exception we need to explicitly trigger
389  // tracing by calling TraceEventXPTI.
390  if (xptiTraceEnabled()) {
391  try {
392  rethrow_exception(CurrentException);
393  } catch (const sycl::exception &) {
394  // it is already traced, nothing to care about
395  } catch (const std::exception &StdException) {
396  GlobalHandler::instance().TraceEventXPTI(StdException.what());
397  } catch (...) {
398  GlobalHandler::instance().TraceEventXPTI(
399  "Host task lambda thrown non standard exception");
400  }
401  }
402 #endif
403  HostTask.MQueue->reportAsyncException(CurrentException);
404  }
405 
406  HostTask.MHostTask.reset();
407 
408 #ifdef XPTI_ENABLE_INSTRUMENTATION
409  // Host Task is done, clear its submittion location to not interfere with
410  // following dependent kernels submission.
411  AsyncCodeLocationPtr.reset();
412 #endif
413 
414  try {
415  // If we enqueue blocked users - pi level could throw exception that
416  // should be treated as async now.
417  Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
418  } catch (...) {
419  auto CurrentException = std::current_exception();
420  HostTask.MQueue->reportAsyncException(CurrentException);
421  }
422  }
423 };
424 
425 void Command::waitForPreparedHostEvents() const {
426  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
427  HostEvent->waitInternal();
428 }
429 
431  std::vector<EventImplPtr> &EventImpls,
432  RT::PiEvent &Event) {
433 
434  if (!EventImpls.empty()) {
435  if (Queue->is_host()) {
436  // Host queue can wait for events from different contexts, i.e. it may
437  // contain events with different contexts in its MPreparedDepsEvents.
438  // OpenCL 2.1 spec says that clWaitForEvents will return
439  // CL_INVALID_CONTEXT if events specified in the list do not belong to
440  // the same context. Thus we split all the events into per-context map.
441  // An example. We have two queues for the same CPU device: Q1, Q2. Thus
442  // we will have two different contexts for the same CPU device: C1, C2.
443  // Also we have default host queue. This queue is accessible via
444  // Scheduler. Now, let's assume we have three different events: E1(C1),
445  // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
446  // three events (E1, E2, E3). Now, if piEventsWait is called for all
447  // three events we'll experience failure with CL_INVALID_CONTEXT 'cause
448  // these events refer to different contexts.
449  std::map<context_impl *, std::vector<EventImplPtr>>
450  RequiredEventsPerContext;
451 
452  for (const EventImplPtr &Event : EventImpls) {
453  ContextImplPtr Context = Event->getContextImpl();
454  assert(Context.get() &&
455  "Only non-host events are expected to be waited for here");
456  RequiredEventsPerContext[Context.get()].push_back(Event);
457  }
458 
459  for (auto &CtxWithEvents : RequiredEventsPerContext) {
460  std::vector<RT::PiEvent> RawEvents = getPiEvents(CtxWithEvents.second);
461  CtxWithEvents.first->getPlugin().call<PiApiKind::piEventsWait>(
462  RawEvents.size(), RawEvents.data());
463  }
464  } else {
465 #ifndef NDEBUG
466  for (const EventImplPtr &Event : EventImpls)
467  assert(Event->getContextImpl().get() &&
468  "Only non-host events are expected to be waited for here");
469 #endif
470 
471  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
472  flushCrossQueueDeps(EventImpls, getWorkerQueue());
473  const detail::plugin &Plugin = Queue->getPlugin();
475  Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
476  }
477  }
478 }
479 
483 Command::Command(CommandType Type, QueueImplPtr Queue)
484  : MQueue(std::move(Queue)),
485  MEvent(std::make_shared<detail::event_impl>(MQueue)),
486  MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
487  MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()),
488  MType(Type) {
490  MEvent->setWorkerQueue(MWorkerQueue);
491  MEvent->setSubmittedQueue(MWorkerQueue);
492  MEvent->setCommand(this);
493  MEvent->setContextImpl(MQueue->getContextImplPtr());
494  MEvent->setStateIncomplete();
496 
497 #ifdef XPTI_ENABLE_INSTRUMENTATION
498  if (!xptiTraceEnabled())
499  return;
500  // Obtain the stream ID so all commands can emit traces to that stream
501  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
502 #endif
503 }
504 
506 #ifdef XPTI_ENABLE_INSTRUMENTATION
508 #endif
509 }
510 
522  Command *Cmd, void *ObjAddr, bool IsCommand,
523  std::optional<access::mode> AccMode) {
524 #ifdef XPTI_ENABLE_INSTRUMENTATION
525  // Bail early if either the source or the target node for the given
526  // dependency is undefined or NULL
527  if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
528  return;
529 
530  // If all the information we need for creating an edge event is available,
531  // then go ahead with creating it; if not, bail early!
532  xpti::utils::StringHelper SH;
533  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
534  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
535  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
536  // Create an edge with the dependent buffer address for which a command
537  // object has been created as one of the properties of the edge
538  xpti::payload_t Payload(TypeString.c_str(), MAddress);
539  uint64_t EdgeInstanceNo;
540  xpti_td *EdgeEvent =
541  xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
542  xpti_at::active, &EdgeInstanceNo);
543  if (EdgeEvent) {
544  xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
545  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
546  EdgeEvent->source_id = SrcEvent->unique_id;
547  EdgeEvent->target_id = TgtEvent->unique_id;
548  if (IsCommand) {
549  xpti::addMetadata(EdgeEvent, "access_mode",
550  static_cast<int>(AccMode.value()));
551  xpti::addMetadata(EdgeEvent, "memory_object",
552  reinterpret_cast<size_t>(ObjAddr));
553  } else {
554  xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
555  }
556  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
557  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
558  nullptr);
559  }
560  // General comment - None of these are serious errors as the instrumentation
561  // layer MUST be tolerant of errors. If we need to let the end user know, we
562  // throw exceptions in the future
563 #endif
564 }
565 
571  RT::PiEvent &PiEventAddr) {
572 #ifdef XPTI_ENABLE_INSTRUMENTATION
573  // If we have failed to create an event to represent the Command, then we
574  // cannot emit an edge event. Bail early!
575  if (!(xptiTraceEnabled() && MTraceEvent))
576  return;
577 
578  if (Cmd && Cmd->MTraceEvent) {
579  // If the event is associated with a command, we use this command's trace
580  // event as the source of edge, hence modeling the control flow
581  emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, false);
582  return;
583  }
584  if (PiEventAddr) {
585  xpti::utils::StringHelper SH;
586  std::string AddressStr = SH.addressAsString<RT::PiEvent>(PiEventAddr);
587  // This is the case when it is a OCL event enqueued by the user or another
588  // event is registered by the runtime as a dependency The dependency on
589  // this occasion is an OCL event; so we build a virtual node in the graph
590  // with the event as the metadata for the node
591  std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
592  // Node name is "virtual_node[<event_addr>]"
593  xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
594  uint64_t VNodeInstanceNo;
595  xpti_td *NodeEvent =
596  xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
597  xpti_at::active, &VNodeInstanceNo);
598  // Emit the virtual node first
599  xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
600  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
601  detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
602  nullptr);
603  // Create a new event for the edge
604  std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
605  xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
606  uint64_t EdgeInstanceNo;
607  xpti_td *EdgeEvent =
608  xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
609  xpti_at::active, &EdgeInstanceNo);
610  if (EdgeEvent && NodeEvent) {
611  // Source node represents the event and this event needs to be completed
612  // before target node can execute
613  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
614  EdgeEvent->source_id = NodeEvent->unique_id;
615  EdgeEvent->target_id = TgtEvent->unique_id;
616  xpti::addMetadata(EdgeEvent, "event",
617  reinterpret_cast<size_t>(PiEventAddr));
618  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
619  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
620  nullptr);
621  }
622  return;
623  }
624 #endif
625 }
626 
627 uint64_t Command::makeTraceEventProlog(void *MAddress) {
628  uint64_t CommandInstanceNo = 0;
629 #ifdef XPTI_ENABLE_INSTRUMENTATION
630  if (!xptiTraceEnabled())
631  return CommandInstanceNo;
632 
634  // Setup the member variables with information needed for event notification
635  MCommandNodeType = commandToNodeType(MType);
636  MCommandName = commandToName(MType);
637  xpti::utils::StringHelper SH;
638  MAddressString = SH.addressAsString<void *>(MAddress);
639  std::string CommandString =
640  SH.nameWithAddressString(MCommandName, MAddressString);
641 
642  xpti::payload_t p(CommandString.c_str(), MAddress);
643  xpti_td *CmdTraceEvent =
644  xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
645  xpti_at::active, &CommandInstanceNo);
646  MInstanceID = CommandInstanceNo;
647  if (CmdTraceEvent) {
648  MTraceEvent = (void *)CmdTraceEvent;
649  // If we are seeing this event again, then the instance ID will be greater
650  // than 1; in this case, we must skip sending a notification to create a
651  // node as this node has already been created. We return this value so the
652  // epilog method can be called selectively.
653  MFirstInstance = (CommandInstanceNo == 1);
654  }
655 #endif
656  return CommandInstanceNo;
657 }
658 
660 #ifdef XPTI_ENABLE_INSTRUMENTATION
661  if (!(xptiTraceEnabled() && MTraceEvent))
662  return;
664  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
665  detail::GSYCLGraphEvent,
666  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
667  static_cast<const void *>(MCommandNodeType.c_str()));
668 #endif
669 }
670 
672  std::vector<Command *> &ToCleanUp) {
673  const QueueImplPtr &WorkerQueue = getWorkerQueue();
674  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
675 
676  // 1. Async work is not supported for host device.
677  // 2. Non-host events can be ignored if they are not fully initialized.
678  // 3. Some types of commands do not produce PI events after they are
679  // enqueued
680  // (e.g. alloca). Note that we can't check the pi event to make that
681  // distinction since the command might still be unenqueued at this point.
682  bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
683  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
684  PiEventExpected &= DepCmd->producesPiEvent();
685 
686  if (!PiEventExpected) {
687  // call to waitInternal() is in waitForPreparedHostEvents() as it's called
688  // from enqueue process functions
689  MPreparedHostDepsEvents.push_back(DepEvent);
690  return nullptr;
691  }
692 
693  Command *ConnectionCmd = nullptr;
694 
695  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
696  // If contexts don't match we'll connect them using host task
697  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
699  ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
700  } else
701  MPreparedDepsEvents.push_back(std::move(DepEvent));
702 
703  return ConnectionCmd;
704 }
705 
707  return MQueue->getContextImplPtr();
708 }
709 
711  assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
712  return MWorkerQueue;
713 }
714 
715 bool Command::producesPiEvent() const { return true; }
716 
717 bool Command::supportsPostEnqueueCleanup() const { return true; }
718 
720  return MLeafCounter == 0 &&
722 }
723 
724 Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
725  Command *ConnectionCmd = nullptr;
726 
727  if (NewDep.MDepCommand) {
728  ConnectionCmd =
729  processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
730  }
731  // ConnectionCmd insertion builds the following dependency structure:
732  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep
733  // that means that this and NewDep are already dependent
734  if (!ConnectionCmd) {
735  MDeps.push_back(NewDep);
736  if (NewDep.MDepCommand)
737  NewDep.MDepCommand->addUser(this);
738  }
739 
740 #ifdef XPTI_ENABLE_INSTRUMENTATION
742  (void *)NewDep.MDepRequirement->MSYCLMemObj,
743  true, NewDep.MDepRequirement->MAccessMode);
744 #endif
745 
746  return ConnectionCmd;
747 }
748 
750  std::vector<Command *> &ToCleanUp) {
751 #ifdef XPTI_ENABLE_INSTRUMENTATION
752  // We need this for just the instrumentation, so guarding it will prevent
753  // unused variable warnings when instrumentation is turned off
754  Command *Cmd = (Command *)Event->getCommand();
755  RT::PiEvent &PiEventAddr = Event->getHandleRef();
756  // Now make an edge for the dependent event
757  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
758 #endif
759 
760  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
761  ToCleanUp);
762 }
763 
765 #ifdef XPTI_ENABLE_INSTRUMENTATION
766  if (!(xptiTraceEnabled() && MTraceEvent && PiEventAddr))
767  return;
768  // Asynchronous call, so send a signal with the event information as
769  // user_data
770  xptiNotifySubscribers(MStreamID, xpti::trace_signal, detail::GSYCLGraphEvent,
771  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
772  (void *)PiEventAddr);
773 #endif
774 }
775 
776 void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
777 #ifdef XPTI_ENABLE_INSTRUMENTATION
778  if (!(xptiTraceEnabled() && MTraceEvent))
779  return;
780  // Trace event notifier that emits a Type event
781  xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
782  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
783  static_cast<const void *>(Txt));
784 #endif
785 }
786 
787 bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
788  std::vector<Command *> &ToCleanUp) {
789 #ifdef XPTI_ENABLE_INSTRUMENTATION
790  // If command is enqueued from host task thread - it will not have valid
791  // submission code location set. So we set it manually to properly trace
792  // failures if pi level report any.
793  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
794  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
795  AsyncCodeLocationPtr.reset(
797  }
798 #endif
799  // Exit if already enqueued
801  return true;
802 
803  // If the command is blocked from enqueueing
805  // Exit if enqueue type is not blocking
806  if (!Blocking) {
807  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
808  return false;
809  }
810 
811 #ifdef XPTI_ENABLE_INSTRUMENTATION
812  // Scoped trace event notifier that emits a barrier begin and barrier end
813  // event, which models the barrier while enqueuing along with the blocked
814  // reason, as determined by the scheduler
815  std::string Info = "enqueue.barrier[";
816  Info += std::string(getBlockReason()) + "]";
817  emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
818 #endif
819 
820  // Wait if blocking
822  ;
823 #ifdef XPTI_ENABLE_INSTRUMENTATION
824  emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
825 #endif
826  }
827 
828  std::lock_guard<std::mutex> Lock(MEnqueueMtx);
829 
830  // Exit if the command is already enqueued
832  return true;
833 
834 #ifdef XPTI_ENABLE_INSTRUMENTATION
835  emitInstrumentation(xpti::trace_task_begin, nullptr);
836 #endif
837 
839  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
840  return false;
841  }
842 
843  // Command status set to "failed" beforehand, so this command
844  // has already been marked as "failed" if enqueueImp throws an exception.
845  // This will avoid execution of the same failed command twice.
848  pi_int32 Res = enqueueImp();
849 
850  if (PI_SUCCESS != Res)
851  EnqueueResult =
853  else {
855  (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
856  MEvent->setComplete();
857 
858  // Consider the command is successfully enqueued if return code is
859  // PI_SUCCESS
864  assert(!MMarkedForCleanup);
865  MMarkedForCleanup = true;
866  ToCleanUp.push_back(this);
867  }
868  }
869 
870  // Emit this correlation signal before the task end
871  emitEnqueuedEventSignal(MEvent->getHandleRef());
872 #ifdef XPTI_ENABLE_INSTRUMENTATION
873  emitInstrumentation(xpti::trace_task_end, nullptr);
874 #endif
876 }
877 
878 void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
879 #ifdef XPTI_ENABLE_INSTRUMENTATION
880  assert(MType == CommandType::RELEASE && "Expected release command");
881  if (!MTraceEvent)
882  return;
883  // The current command is the target node for all dependencies as the source
884  // nodes have to be completed first before the current node can begin to
885  // execute; these edges model control flow
886  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
887  // We have all the Commands that must be completed before the release
888  // command can be enqueued; here we'll find the command that is an Alloca
889  // with the same SYCLMemObject address and create a dependency line (edge)
890  // between them in our sematic modeling
891  for (auto &Item : DepList) {
892  if (Item->MTraceEvent && Item->MAddress == MAddress) {
893  xpti::utils::StringHelper SH;
894  std::string AddressStr = SH.addressAsString<void *>(MAddress);
895  std::string TypeString =
896  "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
897 
898  // Create an edge with the dependent buffer address being one of the
899  // properties of the edge
900  xpti::payload_t p(TypeString.c_str(), MAddress);
901  uint64_t EdgeInstanceNo;
902  xpti_td *EdgeEvent =
903  xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
904  xpti_at::active, &EdgeInstanceNo);
905  if (EdgeEvent) {
906  xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
907  EdgeEvent->target_id = TgtTraceEvent->unique_id;
908  EdgeEvent->source_id = SrcTraceEvent->unique_id;
909  xpti::addMetadata(EdgeEvent, "memory_object",
910  reinterpret_cast<size_t>(MAddress));
911  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
912  detail::GSYCLGraphEvent, EdgeEvent,
913  EdgeInstanceNo, nullptr);
914  }
915  }
916  }
917 #endif
918 }
919 
920 const char *Command::getBlockReason() const {
921  switch (MBlockReason) {
923  return "A Buffer is locked by the host accessor";
925  return "Blocked by host task";
926  }
927 
928  return "Unknown block reason";
929 }
930 
932 #ifdef XPTI_ENABLE_INSTRUMENTATION
933  if (!xptiTraceEnabled())
934  return;
935 
937  auto TData = Tls.query();
938  if (TData.fileName())
939  MSubmissionFileName = TData.fileName();
940  if (TData.functionName())
941  MSubmissionFunctionName = TData.functionName();
942  if (MSubmissionFileName.size() || MSubmissionFunctionName.size())
945  (int)TData.lineNumber(), (int)TData.columnNumber()};
946 #endif
947 }
948 
950  Requirement Req,
951  AllocaCommandBase *LinkedAllocaCmd,
952  bool IsConst)
953  : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
954  MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
955  MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
958 }
959 
961 #ifdef XPTI_ENABLE_INSTRUMENTATION
962  if (!xptiTraceEnabled())
963  return;
964  // Create a payload with the command name and an event using this payload to
965  // emit a node_create
968  // Set the relevant meta data properties for this command
969  if (MTraceEvent && MFirstInstance) {
970  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
971  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
972  xpti::addMetadata(TE, "sycl_device_type",
973  deviceToString(MQueue->get_device()));
974  xpti::addMetadata(TE, "sycl_device_name",
975  getSyclObjImpl(MQueue->get_device())->getDeviceName());
976  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
977  }
978 #endif
979 }
980 
981 bool AllocaCommandBase::producesPiEvent() const { return false; }
982 
983 bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }
984 
985 bool AllocaCommandBase::readyForCleanup() const { return false; }
986 
988  bool InitFromUserData,
989  AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
990  : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
991  LinkedAllocaCmd, IsConst),
992  MInitFromUserData(InitFromUserData) {
993  // Node event must be created before the dependent edge is added to this
994  // node, so this call must be before the addDep() call.
996  // "Nothing to depend on"
997  std::vector<Command *> ToCleanUp;
998  Command *ConnectionCmd =
999  addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
1000  assert(ConnectionCmd == nullptr);
1001  assert(ToCleanUp.empty());
1002  (void)ConnectionCmd;
1003 }
1004 
1006 #ifdef XPTI_ENABLE_INSTRUMENTATION
1007  if (!xptiTraceEnabled())
1008  return;
1009 
1010  // Only if it is the first event, we emit a node create event
1011  if (MFirstInstance) {
1013  }
1014 #endif
1015 }
1016 
1017 pi_int32 AllocaCommand::enqueueImp() {
1019  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1020 
1021  RT::PiEvent &Event = MEvent->getHandleRef();
1022 
1023  void *HostPtr = nullptr;
1024  if (!MIsLeaderAlloca) {
1025 
1026  if (MQueue->is_host()) {
1027  // Do not need to make allocation if we have a linked device allocation
1028  Command::waitForEvents(MQueue, EventImpls, Event);
1029 
1030  return PI_SUCCESS;
1031  }
1032  HostPtr = MLinkedAllocaCmd->getMemAllocation();
1033  }
1034  // TODO: Check if it is correct to use std::move on stack variable and
1035  // delete it RawEvents below.
1037  MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr,
1038  std::move(EventImpls), Event);
1039 
1040  return PI_SUCCESS;
1041 }
1042 
1043 void AllocaCommand::printDot(std::ostream &Stream) const {
1044  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1045 
1046  Stream << "ID = " << this << "\\n";
1047  Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n";
1048  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1049  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
1050  Stream << "\"];" << std::endl;
1051 
1052  for (const auto &Dep : MDeps) {
1053  if (Dep.MDepCommand == nullptr)
1054  continue;
1055  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1056  << " [ label = \"Access mode: "
1057  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1058  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1059  << std::endl;
1060  }
1061 }
1062 
1064  AllocaCommandBase *ParentAlloca,
1065  std::vector<Command *> &ToEnqueue,
1066  std::vector<Command *> &ToCleanUp)
1067  : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
1068  std::move(Req),
1069  /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
1070  MParentAlloca(ParentAlloca) {
1071  // Node event must be created before the dependent edge
1072  // is added to this node, so this call must be before
1073  // the addDep() call.
1075  Command *ConnectionCmd = addDep(
1076  DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
1077  if (ConnectionCmd)
1078  ToEnqueue.push_back(ConnectionCmd);
1079 }
1080 
1082 #ifdef XPTI_ENABLE_INSTRUMENTATION
1083  if (!xptiTraceEnabled())
1084  return;
1085 
1086  // Only if it is the first event, we emit a node create event and any meta
1087  // data that is available for the command
1088  if (MFirstInstance) {
1089  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1090  xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
1091  xpti::addMetadata(TE, "access_range_start",
1092  this->MRequirement.MAccessRange[0]);
1093  xpti::addMetadata(TE, "access_range_end",
1094  this->MRequirement.MAccessRange[1]);
1096  }
1097 #endif
1098 }
1099 
1101  // In some cases parent`s memory allocation might change (e.g., after
1102  // map/unmap operations). If parent`s memory allocation changes, sub-buffer
1103  // memory allocation should be changed as well.
1104  if (MQueue->is_host()) {
1105  return static_cast<void *>(
1106  static_cast<char *>(MParentAlloca->getMemAllocation()) +
1108  }
1109  return MMemAllocation;
1110 }
1111 
1112 pi_int32 AllocaSubBufCommand::enqueueImp() {
1114  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1115  RT::PiEvent &Event = MEvent->getHandleRef();
1116 
1118  MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(),
1120  MRequirement.MAccessRange, std::move(EventImpls), Event);
1121 
1123  MMemAllocation);
1124  return PI_SUCCESS;
1125 }
1126 
1127 void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
1128  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1129 
1130  Stream << "ID = " << this << "\\n";
1131  Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device())
1132  << "\\n";
1133  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1134  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
1135  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
1136  Stream << "\"];" << std::endl;
1137 
1138  for (const auto &Dep : MDeps) {
1139  if (Dep.MDepCommand == nullptr)
1140  continue;
1141  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1142  << " [ label = \"Access mode: "
1143  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1144  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1145  << std::endl;
1146  }
1147 }
1148 
1150  : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
1152 }
1153 
1155 #ifdef XPTI_ENABLE_INSTRUMENTATION
1156  if (!xptiTraceEnabled())
1157  return;
1158  // Create a payload with the command name and an event using this payload to
1159  // emit a node_create
1160  MAddress = MAllocaCmd->getSYCLMemObj();
1162 
1163  if (MFirstInstance) {
1164  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1165  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1166  xpti::addMetadata(TE, "sycl_device_type",
1167  deviceToString(MQueue->get_device()));
1168  xpti::addMetadata(TE, "sycl_device_name",
1169  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1170  xpti::addMetadata(TE, "allocation_type",
1171  commandToName(MAllocaCmd->getType()));
1173  }
1174 #endif
1175 }
1176 
1177 pi_int32 ReleaseCommand::enqueueImp() {
1179  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1180  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1181  bool SkipRelease = false;
1182 
1183  // On host side we only allocate memory for full buffers.
1184  // Thus, deallocating sub buffers leads to double memory freeing.
1185  SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF;
1186 
1187  const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
1188  bool NeedUnmap = false;
1189  if (MAllocaCmd->MLinkedAllocaCmd) {
1190 
1191  // When releasing one of the "linked" allocations special rules take
1192  // place:
1193  // 1. Device allocation should always be released.
1194  // 2. Host allocation should be released if host allocation is "leader".
1195  // 3. Device alloca in the pair should be in active state in order to be
1196  // correctly released.
1197 
1198  // There is no actual memory allocation if a host alloca command is
1199  // created being linked to a device allocation.
1200  SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;
1201 
1202  NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
1203  }
1204 
1205  if (NeedUnmap) {
1206  const QueueImplPtr &Queue = CurAllocaIsHost
1207  ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
1208  : MAllocaCmd->getQueue();
1209 
1210  EventImplPtr UnmapEventImpl(new event_impl(Queue));
1211  UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
1212  UnmapEventImpl->setStateIncomplete();
1213  RT::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();
1214 
1215  void *Src = CurAllocaIsHost
1216  ? MAllocaCmd->getMemAllocation()
1217  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1218 
1219  void *Dst = !CurAllocaIsHost
1220  ? MAllocaCmd->getMemAllocation()
1221  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1222 
1223  MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
1224  RawEvents, UnmapEvent);
1225 
1226  std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
1227  EventImpls.clear();
1228  EventImpls.push_back(UnmapEventImpl);
1229  }
1230  RT::PiEvent &Event = MEvent->getHandleRef();
1231  if (SkipRelease)
1232  Command::waitForEvents(MQueue, EventImpls, Event);
1233  else {
1235  MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(),
1236  MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event);
1237  }
1238  return PI_SUCCESS;
1239 }
1240 
1241 void ReleaseCommand::printDot(std::ostream &Stream) const {
1242  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1243 
1244  Stream << "ID = " << this << " ; ";
1245  Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n";
1246  Stream << " Alloca : " << MAllocaCmd << "\\n";
1247  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1248  Stream << "\"];" << std::endl;
1249 
1250  for (const auto &Dep : MDeps) {
1251  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1252  << " [ label = \"Access mode: "
1253  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1254  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1255  << std::endl;
1256  }
1257 }
1258 
1259 bool ReleaseCommand::producesPiEvent() const { return false; }
1260 
1261 bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
1262 
1263 bool ReleaseCommand::readyForCleanup() const { return false; }
1264 
1266  void **DstPtr, QueueImplPtr Queue,
1267  access::mode MapMode)
1268  : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
1269  MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
1270  MMapMode(MapMode) {
1272 }
1273 
1275 #ifdef XPTI_ENABLE_INSTRUMENTATION
1276  if (!xptiTraceEnabled())
1277  return;
1278  // Create a payload with the command name and an event using this payload to
1279  // emit a node_create
1280  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1282 
1283  if (MFirstInstance) {
1284  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1285  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1286  xpti::addMetadata(TE, "sycl_device_type",
1287  deviceToString(MQueue->get_device()));
1288  xpti::addMetadata(TE, "sycl_device_name",
1289  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1290  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1292  }
1293 #endif
1294 }
1295 
1296 pi_int32 MapMemObject::enqueueImp() {
1298  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1299  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1300  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1301 
1302  RT::PiEvent &Event = MEvent->getHandleRef();
1303  *MDstPtr = MemoryManager::map(
1304  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
1305  MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1306  MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);
1307 
1308  return PI_SUCCESS;
1309 }
1310 
1311 void MapMemObject::printDot(std::ostream &Stream) const {
1312  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1313 
1314  Stream << "ID = " << this << " ; ";
1315  Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n";
1316 
1317  Stream << "\"];" << std::endl;
1318 
1319  for (const auto &Dep : MDeps) {
1320  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1321  << " [ label = \"Access mode: "
1322  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1323  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1324  << std::endl;
1325  }
1326 }
1327 
1329  void **SrcPtr, QueueImplPtr Queue)
1330  : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
1331  MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
1333 }
1334 
1336 #ifdef XPTI_ENABLE_INSTRUMENTATION
1337  if (!xptiTraceEnabled())
1338  return;
1339  // Create a payload with the command name and an event using this payload to
1340  // emit a node_create
1341  MAddress = MDstAllocaCmd->getSYCLMemObj();
1343 
1344  if (MFirstInstance) {
1345  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1346  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1347  xpti::addMetadata(TE, "sycl_device_type",
1348  deviceToString(MQueue->get_device()));
1349  xpti::addMetadata(TE, "sycl_device_name",
1350  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1351  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1353  }
1354 #endif
1355 }
1356 
1358  // TODO remove this workaround once the batching issue is addressed in Level
1359  // Zero plugin.
1360  // Consider the following scenario on Level Zero:
1361  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1362  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1363  // 3. queueA.wait().
1364  // 4. queueB.wait().
1365  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1366  // dependencies (i.e. they were waited for prior to enqueueing any command
1367  // that's dependent on them). This allowed Level Zero plugin to detect that
1368  // each queue is idle on steps 1/2 and submit the command list right away.
1369  // This is no longer the case since we started passing these dependencies in
1370  // an event waitlist and Level Zero plugin attempts to batch these commands,
1371  // so the execution of kernel B starts only on step 4. This workaround
1372  // restores the old behavior in this case until this is resolved.
1373  return MQueue->getPlugin().getBackend() != backend::ext_oneapi_level_zero ||
1374  MEvent->getHandleRef() != nullptr;
1375 }
1376 
1377 pi_int32 UnMapMemObject::enqueueImp() {
1379  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1380  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1381  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1382 
1383  RT::PiEvent &Event = MEvent->getHandleRef();
1384  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
1385  MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
1386  std::move(RawEvents), Event);
1387 
1388  return PI_SUCCESS;
1389 }
1390 
1391 void UnMapMemObject::printDot(std::ostream &Stream) const {
1392  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1393 
1394  Stream << "ID = " << this << " ; ";
1395  Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n";
1396 
1397  Stream << "\"];" << std::endl;
1398 
1399  for (const auto &Dep : MDeps) {
1400  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1401  << " [ label = \"Access mode: "
1402  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1403  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1404  << std::endl;
1405  }
1406 }
1407 
1409  AllocaCommandBase *SrcAllocaCmd,
1410  Requirement DstReq,
1411  AllocaCommandBase *DstAllocaCmd,
1412  QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
1413  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1414  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1415  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
1416  MDstAllocaCmd(DstAllocaCmd) {
1417  if (!MSrcQueue->is_host()) {
1418  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1419  }
1420 
1421  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
1422  MEvent->setWorkerQueue(MWorkerQueue);
1423 
1425 }
1426 
1428 #ifdef XPTI_ENABLE_INSTRUMENTATION
1429  if (!xptiTraceEnabled())
1430  return;
1431  // Create a payload with the command name and an event using this payload to
1432  // emit a node_create
1433  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1435 
1436  if (MFirstInstance) {
1437  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1438  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1439  deviceToID(MQueue->get_device()));
1440  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1441  deviceToString(MQueue->get_device()));
1442  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1443  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1444  xpti::addMetadata(CmdTraceEvent, "memory_object",
1445  reinterpret_cast<size_t>(MAddress));
1446  xpti::addMetadata(CmdTraceEvent, "copy_from",
1447  reinterpret_cast<size_t>(
1448  getSyclObjImpl(MSrcQueue->get_device()).get()));
1449  xpti::addMetadata(
1450  CmdTraceEvent, "copy_to",
1451  reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
1453  }
1454 #endif
1455 }
1456 
1458  return getWorkerQueue()->getContextImplPtr();
1459 }
1460 
1462  // TODO remove this workaround once the batching issue is addressed in Level
1463  // Zero plugin.
1464  // Consider the following scenario on Level Zero:
1465  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1466  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1467  // 3. queueA.wait().
1468  // 4. queueB.wait().
1469  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1470  // dependencies (i.e. they were waited for prior to enqueueing any command
1471  // that's dependent on them). This allowed Level Zero plugin to detect that
1472  // each queue is idle on steps 1/2 and submit the command list right away.
1473  // This is no longer the case since we started passing these dependencies in
1474  // an event waitlist and Level Zero plugin attempts to batch these commands,
1475  // so the execution of kernel B starts only on step 4. This workaround
1476  // restores the old behavior in this case until this is resolved.
1477  return MQueue->is_host() ||
1478  MQueue->getPlugin().getBackend() != backend::ext_oneapi_level_zero ||
1479  MEvent->getHandleRef() != nullptr;
1480 }
1481 
1482 pi_int32 MemCpyCommand::enqueueImp() {
1484  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1485 
1486  RT::PiEvent &Event = MEvent->getHandleRef();
1487 
1488  auto RawEvents = getPiEvents(EventImpls);
1489  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1490 
1492  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1493  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1494  MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
1495  MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
1496  MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event);
1497 
1498  return PI_SUCCESS;
1499 }
1500 
1501 void MemCpyCommand::printDot(std::ostream &Stream) const {
1502  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1503 
1504  Stream << "ID = " << this << " ; ";
1505  Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n";
1506  Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
1507  << "\\n";
1508  Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
1509  << "\\n";
1510 
1511  Stream << "\"];" << std::endl;
1512 
1513  for (const auto &Dep : MDeps) {
1514  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1515  << " [ label = \"Access mode: "
1516  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1517  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1518  << std::endl;
1519  }
1520 }
1521 
1522 AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
1523  for (const DepDesc &Dep : MDeps) {
1524  if (Dep.MDepRequirement == Req)
1525  return Dep.MAllocaCmd;
1526  }
1527  throw runtime_error("Alloca for command not found",
1528  PI_ERROR_INVALID_OPERATION);
1529 }
1530 
1531 std::vector<std::shared_ptr<const void>>
1533  if (MCommandGroup->getType() == CG::Kernel)
1534  return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1535  return {};
1536 }
1537 
1539  if (MCommandGroup->getType() == CG::Kernel)
1540  ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
1541 }
1542 
1543 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1545  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1546  RT::PiEvent &Event = MEvent->getHandleRef();
1547  Command::waitForEvents(MQueue, EventImpls, Event);
1548 
1549  assert(MSrcAllocaCmd && "Expected valid alloca command");
1550  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1551  assert(MDstPtr && "Expected valid target pointer");
1552  *MDstPtr = MSrcAllocaCmd->getMemAllocation();
1553 
1554  return PI_SUCCESS;
1555 }
1556 
1557 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1558  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1559 
1560  Stream << "ID = " << this << "\\n";
1561  Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n";
1562  bool IsReqOnBuffer =
1563  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1564  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1565  if (IsReqOnBuffer)
1566  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1567  << "\\n";
1568 
1569  Stream << "\"];" << std::endl;
1570 
1571  for (const auto &Dep : MDeps) {
1572  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1573  << " [ label = \"Access mode: "
1574  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1575  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1576  << std::endl;
1577  }
1578 }
1579 
1581  AllocaCommandBase *SrcAllocaCmd,
1582  Requirement DstReq, void **DstPtr,
1583  QueueImplPtr SrcQueue,
1584  QueueImplPtr DstQueue)
1585  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1586  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1587  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1588  if (!MSrcQueue->is_host()) {
1589  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1590  }
1591 
1592  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
1593  MEvent->setWorkerQueue(MWorkerQueue);
1594 
1596 }
1597 
1599 #ifdef XPTI_ENABLE_INSTRUMENTATION
1600  if (!xptiTraceEnabled())
1601  return;
1602  // Create a payload with the command name and an event using this payload to
1603  // emit a node_create
1604  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1606 
1607  if (MFirstInstance) {
1608  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1609  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1610  deviceToID(MQueue->get_device()));
1611  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1612  deviceToString(MQueue->get_device()));
1613  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1614  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1615  xpti::addMetadata(CmdTraceEvent, "memory_object",
1616  reinterpret_cast<size_t>(MAddress));
1617  xpti::addMetadata(CmdTraceEvent, "copy_from",
1618  reinterpret_cast<size_t>(
1619  getSyclObjImpl(MSrcQueue->get_device()).get()));
1620  xpti::addMetadata(
1621  CmdTraceEvent, "copy_to",
1622  reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
1624  }
1625 #endif
1626 }
1627 
1629  return getWorkerQueue()->getContextImplPtr();
1630 }
1631 
1632 pi_int32 MemCpyCommandHost::enqueueImp() {
1633  const QueueImplPtr &Queue = getWorkerQueue();
1635  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1636  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1637 
1638  RT::PiEvent &Event = MEvent->getHandleRef();
1639  // Omit copying if mode is discard one.
1640  // TODO: Handle this at the graph building time by, for example, creating
1641  // empty node instead of memcpy.
1644  Command::waitForEvents(Queue, EventImpls, Event);
1645 
1646  return PI_SUCCESS;
1647  }
1648 
1649  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1651  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1652  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1653  MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
1654  MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
1655  MDstReq.MElemSize, std::move(RawEvents), Event);
1656 
1657  return PI_SUCCESS;
1658 }
1659 
1661  : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
1663 }
1664 
1665 pi_int32 EmptyCommand::enqueueImp() {
1667  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
1668 
1669  return PI_SUCCESS;
1670 }
1671 
1673  const Requirement *Req) {
1674  const Requirement &ReqRef = *Req;
1675  MRequirements.emplace_back(ReqRef);
1676  const Requirement *const StoredReq = &MRequirements.back();
1677 
1678  // EmptyCommand is always host one, so we believe that result of addDep is
1679  // nil
1680  std::vector<Command *> ToCleanUp;
1681  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1682  assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
1683  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1684  "there's a connection command");
1685  (void)Cmd;
1686 }
1687 
1689 #ifdef XPTI_ENABLE_INSTRUMENTATION
1690  if (!xptiTraceEnabled())
1691  return;
1692  // Create a payload with the command name and an event using this payload to
1693  // emit a node_create
1694  if (MRequirements.empty())
1695  return;
1696 
1697  Requirement &Req = *MRequirements.begin();
1698 
1699  MAddress = Req.MSYCLMemObj;
1701 
1702  if (MFirstInstance) {
1703  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1704  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1705  deviceToID(MQueue->get_device()));
1706  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1707  deviceToString(MQueue->get_device()));
1708  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1709  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1710  xpti::addMetadata(CmdTraceEvent, "memory_object",
1711  reinterpret_cast<size_t>(MAddress));
1713  }
1714 #endif
1715 }
1716 
1717 void EmptyCommand::printDot(std::ostream &Stream) const {
1718  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1719 
1720  Stream << "ID = " << this << "\\n";
1721  Stream << "EMPTY NODE"
1722  << "\\n";
1723 
1724  Stream << "\"];" << std::endl;
1725 
1726  for (const auto &Dep : MDeps) {
1727  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1728  << " [ label = \"Access mode: "
1729  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1730  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1731  << std::endl;
1732  }
1733 }
1734 
1735 bool EmptyCommand::producesPiEvent() const { return false; }
1736 
1737 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1738  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1739 
1740  Stream << "ID = " << this << "\\n";
1741  Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n";
1742 
1743  Stream << "\"];" << std::endl;
1744 
1745  for (const auto &Dep : MDeps) {
1746  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1747  << " [ label = \"Access mode: "
1748  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1749  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1750  << std::endl;
1751  }
1752 }
1753 
1755  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1756  void **DstPtr)
1757  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1758  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1759 
1761 }
1762 
1764 #ifdef XPTI_ENABLE_INSTRUMENTATION
1765  if (!xptiTraceEnabled())
1766  return;
1767  // Create a payload with the command name and an event using this payload to
1768  // emit a node_create
1769  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1771 
1772  if (MFirstInstance) {
1773  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1774  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1775  deviceToID(MQueue->get_device()));
1776  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1777  deviceToString(MQueue->get_device()));
1778  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1779  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1780  xpti::addMetadata(CmdTraceEvent, "memory_object",
1781  reinterpret_cast<size_t>(MAddress));
1783  }
1784 #endif
1785 }
1786 
1787 static std::string cgTypeToString(detail::CG::CGTYPE Type) {
1788  switch (Type) {
1789  case detail::CG::Kernel:
1790  return "Kernel";
1791  break;
1793  return "update_host";
1794  break;
1795  case detail::CG::Fill:
1796  return "fill";
1797  break;
1799  return "copy acc to acc";
1800  break;
1802  return "copy acc to ptr";
1803  break;
1805  return "copy ptr to acc";
1806  break;
1807  case detail::CG::CopyUSM:
1808  return "copy usm";
1809  break;
1810  case detail::CG::FillUSM:
1811  return "fill usm";
1812  break;
1814  return "prefetch usm";
1815  break;
1817  return "host task";
1818  break;
1819  case detail::CG::Copy2DUSM:
1820  return "copy 2d usm";
1821  break;
1822  case detail::CG::Fill2DUSM:
1823  return "fill 2d usm";
1824  break;
1826  return "memset 2d usm";
1827  break;
1829  return "copy to device_global";
1830  break;
1832  return "copy from device_global";
1833  break;
1834  default:
1835  return "unknown";
1836  break;
1837  }
1838 }
1839 
1840 ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
1841  QueueImplPtr Queue)
1842  : Command(CommandType::RUN_CG, std::move(Queue)),
1843  MCommandGroup(std::move(CommandGroup)) {
1844  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
1845  MEvent->setSubmittedQueue(
1846  static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
1847  }
1848 
1850 }
1851 
1853 #ifdef XPTI_ENABLE_INSTRUMENTATION
1854  if (!xptiTraceEnabled())
1855  return;
1856  // Create a payload with the command name and an event using this payload to
1857  // emit a node_create
1858  bool HasSourceInfo = false;
1859  std::string KernelName;
1860  std::optional<bool> FromSource;
1861  switch (MCommandGroup->getType()) {
1862  case detail::CG::Kernel: {
1863  auto KernelCG =
1864  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1865 
1866  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
1867  FromSource = true;
1868  pi_kernel KernelHandle = KernelCG->MSyclKernel->getHandleRef();
1869  MAddress = KernelHandle;
1870  KernelName = MCommandGroup->MFunctionName;
1871  } else {
1872  FromSource = false;
1873  KernelName = demangleKernelName(KernelCG->getKernelName());
1874  }
1875  } break;
1876  default:
1877  KernelName = cgTypeToString(MCommandGroup->getType());
1878  break;
1879  }
1880  std::string CommandType = commandToNodeType(MType);
1881  // Get source file, line number information from the CommandGroup object
1882  // and create payload using name, address, and source info
1883  //
1884  // On Windows, since the support for builtin functions is not available in
1885  // MSVC, the MFileName, MLine will be set to nullptr and "0" respectively.
1886  // Handle this condition explicitly here.
1887  xpti::payload_t Payload;
1888  if (!MCommandGroup->MFileName.empty()) {
1889  // File name has a valid string
1890  Payload =
1891  xpti::payload_t(KernelName.c_str(), MCommandGroup->MFileName.c_str(),
1892  MCommandGroup->MLine, MCommandGroup->MColumn, MAddress);
1893  HasSourceInfo = true;
1894  } else if (MAddress) {
1895  // We have a valid function name and an address
1896  Payload = xpti::payload_t(KernelName.c_str(), MAddress);
1897  } else {
1898  // In any case, we will have a valid function name and we'll use that to
1899  // create the hash
1900  Payload = xpti::payload_t(KernelName.c_str());
1901  }
1902 
1903  uint64_t CGKernelInstanceNo;
1904  // Create event using the payload
1905  xpti_td *CmdTraceEvent =
1906  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
1907  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
1908 
1909  if (CmdTraceEvent) {
1910  MInstanceID = CGKernelInstanceNo;
1911  MTraceEvent = (void *)CmdTraceEvent;
1912  // If we are seeing this event again, then the instance ID will be greater
1913  // than 1; in this case, we will skip sending a notification to create a
1914  // node as this node has already been created.
1915  if (CGKernelInstanceNo > 1)
1916  return;
1917 
1918  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1919  deviceToID(MQueue->get_device()));
1920  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1921  deviceToString(MQueue->get_device()));
1922  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1923  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1924  if (!KernelName.empty()) {
1925  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
1926  }
1927  if (FromSource.has_value()) {
1928  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
1929  }
1930  if (HasSourceInfo) {
1931  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
1932  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name",
1933  MCommandGroup->MFileName);
1934  xpti::addMetadata(CmdTraceEvent, "sym_line_no", MCommandGroup->MLine);
1935  xpti::addMetadata(CmdTraceEvent, "sym_column_no", MCommandGroup->MColumn);
1936  }
1937 
1938  if (MCommandGroup->getType() == detail::CG::Kernel) {
1939  auto KernelCG =
1940  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1941  auto &NDRDesc = KernelCG->MNDRDesc;
1942  std::vector<ArgDesc> Args;
1943 
1944  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1945  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1946  };
1947  RT::PiProgram Program = nullptr;
1948  RT::PiKernel Kernel = nullptr;
1949  std::mutex *KernelMutex = nullptr;
1950 
1951  std::shared_ptr<kernel_impl> SyclKernelImpl;
1952  std::shared_ptr<device_image_impl> DeviceImageImpl;
1953  auto KernelBundleImplPtr = KernelCG->getKernelBundle();
1954 
1955  // Use kernel_bundle if available unless it is interop.
1956  // Interop bundles can't be used in the first branch, because the
1957  // kernels in interop kernel bundles (if any) do not have kernel_id and
1958  // can therefore not be looked up, but since they are self-contained
1959  // they can simply be launched directly.
1960  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1961  kernel_id KernelID =
1963  KernelCG->MKernelName);
1964  kernel SyclKernel =
1965  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1966  Program = detail::getSyclObjImpl(SyclKernel)
1967  ->getDeviceImage()
1968  ->get_program_ref();
1969  } else if (nullptr != KernelCG->MSyclKernel) {
1970  auto SyclProg = KernelCG->MSyclKernel->getProgramImpl();
1971  Program = SyclProg->getHandleRef();
1972  } else {
1973  std::tie(Kernel, KernelMutex, Program) =
1975  KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(),
1976  MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr);
1977  }
1978 
1979  ProgramManager::KernelArgMask EliminatedArgMask;
1980  if (nullptr == KernelCG->MSyclKernel ||
1981  !KernelCG->MSyclKernel->isCreatedFromSource()) {
1982  EliminatedArgMask =
1984  KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName);
1985  }
1986 
1987  applyFuncOnFilteredArgs(EliminatedArgMask, KernelCG->MArgs, FilterArgs);
1988 
1989  xpti::offload_kernel_enqueue_data_t KernelData{
1990  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1991  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1992  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1993  NDRDesc.GlobalOffset[2]},
1994  Args.size()};
1995  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
1996  for (size_t i = 0; i < Args.size(); i++) {
1997  std::string Prefix("arg");
1998  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
1999  Args[i].MSize, Args[i].MIndex};
2000  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
2001  }
2002  }
2003 
2004  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
2005  detail::GSYCLGraphEvent, CmdTraceEvent,
2006  CGKernelInstanceNo,
2007  static_cast<const void *>(CommandType.c_str()));
2008  }
2009 #endif
2010 }
2011 
2012 void ExecCGCommand::printDot(std::ostream &Stream) const {
2013  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2014 
2015  Stream << "ID = " << this << "\\n";
2016  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
2017 
2018  switch (MCommandGroup->getType()) {
2019  case detail::CG::Kernel: {
2020  auto KernelCG =
2021  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2022  Stream << "Kernel name: ";
2023  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2024  Stream << "created from source";
2025  else
2026  Stream << demangleKernelName(KernelCG->getKernelName());
2027  Stream << "\\n";
2028  break;
2029  }
2030  default:
2031  Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
2032  break;
2033  }
2034 
2035  Stream << "\"];" << std::endl;
2036 
2037  for (const auto &Dep : MDeps) {
2038  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2039  << " [ label = \"Access mode: "
2040  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2041  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2042  << std::endl;
2043  }
2044 }
2045 
2046 // SYCL has a parallel_for_work_group variant where the only NDRange
2047 // characteristics set by a user is the number of work groups. This does not
2048 // map to the OpenCL clEnqueueNDRangeAPI, which requires global work size to
2049 // be set as well. This function determines local work size based on the
2050 // device characteristics and the number of work groups requested by the user,
2051 // then calculates the global work size. SYCL specification (from 4.8.5.3):
2052 // The member function handler::parallel_for_work_group is parameterized by
2053 // the number of work - groups, such that the size of each group is chosen by
2054 // the runtime, or by the number of work - groups and number of work - items
2055 // for users who need more control.
2057  const device_impl &DeviceImpl) {
2058  if (NDR.GlobalSize[0] != 0)
2059  return; // GlobalSize is set - no need to adjust
2060  // check the prerequisites:
2061  assert(NDR.LocalSize[0] == 0);
2062  // TODO might be good to cache this info together with the kernel info to
2063  // avoid get_kernel_work_group_info on every kernel run
2065  sycl::info::kernel_device_specific::compile_work_group_size>(
2066  Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
2067 
2068  if (WGSize[0] == 0) {
2069  WGSize = {1, 1, 1};
2070  }
2071  NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
2072 }
2073 
2074 // We have the following mapping between dimensions with SPIR-V builtins:
2075 // 1D: id[0] -> x
2076 // 2D: id[0] -> y, id[1] -> x
2077 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
2078 // So in order to ensure the correctness we update all the kernel
2079 // parameters accordingly.
2080 // Initially we keep the order of NDRDescT as it provided by the user, this
2081 // simplifies overall handling and do the reverse only when
2082 // the kernel is enqueued.
2084  if (NDR.Dims > 1) {
2085  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
2086  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
2087  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
2088  }
2089 }
2090 
2092  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
2093  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2094  RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector<RT::PiEvent> &RawEvents,
2095  RT::PiEvent *OutEvent,
2096  const ProgramManager::KernelArgMask &EliminatedArgMask,
2097  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2098  const detail::plugin &Plugin = Queue->getPlugin();
2099 
2100  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2101  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
2102  switch (Arg.MType) {
2104  break;
2106  Requirement *Req = (Requirement *)(Arg.MPtr);
2107  if (Req->MAccessRange == range<3>({0, 0, 0}))
2108  break;
2109  assert(getMemAllocationFunc != nullptr &&
2110  "We should have caught this earlier.");
2111 
2112  RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req);
2113  if (Plugin.getBackend() == backend::opencl) {
2114  Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex,
2115  sizeof(RT::PiMem), &MemArg);
2116  } else {
2117  Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
2118  &MemArg);
2119  }
2120  break;
2121  }
2123  Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
2124  Arg.MPtr);
2125  break;
2126  }
2128  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2129  RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
2130  ->getOrCreateSampler(Queue->get_context());
2131  Plugin.call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
2132  &Sampler);
2133  break;
2134  }
2136  Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
2137  Arg.MSize, Arg.MPtr);
2138  break;
2139  }
2141  if (Queue->is_host()) {
2142  throw sycl::feature_not_supported(
2143  "SYCL2020 specialization constants are not yet supported on host "
2144  "device",
2145  PI_ERROR_INVALID_OPERATION);
2146  }
2147  assert(DeviceImageImpl != nullptr);
2148  RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref();
2149  // Avoid taking an address of nullptr
2150  RT::PiMem *SpecConstsBufferArg =
2151  SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2152  Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
2153  SpecConstsBufferArg);
2154  break;
2155  }
2157  throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE);
2158  break;
2159  }
2160  };
2161 
2162  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2163 
2164  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2165 
2166  // Remember this information before the range dimensions are reversed
2167  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2168 
2170 
2171  size_t RequiredWGSize[3] = {0, 0, 0};
2172  size_t *LocalSize = nullptr;
2173 
2174  if (HasLocalSize)
2175  LocalSize = &NDRDesc.LocalSize[0];
2176  else {
2177  Plugin.call<PiApiKind::piKernelGetGroupInfo>(
2178  Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2179  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2180  RequiredWGSize, /* param_value_size_ret = */ nullptr);
2181 
2182  const bool EnforcedLocalSize =
2183  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2184  RequiredWGSize[2] != 0);
2185  if (EnforcedLocalSize)
2186  LocalSize = RequiredWGSize;
2187  }
2188 
2189  pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
2190  Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2191  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2192  RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2193  return Error;
2194 }
2195 
2196 // The function initialize accessors and calls lambda.
2197 // The function is used as argument to piEnqueueNativeKernel which requires
2198 // that the passed function takes one void* argument.
2199 void DispatchNativeKernel(void *Blob) {
2200  void **CastedBlob = (void **)Blob;
2201 
2202  std::vector<Requirement *> *Reqs =
2203  static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2204 
2205  std::unique_ptr<HostKernelBase> *HostKernel =
2206  static_cast<std::unique_ptr<HostKernelBase> *>(CastedBlob[1]);
2207 
2208  NDRDescT *NDRDesc = static_cast<NDRDescT *>(CastedBlob[2]);
2209 
2210  // Other value are pointer to the buffers.
2211  void **NextArg = CastedBlob + 3;
2212  for (detail::Requirement *Req : *Reqs)
2213  Req->MData = *(NextArg++);
2214 
2215  (*HostKernel)->call(*NDRDesc, nullptr);
2216 
2217  // The ownership of these objects have been passed to us, need to cleanup
2218  delete Reqs;
2219  delete HostKernel;
2220  delete NDRDesc;
2221 }
2222 
2224  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2225  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2226  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2227  const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle,
2228  std::vector<RT::PiEvent> &RawEvents, RT::PiEvent *OutEvent,
2229  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2230  RT::PiKernelCacheConfig KernelCacheConfig) {
2231 
2232  // Run OpenCL kernel
2233  auto ContextImpl = Queue->getContextImplPtr();
2234  auto DeviceImpl = Queue->getDeviceImplPtr();
2235  RT::PiKernel Kernel = nullptr;
2236  std::mutex *KernelMutex = nullptr;
2237  RT::PiProgram Program = nullptr;
2238 
2239  std::shared_ptr<kernel_impl> SyclKernelImpl;
2240  std::shared_ptr<device_image_impl> DeviceImageImpl;
2241 
2242  // Use kernel_bundle if available unless it is interop.
2243  // Interop bundles can't be used in the first branch, because the kernels
2244  // in interop kernel bundles (if any) do not have kernel_id
2245  // and can therefore not be looked up, but since they are self-contained
2246  // they can simply be launched directly.
2247  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2248  kernel_id KernelID =
2250  kernel SyclKernel =
2251  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2252 
2253  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2254 
2255  Kernel = SyclKernelImpl->getHandleRef();
2256  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2257 
2258  Program = DeviceImageImpl->get_program_ref();
2259 
2260  std::tie(Kernel, KernelMutex) =
2262  KernelBundleImplPtr->get_context(), KernelName,
2263  /*PropList=*/{}, Program);
2264  } else if (nullptr != MSyclKernel) {
2265  assert(MSyclKernel->get_info<info::kernel::context>() ==
2266  Queue->get_context());
2267  Kernel = MSyclKernel->getHandleRef();
2268  auto SyclProg = MSyclKernel->getProgramImpl();
2269  Program = SyclProg->getHandleRef();
2270  if (SyclProg->is_cacheable()) {
2271  RT::PiKernel FoundKernel = nullptr;
2272  std::tie(FoundKernel, KernelMutex, std::ignore) =
2274  OSModuleHandle, ContextImpl, DeviceImpl, KernelName,
2275  SyclProg.get());
2276  assert(FoundKernel == Kernel);
2277  } else {
2278  // Non-cacheable kernels use mutexes from kernel_impls.
2279  // TODO this can still result in a race condition if multiple SYCL
2280  // kernels are created with the same native handle. To address this,
2281  // we need to either store and use a pi_native_handle -> mutex map or
2282  // reuse and return existing SYCL kernels from make_native to avoid
2283  // their duplication in such cases.
2284  KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2285  }
2286  } else {
2287  std::tie(Kernel, KernelMutex, Program) =
2289  OSModuleHandle, ContextImpl, DeviceImpl, KernelName, nullptr);
2290  }
2291 
2292  // We may need more events for the launch, so we make another reference.
2293  std::vector<RT::PiEvent> &EventsWaitList = RawEvents;
2294 
2295  // Initialize device globals associated with this.
2296  std::vector<RT::PiEvent> DeviceGlobalInitEvents =
2297  ContextImpl->initializeDeviceGlobals(Program, Queue);
2298  std::vector<RT::PiEvent> EventsWithDeviceGlobalInits;
2299  if (!DeviceGlobalInitEvents.empty()) {
2300  EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2301  DeviceGlobalInitEvents.size());
2302  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2303  RawEvents.begin(), RawEvents.end());
2304  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2305  DeviceGlobalInitEvents.begin(),
2306  DeviceGlobalInitEvents.end());
2307  EventsWaitList = EventsWithDeviceGlobalInits;
2308  }
2309 
2310  pi_result Error = PI_SUCCESS;
2311  ProgramManager::KernelArgMask EliminatedArgMask;
2312  if (nullptr == MSyclKernel || !MSyclKernel->isCreatedFromSource()) {
2313  EliminatedArgMask =
2315  OSModuleHandle, Program, KernelName);
2316  }
2317  {
2318  assert(KernelMutex);
2319  std::lock_guard<std::mutex> Lock(*KernelMutex);
2320 
2321  // Set SLM/Cache configuration for the kernel if non-default value is
2322  // provided.
2323  if (KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM ||
2324  KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) {
2325  const detail::plugin &Plugin = Queue->getPlugin();
2328  sizeof(RT::PiKernelCacheConfig), &KernelCacheConfig);
2329  }
2330 
2331  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2332  NDRDesc, EventsWaitList, OutEvent,
2333  EliminatedArgMask, getMemAllocationFunc);
2334  }
2335  if (PI_SUCCESS != Error) {
2336  // If we have got non-success error code, let's analyze it to emit nice
2337  // exception explaining what was wrong
2338  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2340  Kernel, NDRDesc);
2341  }
2342 
2343  return PI_SUCCESS;
2344 }
2345 
2346 pi_int32 ExecCGCommand::enqueueImp() {
2347  if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2348  waitForPreparedHostEvents();
2349  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2350  auto RawEvents = getPiEvents(EventImpls);
2351  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2352 
2353  RT::PiEvent *Event = (MQueue->has_discard_events_support() &&
2354  MCommandGroup->MRequirements.size() == 0)
2355  ? nullptr
2356  : &MEvent->getHandleRef();
2357  switch (MCommandGroup->getType()) {
2358 
2359  case CG::CGTYPE::UpdateHost: {
2360  throw runtime_error("Update host should be handled by the Scheduler.",
2361  PI_ERROR_INVALID_OPERATION);
2362  }
2363  case CG::CGTYPE::CopyAccToPtr: {
2364  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2365  Requirement *Req = (Requirement *)Copy->getSrc();
2366  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2367 
2368  MemoryManager::copy(
2369  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2370  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2371  Req->MElemSize, Copy->getDst(),
2372  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2373  Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
2374  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2375 
2376  return PI_SUCCESS;
2377  }
2378  case CG::CGTYPE::CopyPtrToAcc: {
2379  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2380  Requirement *Req = (Requirement *)(Copy->getDst());
2381  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2382 
2383  Scheduler::getInstance().getDefaultHostQueue();
2384 
2385  MemoryManager::copy(
2386  AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2387  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2388  Req->MAccessRange, Req->MAccessRange,
2389  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2390  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2391  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2392 
2393  return PI_SUCCESS;
2394  }
2395  case CG::CGTYPE::CopyAccToAcc: {
2396  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2397  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2398  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2399 
2400  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2401  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2402 
2403  MemoryManager::copy(
2404  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2405  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2406  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2407  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2408  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2409  MEvent->getHandleRef());
2410 
2411  return PI_SUCCESS;
2412  }
2413  case CG::CGTYPE::Fill: {
2414  CGFill *Fill = (CGFill *)MCommandGroup.get();
2415  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2416  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2417 
2418  MemoryManager::fill(
2419  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2420  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2421  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2422  std::move(RawEvents), MEvent->getHandleRef());
2423 
2424  return PI_SUCCESS;
2425  }
2426  case CG::CGTYPE::RunOnHostIntel: {
2427  CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get();
2428 
2429  // piEnqueueNativeKernel takes arguments blob which is passes to user
2430  // function.
2431  // Need the following items to restore context in the host task.
2432  // Make a copy on heap to "dettach" from the command group as it can be
2433  // released before the host task completes.
2434  std::vector<void *> ArgsBlob(HostTask->MArgs.size() + 3);
2435 
2436  std::vector<Requirement *> *CopyReqs =
2437  new std::vector<Requirement *>(HostTask->MRequirements);
2438 
2439  // Not actually a copy, but move. Should be OK as it's not expected that
2440  // MHostKernel will be used elsewhere.
2441  std::unique_ptr<HostKernelBase> *CopyHostKernel =
2442  new std::unique_ptr<HostKernelBase>(std::move(HostTask->MHostKernel));
2443 
2444  NDRDescT *CopyNDRDesc = new NDRDescT(HostTask->MNDRDesc);
2445 
2446  ArgsBlob[0] = (void *)CopyReqs;
2447  ArgsBlob[1] = (void *)CopyHostKernel;
2448  ArgsBlob[2] = (void *)CopyNDRDesc;
2449 
2450  void **NextArg = ArgsBlob.data() + 3;
2451 
2452  if (MQueue->is_host()) {
2453  for (ArgDesc &Arg : HostTask->MArgs) {
2454  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2455 
2456  Requirement *Req = (Requirement *)(Arg.MPtr);
2457  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2458 
2459  *NextArg = AllocaCmd->getMemAllocation();
2460  NextArg++;
2461  }
2462 
2463  if (!RawEvents.empty()) {
2464  // Assuming that the events are for devices to the same Plugin.
2465  const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2466  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2467  }
2468  DispatchNativeKernel((void *)ArgsBlob.data());
2469 
2470  return PI_SUCCESS;
2471  }
2472 
2473  std::vector<pi_mem> Buffers;
2474  // piEnqueueNativeKernel requires additional array of pointers to args
2475  // blob, values that pointers point to are replaced with actual pointers
2476  // to the memory before execution of user function.
2477  std::vector<void *> MemLocs;
2478 
2479  for (ArgDesc &Arg : HostTask->MArgs) {
2480  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2481 
2482  Requirement *Req = (Requirement *)(Arg.MPtr);
2483  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2484  pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation();
2485 
2486  Buffers.push_back(MemArg);
2487  MemLocs.push_back(NextArg);
2488  NextArg++;
2489  }
2490  const detail::plugin &Plugin = MQueue->getPlugin();
2491  pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueNativeKernel>(
2492  MQueue->getHandleRef(), DispatchNativeKernel, (void *)ArgsBlob.data(),
2493  ArgsBlob.size() * sizeof(ArgsBlob[0]), Buffers.size(), Buffers.data(),
2494  const_cast<const void **>(MemLocs.data()), RawEvents.size(),
2495  RawEvents.empty() ? nullptr : RawEvents.data(), Event);
2496 
2497  switch (Error) {
2498  case PI_ERROR_INVALID_OPERATION:
2499  throw sycl::runtime_error(
2500  "Device doesn't support run_on_host_intel tasks.", Error);
2501  case PI_SUCCESS:
2502  return Error;
2503  default:
2504  throw sycl::runtime_error("Enqueueing run_on_host_intel task has failed.",
2505  Error);
2506  }
2507  }
2508  case CG::CGTYPE::Kernel: {
2509  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2510 
2511  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2512  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2513 
2514  if (MQueue->is_host() || (MQueue->getPlugin().getBackend() ==
2516  for (ArgDesc &Arg : Args)
2517  if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2518  Requirement *Req = (Requirement *)(Arg.MPtr);
2519  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2520  Req->MData = AllocaCmd->getMemAllocation();
2521  }
2522  if (!RawEvents.empty()) {
2523  // Assuming that the events are for devices to the same Plugin.
2524  const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2525  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2526  }
2527 
2528  if (MQueue->is_host()) {
2529  ExecKernel->MHostKernel->call(NDRDesc,
2530  getEvent()->getHostProfilingInfo());
2531  } else {
2532  assert(MQueue->getPlugin().getBackend() ==
2534 
2535  MQueue->getPlugin().call<PiApiKind::piEnqueueKernelLaunch>(
2536  nullptr,
2537  reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2538  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2539  &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2540  }
2541 
2542  return PI_SUCCESS;
2543  }
2544 
2545  auto getMemAllocationFunc = [this](Requirement *Req) {
2546  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2547  return AllocaCmd->getMemAllocation();
2548  };
2549 
2550  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2551  ExecKernel->MSyclKernel;
2552  const std::string &KernelName = ExecKernel->MKernelName;
2553  const detail::OSModuleHandle &OSModuleHandle = ExecKernel->MOSModuleHandle;
2554 
2555  if (!Event) {
2556  // Kernel only uses assert if it's non interop one
2557  bool KernelUsesAssert = !(SyclKernel && SyclKernel->isInterop()) &&
2558  ProgramManager::getInstance().kernelUsesAssert(
2559  OSModuleHandle, KernelName);
2560  if (KernelUsesAssert) {
2561  Event = &MEvent->getHandleRef();
2562  }
2563  }
2564 
2565  return enqueueImpKernel(
2566  MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
2567  KernelName, OSModuleHandle, RawEvents, Event, getMemAllocationFunc,
2568  ExecKernel->MKernelCacheConfig);
2569  }
2570  case CG::CGTYPE::CopyUSM: {
2571  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2572  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
2573  Copy->getDst(), std::move(RawEvents), Event);
2574 
2575  return PI_SUCCESS;
2576  }
2577  case CG::CGTYPE::FillUSM: {
2578  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2579  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
2580  Fill->getFill(), std::move(RawEvents), Event);
2581 
2582  return PI_SUCCESS;
2583  }
2584  case CG::CGTYPE::PrefetchUSM: {
2585  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2586  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
2587  Prefetch->getLength(), std::move(RawEvents),
2588  Event);
2589 
2590  return PI_SUCCESS;
2591  }
2592  case CG::CGTYPE::AdviseUSM: {
2593  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2594  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
2595  Advise->getAdvice(), std::move(RawEvents), Event);
2596 
2597  return PI_SUCCESS;
2598  }
2599  case CG::CGTYPE::Copy2DUSM: {
2600  CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
2601  MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
2602  Copy->getDst(), Copy->getDstPitch(),
2603  Copy->getWidth(), Copy->getHeight(),
2604  std::move(RawEvents), Event);
2605  return PI_SUCCESS;
2606  }
2607  case CG::CGTYPE::Fill2DUSM: {
2608  CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
2609  MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
2610  Fill->getWidth(), Fill->getHeight(),
2611  Fill->getPattern(), std::move(RawEvents), Event);
2612  return PI_SUCCESS;
2613  }
2614  case CG::CGTYPE::Memset2DUSM: {
2615  CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
2616  MemoryManager::memset_2d_usm(
2617  Memset->getDst(), MQueue, Memset->getPitch(), Memset->getWidth(),
2618  Memset->getHeight(), Memset->getValue(), std::move(RawEvents), Event);
2619  return PI_SUCCESS;
2620  }
2621  case CG::CGTYPE::CodeplayInteropTask: {
2622  const detail::plugin &Plugin = MQueue->getPlugin();
2623  CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
2624  // Wait for dependencies to complete before dispatching work on the host
2625  // TODO: Use a callback to dispatch the interop task instead of waiting
2626  // for
2627  // the event
2628  if (!RawEvents.empty()) {
2629  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2630  }
2631  std::vector<interop_handler::ReqToMem> ReqMemObjs;
2632  // Extract the Mem Objects for all Requirements, to ensure they are
2633  // available if a user ask for them inside the interop task scope
2634  const auto &HandlerReq = ExecInterop->MRequirements;
2635  std::for_each(
2636  std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement *Req) {
2637  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2638  auto MemArg = reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2639  interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg);
2640  ReqMemObjs.emplace_back(ReqToMem);
2641  });
2642 
2643  std::sort(std::begin(ReqMemObjs), std::end(ReqMemObjs));
2644  interop_handler InteropHandler(std::move(ReqMemObjs), MQueue);
2645  ExecInterop->MInteropTask->call(InteropHandler);
2646  Plugin.call<PiApiKind::piEnqueueEventsWait>(MQueue->getHandleRef(), 0,
2647  nullptr, Event);
2648 
2649  return PI_SUCCESS;
2650  }
2651  case CG::CGTYPE::CodeplayHostTask: {
2652  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
2653 
2654  for (ArgDesc &Arg : HostTask->MArgs) {
2655  switch (Arg.MType) {
2656  case kernel_param_kind_t::kind_accessor: {
2657  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
2658  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2659 
2660  Req->MData = AllocaCmd->getMemAllocation();
2661  break;
2662  }
2663  default:
2664  throw runtime_error("Unsupported arg type", PI_ERROR_INVALID_VALUE);
2665  }
2666  }
2667 
2668  std::vector<interop_handle::ReqToMem> ReqToMem;
2669 
2670  if (HostTask->MHostTask->isInteropTask()) {
2671  // Extract the Mem Objects for all Requirements, to ensure they are
2672  // available if a user asks for them inside the interop task scope
2673  const std::vector<Requirement *> &HandlerReq = HostTask->MRequirements;
2674  auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
2675  const std::vector<AllocaCommandBase *> &AllocaCmds =
2676  Req->MSYCLMemObj->MRecord->MAllocaCommands;
2677 
2678  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
2679  if (HostTask->MQueue->getContextImplPtr() ==
2680  AllocaCmd->getQueue()->getContextImplPtr()) {
2681  auto MemArg =
2682  reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2683  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
2684 
2685  return;
2686  }
2687 
2688  assert(false &&
2689  "Can't get memory object due to no allocation available");
2690 
2691  throw runtime_error(
2692  "Can't get memory object due to no allocation available",
2693  PI_ERROR_INVALID_MEM_OBJECT);
2694  };
2695  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
2696  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
2697  }
2698 
2699  // Host task is executed asynchronously so we should record where it was
2700  // submitted to report exception origin properly.
2701  copySubmissionCodeLocation();
2702 
2703  MQueue->getThreadPool().submit<DispatchHostTask>(
2704  DispatchHostTask(this, std::move(ReqToMem)));
2705 
2706  MShouldCompleteEventIfPossible = false;
2707 
2708  return PI_SUCCESS;
2709  }
2710  case CG::CGTYPE::Barrier: {
2711  if (MQueue->getDeviceImplPtr()->is_host()) {
2712  // NOP for host device.
2713  return PI_SUCCESS;
2714  }
2715  const detail::plugin &Plugin = MQueue->getPlugin();
2717  MQueue->getHandleRef(), 0, nullptr, Event);
2718 
2719  return PI_SUCCESS;
2720  }
2721  case CG::CGTYPE::BarrierWaitlist: {
2722  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
2723  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
2724  std::vector<RT::PiEvent> PiEvents = getPiEventsBlocking(Events);
2725  if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
2726  // NOP for host device.
2727  // If Events is empty, then the barrier has no effect.
2728  return PI_SUCCESS;
2729  }
2730  const detail::plugin &Plugin = MQueue->getPlugin();
2732  MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
2733 
2734  return PI_SUCCESS;
2735  }
2736  case CG::CGTYPE::CopyToDeviceGlobal: {
2737  CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
2738  MemoryManager::copy_to_device_global(
2739  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
2740  Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
2741  Copy->getOSModuleHandle(), std::move(RawEvents), Event);
2742 
2743  return CL_SUCCESS;
2744  }
2745  case CG::CGTYPE::CopyFromDeviceGlobal: {
2746  CGCopyFromDeviceGlobal *Copy =
2747  (CGCopyFromDeviceGlobal *)MCommandGroup.get();
2748  MemoryManager::copy_from_device_global(
2749  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
2750  Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
2751  Copy->getOSModuleHandle(), std::move(RawEvents), Event);
2752 
2753  return CL_SUCCESS;
2754  }
2755  case CG::CGTYPE::None:
2756  throw runtime_error("CG type not implemented.", PI_ERROR_INVALID_OPERATION);
2757  }
2758  return PI_ERROR_INVALID_OPERATION;
2759 }
2760 
2761 bool ExecCGCommand::producesPiEvent() const {
2762  return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
2763 }
2764 
2765 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
2766  // Host tasks are cleaned up upon completion instead.
2767  return Command::supportsPostEnqueueCleanup() &&
2768  (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
2769 }
2770 
2771 bool ExecCGCommand::readyForCleanup() const {
2772  if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
2773  return MLeafCounter == 0 && MEvent->isCompleted();
2774  return Command::readyForCleanup();
2775 }
2776 
2777 KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue)
2778  : Command(Command::CommandType::FUSION, Queue),
2779  MStatus(FusionStatus::ACTIVE) {
2781 }
2782 
2783 std::vector<Command *> &KernelFusionCommand::auxiliaryCommands() {
2784  return MAuxiliaryCommands;
2785 }
2786 
2788  MFusionList.push_back(Kernel);
2789 }
2790 
2791 std::vector<ExecCGCommand *> &KernelFusionCommand::getFusionList() {
2792  return MFusionList;
2793 }
2794 
2795 bool KernelFusionCommand::producesPiEvent() const { return false; }
2796 
2797 pi_int32 KernelFusionCommand::enqueueImp() {
2799  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
2800 
2801  return PI_SUCCESS;
2802 }
2803 
2805  MStatus = Status;
2806 }
2807 
2809 #ifdef XPTI_ENABLE_INSTRUMENTATION
2810  if (!xptiTraceEnabled()) {
2811  return;
2812  }
2813  // Create a payload with the command name and an event using this payload to
2814  // emit a node_create
2815  MCommandNodeType = commandToNodeType(MType);
2816  MCommandName = commandToName(MType);
2817 
2818  static unsigned FusionNodeCount = 0;
2819  std::stringstream PayloadStr;
2820  PayloadStr << "Fusion command #" << FusionNodeCount++;
2821  xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
2822 
2823  uint64_t CommandInstanceNo = 0;
2824  xpti_td *CmdTraceEvent =
2825  xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
2826  xpti_at::active, &CommandInstanceNo);
2827 
2828  MInstanceID = CommandInstanceNo;
2829  if (CmdTraceEvent) {
2830  MTraceEvent = static_cast<void *>(CmdTraceEvent);
2831  // If we are seeing this event again, then the instance ID
2832  // will be greater
2833  // than 1; in this case, we must skip sending a
2834  // notification to create a node as this node has already
2835  // been created. We return this value so the epilog method
2836  // can be called selectively.
2837  // See makeTraceEventProlog.
2838  MFirstInstance = (CommandInstanceNo == 1);
2839  }
2840 
2841  // This function is called in the constructor of the command. At this point
2842  // the kernel fusion list is still empty, so we don't have a terrible lot of
2843  // information we could attach to this node here.
2844  if (MFirstInstance && CmdTraceEvent) {
2845  xpti::addMetadata(CmdTraceEvent, "sycl_device",
2846  deviceToID(MQueue->get_device()));
2847  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2848  deviceToString(MQueue->get_device()));
2849  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2850  getSyclObjImpl(MQueue->get_device())->getDeviceName());
2851  }
2852 
2853  if (MFirstInstance) {
2854  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
2855  detail::GSYCLGraphEvent,
2856  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
2857  static_cast<const void *>(MCommandNodeType.c_str()));
2858  }
2859 
2860 #endif
2861 }
2862 
2863 void KernelFusionCommand::printDot(std::ostream &Stream) const {
2864  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2865 
2866  Stream << "ID = " << this << "\\n";
2867  Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n"
2868  << "FUSION LIST: {";
2869  bool Initial = true;
2870  for (auto *Cmd : MFusionList) {
2871  if (!Initial) {
2872  Stream << ",\\n";
2873  }
2874  Initial = false;
2875  auto *KernelCG = static_cast<detail::CGExecKernel *>(&Cmd->getCG());
2876  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
2877  Stream << "created from source";
2878  } else {
2879  Stream << demangleKernelName(KernelCG->getKernelName());
2880  }
2881  }
2882  Stream << "}\\n";
2883 
2884  Stream << "\"];" << std::endl;
2885 
2886  for (const auto &Dep : MDeps) {
2887  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2888  << " [ label = \"Access mode: "
2889  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2890  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2891  << std::endl;
2892  }
2893 }
2894 
2895 } // namespace detail
2896 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
2897 } // namespace sycl
sycl::_V1::detail::Command::copySubmissionCodeLocation
void copySubmissionCodeLocation()
Definition: commands.cpp:931
sycl::_V1::detail::NDRDescT::NumWorkGroups
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
Definition: cg_types.hpp:122
sycl::_V1::detail::tls_code_loc_t::query
const detail::code_location & query()
Query the information in the TLS slot.
Definition: common.cpp:55
sycl::_V1::detail::Scheduler::MGraphBuilder
GraphBuilder MGraphBuilder
Definition: scheduler.hpp:858
sycl::_V1::detail::ProgramManager::getOrCreateKernel
std::tuple< RT::PiKernel, std::mutex *, RT::PiProgram > getOrCreateKernel(OSModuleHandle M, const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const program_impl *Prg)
Definition: program_manager.cpp:646
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:224
sycl::_V1::detail::Command::MEvent
EventImplPtr MEvent
Definition: commands.hpp:242
sycl::_V1::detail::Command::MQueue
QueueImplPtr MQueue
Definition: commands.hpp:241
sycl::_V1::detail::Command
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:99
sycl::_V1::detail::Command::readyForCleanup
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:719
piEnqueueKernelLaunch
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1813
sycl::_V1::detail::kernel_param_kind_t::kind_stream
@ kind_stream
sycl::_V1::access::mode::discard_read_write
@ discard_read_write
event_impl.hpp
sycl::_V1::detail::EmptyCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1688
sycl::_V1::detail::AllocaSubBufCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1127
sycl::_V1::detail::Command::emitInstrumentationData
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
sycl::_V1::detail::MapMemObject::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1311
sycl::_V1::detail::AccessorImplHost::MOffset
id< 3 > & MOffset
Definition: accessor_impl.hpp:102
sycl::_V1::access::mode
mode
Definition: access.hpp:30
sycl::_V1::detail::CG::Fill2DUSM
@ Fill2DUSM
Definition: cg.hpp:73
PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
Definition: pi.h:637
sycl::_V1::detail::SYCL_STREAM_NAME
constexpr const char * SYCL_STREAM_NAME
Definition: xpti_registry.hpp:29
sycl::_V1::detail::AllocaCommandBase::MLinkedAllocaCmd
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:448
sycl::_V1::detail::KernelFusionCommand::setFusionStatus
void setFusionStatus(FusionStatus Status)
Set the status of this fusion command to Status.
Definition: commands.cpp:2804
piKernelSetArg
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_esimd_emulator.cpp:1370
sycl::_V1::detail::Command::MShouldCompleteEventIfPossible
bool MShouldCompleteEventIfPossible
Definition: commands.hpp:360
context_impl.hpp
sycl::_V1::detail::SYCLMemObjI::getType
virtual MemObjType getType() const =0
sycl::_V1::detail::CG::CopyToDeviceGlobal
@ CopyToDeviceGlobal
Definition: cg.hpp:75
sycl::_V1::detail::MemCpyCommandHost::MemCpyCommandHost
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1580
sycl::_V1::detail::NDRDescT::GlobalSize
sycl::range< 3 > GlobalSize
Definition: cg_types.hpp:116
sycl::_V1::detail::kernel_param_kind_t::kind_std_layout
@ kind_std_layout
sycl::_V1::detail::ContextImplPtr
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:30
sycl::_V1::detail::Command::resolveReleaseDependencies
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
Definition: commands.cpp:878
sycl::_V1::detail::MemCpyCommand::MemCpyCommand
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1408
sycl::_V1::detail::UpdateHostRequirementCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1763
sycl::_V1::backend::ext_oneapi_level_zero
@ ext_oneapi_level_zero
sycl::_V1::detail::ReleaseCommand::readyForCleanup
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1263
sycl::_V1::detail::waitForEvents
static void waitForEvents(const std::vector< EventImplPtr > &Events)
Definition: memory_manager.cpp:114
sycl::_V1::detail::AccessorImplHost::MIsSubBuffer
bool MIsSubBuffer
Definition: accessor_impl.hpp:114
sycl::_V1::detail::AllocaCommand::AllocaCommand
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
Definition: commands.cpp:987
sycl::_V1::detail::Command::MSubmissionFunctionName
std::string MSubmissionFunctionName
Definition: commands.hpp:352
sycl::_V1::detail::UpdateHostRequirementCommand::UpdateHostRequirementCommand
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
Definition: commands.cpp:1754
cg_types.hpp
sycl::_V1::detail::AccessorImplHost
Definition: accessor_impl.hpp:42
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::detail::AllocaSubBufCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1081
sycl::_V1::detail::Command::MTraceEventPrologComplete
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
Definition: commands.hpp:340
sycl::_V1::detail::demangleKernelName
static std::string demangleKernelName(std::string Name)
Definition: commands.cpp:80
xpti_registry.hpp
sycl::_V1::detail::pi::PiKernelCacheConfig
::pi_kernel_cache_config PiKernelCacheConfig
Definition: pi.hpp:148
sycl::_V1::detail::AllocaCommandBase::readyForCleanup
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:985
sycl::_V1::detail::AccessorImplHost::MAccessMode
access::mode MAccessMode
Definition: accessor_impl.hpp:107
PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
Definition: pi.h:639
sycl::_V1::detail::Command::MInstanceID
uint64_t MInstanceID
Instance ID tracked for the command.
Definition: commands.hpp:344
sycl::_V1::detail::CG::PrefetchUSM
@ PrefetchUSM
Definition: cg.hpp:68
sycl::_V1::detail::Command::emitEdgeEventForCommandDependence
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
Definition: commands.cpp:521
sycl::_V1::detail::AccessorImplHost::MElemSize
unsigned int MElemSize
Definition: accessor_impl.hpp:112
sycl::_V1::detail::AllocaCommandBase::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:960
_pi_result
_pi_result
Definition: pi.h:140
sycl::_V1::detail::EnqueueResultT
Result of command enqueueing.
Definition: commands.hpp:54
sycl::_V1::detail::DispatchHostTask
Definition: commands.cpp:296
sycl::_V1::detail::Command::supportsPostEnqueueCleanup
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:717
sycl::_V1::detail::Command::BlockReason::HostTask
@ HostTask
sycl::_V1::detail::EmptyCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1717
sycl::_V1::detail::kernel_param_kind_t::kind_accessor
@ kind_accessor
sycl::_V1::detail::for_each
Function for_each(Group g, Ptr first, Ptr last, Function f)
Definition: group_algorithm.hpp:170
sycl::_V1::detail::tie
auto tie(Ts &...Args)
Definition: tuple.hpp:40
sycl::_V1::detail::Command::getQueue
const QueueImplPtr & getQueue() const
Definition: commands.hpp:166
sycl::_V1::detail::SYCLConfig
Definition: config.hpp:110
sycl::_V1::detail::Command::emitInstrumentation
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
Definition: commands.cpp:776
sycl::_V1::detail::accessModeToString
static std::string accessModeToString(access::mode Mode)
Definition: commands.cpp:140
sycl::_V1::detail::MemoryManager::allocateMemSubBuffer
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:402
sycl::_V1::detail::MapMemObject::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1274
sycl::_V1::detail::Command::makeTraceEventEpilog
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:659
sycl::_V1::detail::ExecCGCommand::getAuxiliaryResources
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
Definition: commands.cpp:1532
sycl::_V1::detail::Command::MIsBlockable
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
Definition: commands.hpp:298
sycl::_V1::detail::Command::processDepEvent
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
Definition: commands.cpp:671
sycl::_V1::detail::EmptyCommand::addRequirement
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
Definition: commands.cpp:1672
sycl
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::detail::DispatchHostTask::DispatchHostTask
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem)
Definition: commands.cpp:341
sycl::_V1::detail::AccessorImplHost::MOffsetInBytes
unsigned int MOffsetInBytes
Definition: accessor_impl.hpp:113
sycl::_V1::detail::pi::PiSampler
::pi_sampler PiSampler
Definition: pi.hpp:137
sycl::_V1::detail::HostKernel
Definition: cg_types.hpp:245
sycl::_V1::detail::EnqueueResultT::SyclEnqueueBlocked
@ SyclEnqueueBlocked
Definition: commands.hpp:58
sycl::_V1::detail::DispatchNativeKernel
void DispatchNativeKernel(void *Blob)
Definition: commands.cpp:2199
sycl::_V1::detail::plugin::getBackend
backend getBackend(void) const
Definition: plugin.hpp:229
sycl::_V1::detail::KernelBundleImplPtr
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
Definition: kernel_bundle.hpp:138
sycl::_V1::detail::MemoryManager::unmap
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:827
access.hpp
sycl::_V1::detail::CG::Kernel
@ Kernel
Definition: cg.hpp:57
queue_impl.hpp
sycl::_V1::detail::Scheduler::GraphBuilder
Graph builder class.
Definition: scheduler.hpp:522
sycl::_V1::detail::ExecCGCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1852
sycl::_V1::backend::ext_intel_esimd_emulator
@ ext_intel_esimd_emulator
sycl::_V1::detail::Command::addUser
void addUser(Command *NewUser)
Definition: commands.hpp:129
sycl::_V1::detail::ExecCGCommand::getCG
detail::CG & getCG() const
Definition: commands.hpp:612
sycl::_V1::detail::Command::MLeafCounter
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:300
sycl::_V1::detail::Requirement
AccessorImplHost Requirement
Definition: accessor_impl.hpp:150
sycl::_V1::detail::ArgDesc::MType
sycl::detail::kernel_param_kind_t MType
Definition: cg_types.hpp:33
sycl::_V1::detail::MemoryManager::release
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:235
scheduler.hpp
sycl::_V1::detail::AccessorImplHost::MMemoryRange
range< 3 > & MMemoryRange
Definition: accessor_impl.hpp:106
sycl::_V1::detail::CGExecKernel::MNDRDesc
NDRDescT MNDRDesc
Stores ND-range description.
Definition: cg.hpp:138
sycl::_V1::detail::Command::getBlockReason
const char * getBlockReason() const
Definition: commands.cpp:920
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:816
sycl::_V1::detail::MemCpyCommandHost::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1598
sycl::_V1::detail::CG::CopyFromDeviceGlobal
@ CopyFromDeviceGlobal
Definition: cg.hpp:76
sycl::_V1::detail::EmptyCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1735
sycl::_V1::detail::CG::CopyAccToPtr
@ CopyAccToPtr
Definition: cg.hpp:58
sycl::_V1::kernel
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:71
sycl::_V1::detail::Command::emitInstrumentationDataProxy
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
Definition: commands.cpp:505
sycl::_V1::detail::flushCrossQueueDeps
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
Definition: commands.cpp:289
sycl::_V1::detail::adjustNDRangePerKernel
static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel, const device_impl &DeviceImpl)
Definition: commands.cpp:2056
sycl::_V1::detail::EnqueueResultT::SyclEnqueueReady
@ SyclEnqueueReady
Definition: commands.hpp:56
sycl::_V1::detail::Command::MDeps
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:294
sycl::_V1::detail::ProgramManager::getInstance
static ProgramManager & getInstance()
Definition: program_manager.cpp:65
sycl::_V1::detail::write
void write(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize, unsigned WIOffset, const char *Str, unsigned Len, unsigned Padding=0)
Definition: stream.hpp:111
sycl::_V1::range< 3 >
sycl::_V1::detail::Command::MFirstInstance
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
Definition: commands.hpp:342
sycl::_V1::detail::AllocaCommandBase::getRequirement
const Requirement * getRequirement() const final
Definition: commands.hpp:431
sycl::_V1::detail::AllocaSubBufCommand::getMemAllocation
void * getMemAllocation() const final
Definition: commands.cpp:1100
sycl::_V1::detail::ProgramManager::getSYCLKernelID
kernel_id getSYCLKernelID(const std::string &KernelName)
Definition: program_manager.cpp:1544
sycl::_V1::detail::CGHostTask::MQueue
std::shared_ptr< detail::queue_impl > MQueue
Definition: cg.hpp:370
sycl::_V1::detail::BlockingT
BlockingT
Definition: commands.hpp:51
piEventsWait
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
Definition: pi_esimd_emulator.cpp:1453
sycl::_V1::detail::cgTypeToString
static std::string cgTypeToString(detail::CG::CGTYPE Type)
Definition: commands.cpp:1787
sycl::_V1::detail::plugin
The plugin class provides a unified interface to the underlying low-level runtimes for the device-agn...
Definition: plugin.hpp:90
sycl::_V1::detail::device_impl::getHandleRef
RT::PiDevice & getHandleRef()
Get reference to PI device.
Definition: device_impl.hpp:65
sycl::_V1::detail::DepDesc::MDepRequirement
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:86
sycl::_V1::detail::applyFuncOnFilteredArgs
static void applyFuncOnFilteredArgs(const ProgramManager::KernelArgMask &EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
Definition: commands.cpp:96
sycl::_V1::detail::AllocaSubBufCommand::AllocaSubBufCommand
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:1063
sycl::_V1::detail::DepDesc::MDepCommand
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:84
program_impl.hpp
sycl::_V1::detail::ReleaseCommand::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1261
sycl::_V1::detail::AccessorImplHost::MSYCLMemObj
detail::SYCLMemObjI * MSYCLMemObj
Definition: accessor_impl.hpp:109
sycl::_V1::detail::event_impl
Definition: event_impl.hpp:36
sycl::_V1::detail::UnMapMemObject::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1391
sycl::_V1::detail::UnMapMemObject::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1335
sycl::_V1::detail::Command::getType
CommandType getType() const
Definition: commands.hpp:132
PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG
provides the preferred cache configuration (large slm or large data)
Definition: pi.h:1381
piKernelSetExecInfo
pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, size_t param_value_size, const void *param_value)
API to set attributes controlling kernel execution.
Definition: pi_esimd_emulator.cpp:2008
piKernelGetGroupInfo
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1387
sycl::_V1::detail::Command::MSubmissionCodeLocation
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
Definition: commands.hpp:348
sycl::_V1::detail::tls_code_loc_t
Data type that manages the code_location information in TLS.
Definition: common.hpp:152
sycl::_V1::detail::deviceToString
static std::string deviceToString(device Device)
Definition: commands.cpp:83
sycl::_V1::detail::Command::MBlockReason
BlockReason MBlockReason
Definition: commands.hpp:314
sycl::_V1::detail::MemCpyCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1461
sycl::_V1::detail::XPTIRegistry::bufferAssociateNotification
static void bufferAssociateNotification(const void *, const void *)
Definition: xpti_registry.cpp:69
sycl::_V1::detail::MemoryManager::map
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:782
sycl::_V1::detail::CG::Copy2DUSM
@ Copy2DUSM
Definition: cg.hpp:72
sycl::_V1::detail::Command::MCommandNodeType
std::string MCommandNodeType
Buffer to build the command node type.
Definition: commands.hpp:336
commands.hpp
sycl::_V1::detail::AllocaCommandBase::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:981
sycl::_V1::detail::DepDesc
Dependency between two commands.
Definition: commands.hpp:73
sycl::_V1::detail::UnMapMemObject::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1357
sycl::_V1::detail::Command::MEnqueueMtx
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
Definition: commands.hpp:276
sycl::_V1::detail::Command::getEvent
const EventImplPtr & getEvent() const
Definition: commands.hpp:168
sycl::_V1::detail::HostTask
Definition: cg_types.hpp:228
sycl::_V1::detail::CG::CopyPtrToAcc
@ CopyPtrToAcc
Definition: cg.hpp:59
piEnqueueEventsWaitWithBarrier
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1536
sycl::_V1::detail::Command::MSubmissionFileName
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
Definition: commands.hpp:351
sycl::_V1::kernel_id
Objects of the class identify kernel is some kernel_bundle related APIs.
Definition: kernel_bundle.hpp:44
sycl::_V1::detail::pi::PiProgram
::pi_program PiProgram
Definition: pi.hpp:130
sycl::_V1::detail::ReleaseCommand::ReleaseCommand
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
Definition: commands.cpp:1149
sycl::_V1::detail::KernelFusionCommand::getFusionList
std::vector< ExecCGCommand * > & getFusionList()
Definition: commands.cpp:2791
sycl::_V1::detail::kernel_param_kind_t::kind_invalid
@ kind_invalid
error_handling.hpp
sycl::_V1::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: uniform.hpp:36
sycl::_V1::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:49
sycl::_V1::detail::pi::PiMem
::pi_mem PiMem
Definition: pi.hpp:134
sycl::_V1::detail::KernelFusionCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:2795
sycl::_V1::detail::ReverseRangeDimensionsForKernel
static void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
Definition: commands.cpp:2083
pi_mem
_pi_mem * pi_mem
Definition: pi.h:969
sycl::_V1::detail::CG::Fill
@ Fill
Definition: cg.hpp:63
sycl::_V1::detail::device_impl
Definition: device_impl.hpp:36
sycl::_V1::read_write
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:74
sycl::_V1::detail::UpdateHostRequirementCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1557
sycl::_V1::detail::OSModuleHandle
intptr_t OSModuleHandle
Uniquely identifies an operating system module (executable or a dynamic library)
Definition: os_util.hpp:48
sycl::_V1::detail::EventImplPtr
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:42
sycl::_V1::detail::AllocaCommandBase::MMemAllocation
void * MMemAllocation
Definition: commands.hpp:441
sycl::_V1::detail::MemoryManager::allocate
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:264
sycl::_V1::detail::MemoryManager::copy
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:716
sycl::_V1::detail::MemCpyCommandHost::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1737
sycl::_V1::detail::KernelFusionCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:2863
program_manager.hpp
sycl::_V1::detail::kernel_param_kind_t::kind_pointer
@ kind_pointer
sycl::_V1::detail::CG::getType
CGTYPE getType()
Definition: cg.hpp:101
sycl::_V1::detail::ArgDesc::MSize
int MSize
Definition: cg_types.hpp:35
sycl::_V1::detail::pi::PiKernel
::pi_kernel PiKernel
Definition: pi.hpp:131
sycl::_V1::detail::CGExecKernel
"Execute kernel" command group class.
Definition: cg.hpp:135
piextKernelSetArgSampler
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
Definition: pi_esimd_emulator.cpp:1379
sycl::_V1::detail::EnqueueResultT::SyclEnqueueSuccess
@ SyclEnqueueSuccess
Definition: commands.hpp:57
sycl::_V1::detail::Command::waitForPreparedHostEvents
void waitForPreparedHostEvents() const
Definition: commands.cpp:425
piextKernelSetArgMemObj
pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value)
Definition: pi_esimd_emulator.cpp:1374
sycl::_V1::detail::KernelFusionCommand::auxiliaryCommands
std::vector< Command * > & auxiliaryCommands()
Definition: commands.cpp:2783
sycl::_V1::detail::MemCpyCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1427
sycl::_V1::detail::AllocaCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1043
sycl::_V1::detail::CG::Memset2DUSM
@ Memset2DUSM
Definition: cg.hpp:74
sycl::_V1::detail::ExecCGCommand
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:601
sycl::_V1::detail::kernel_param_kind_t::kind_specialization_constants_buffer
@ kind_specialization_constants_buffer
sycl::_V1::detail::NDRDescT
Definition: cg_types.hpp:41
sycl::_V1::detail::Command::waitForEvents
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, RT::PiEvent &Event)
Definition: commands.cpp:430
sycl::_V1::detail::CGHostTask
Definition: cg.hpp:366
sycl::_V1::detail::CG::CGTYPE
CGTYPE
Type of the command group.
Definition: cg.hpp:55
sycl::_V1::detail::AllocaCommandBase::MIsLeaderAlloca
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
Definition: commands.hpp:454
sycl::_V1::detail::Command::CommandType
CommandType
Definition: commands.hpp:101
sycl::_V1::detail::ReleaseCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1154
sycl::_V1::detail::EnqueueResultT::SyclEnqueueFailed
@ SyclEnqueueFailed
Definition: commands.hpp:59
sycl::_V1::detail::Command::BlockReason::HostAccessor
@ HostAccessor
kernel_info.hpp
sycl::_V1::detail::Command::MAddress
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
Definition: commands.hpp:332
sycl::_V1::backend::opencl
@ opencl
sycl::_V1::detail::AllocaCommandBase::MIsActive
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:450
sycl::_V1::detail::NDRDescT::LocalSize
sycl::range< 3 > LocalSize
Definition: cg_types.hpp:117
sycl::_V1::detail::ExecCGCommand::clearAuxiliaryResources
void clearAuxiliaryResources()
Definition: commands.cpp:1538
piEnqueueEventsWait
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1531
sycl::_V1::detail::Command::getPiEvents
std::vector< RT::PiEvent > getPiEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:219
backend_types.hpp
sycl::_V1::detail::Command::MTraceEvent
void * MTraceEvent
The event for node_create and task_begin.
Definition: commands.hpp:325
sycl::_V1::detail::Command::MCommandName
std::string MCommandName
Buffer to build the command end-user understandable name.
Definition: commands.hpp:338
sycl::_V1::detail::DispatchHostTask::operator()
void operator()() const
Definition: commands.cpp:345
sycl::_V1::detail::NDRDescT::GlobalOffset
sycl::id< 3 > GlobalOffset
Definition: cg_types.hpp:118
sycl::_V1::detail::ProgramManager::KernelArgMask
std::vector< bool > KernelArgMask
Definition: program_manager.hpp:84
sycl::_V1::detail::AllocaCommandBase::AllocaCommandBase
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
Definition: commands.cpp:949
sycl::_V1::detail::plugin::call
void call(ArgsT... Args) const
Calls the API, traces the call, checks the result.
Definition: plugin.hpp:217
sycl::_V1::detail::AllocaCommandBase::getMemAllocation
virtual void * getMemAllocation() const =0
std
Definition: accessor.hpp:3230
sycl::_V1::detail::kernel_param_kind_t::kind_sampler
@ kind_sampler
kernel_desc.hpp
sycl::_V1::detail::BLOCKING
@ BLOCKING
Definition: commands.hpp:51
sycl::_V1::detail::ExecCGCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:2012
sycl::_V1::detail::Command::MWorkerQueue
QueueImplPtr MWorkerQueue
Definition: commands.hpp:243
sycl::_V1::detail::Command::MMarkedForCleanup
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
Definition: commands.hpp:365
sycl::_V1::detail::ProgramManager::getEliminatedKernelArgMask
KernelArgMask getEliminatedKernelArgMask(OSModuleHandle M, pi::PiProgram NativePrg, const std::string &KernelName)
Returns the mask for eliminated kernel arguments for the requested kernel within the native program.
Definition: program_manager.cpp:1471
sycl::_V1::detail::device_impl::getPlugin
const plugin & getPlugin() const
Definition: device_impl.hpp:126
sycl::_V1::detail::ReleaseCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1259
sycl::_V1::detail::Command::makeTraceEventProlog
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:627
sycl::_V1::detail::ArgDesc
Definition: cg_types.hpp:27
sycl::_V1::detail::CG::CopyUSM
@ CopyUSM
Definition: cg.hpp:66
sycl::_V1::detail::QueueImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: event_impl.hpp:32
sycl::_V1::detail::EmptyCommand::EmptyCommand
EmptyCommand(QueueImplPtr Queue)
Definition: commands.cpp:1660
sycl::_V1::detail::Scheduler::GraphBuilder::connectDepEvent
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Definition: graph_builder.cpp:1296
sycl::_V1::detail::KernelFusionCommand::addToFusionList
void addToFusionList(ExecCGCommand *Kernel)
Definition: commands.cpp:2787
sampler.hpp
sycl::_V1::detail::CG::FillUSM
@ FillUSM
Definition: cg.hpp:67
sycl::_V1::detail::Scheduler::getInstance
static Scheduler & getInstance()
Definition: scheduler.cpp:252
kernel_bundle_impl.hpp
sycl::_V1::detail::Command::emitEnqueuedEventSignal
void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr)
Creates a signal event with the enqueued kernel event handle.
Definition: commands.cpp:764
sycl::_V1::detail::Command::addDep
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:724
sycl::_V1::detail::Command::MAddressString
std::string MAddressString
Buffer to build the address string.
Definition: commands.hpp:334
sycl::_V1::detail::MapMemObject::MapMemObject
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
Definition: commands.cpp:1265
piextKernelSetArgPointer
pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Sets up pointer arguments for CL kernels.
Definition: pi_esimd_emulator.cpp:1967
sycl::_V1::detail::get_kernel_device_specific_info
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(RT::PiKernel Kernel, RT::PiDevice Device, const plugin &Plugin)
Definition: kernel_info.hpp:86
sycl::_V1::detail::AllocaCommandBase::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:983
sycl::_V1::detail::CG::CopyAccToAcc
@ CopyAccToAcc
Definition: cg.hpp:60
piEnqueueNativeKernel
pi_result piEnqueueNativeKernel(pi_queue queue, void(*user_func)(void *), void *args, size_t cb_args, pi_uint32 num_mem_objects, const pi_mem *mem_list, const void **args_mem_loc, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1883
sycl::_V1::detail::AccessorImplHost::MData
void *& MData
Definition: accessor_impl.hpp:116
sycl::_V1::detail::MemCpyCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1501
sycl::_V1::detail::AllocaCommandBase::getSYCLMemObj
SYCLMemObjI * getSYCLMemObj() const
Definition: commands.hpp:427
kernel_impl.hpp
sycl::_V1::detail::NDRDescT::set
void set(sycl::range< Dims_ > NumWorkItems)
Definition: cg_types.hpp:57
sycl::_V1::detail::Command::MPreparedHostDepsEvents
std::vector< EventImplPtr > & MPreparedHostDepsEvents
Definition: commands.hpp:248
sycl::_V1::detail::AccessorImplHost::MAccessRange
range< 3 > & MAccessRange
Definition: accessor_impl.hpp:104
sycl::_V1::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:267
sycl::_V1::detail::AccessorImplHost::MDims
unsigned int MDims
Definition: accessor_impl.hpp:111
sycl::_V1::detail::AllocaCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1005
sycl::_V1::detail::CG::CodeplayHostTask
@ CodeplayHostTask
Definition: cg.hpp:70
sycl::_V1::detail::Command::enqueue
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
Definition: commands.cpp:787
sycl::_V1::detail::Command::MPreparedDepsEvents
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Definition: commands.hpp:247
sycl::_V1::detail::Command::MStreamID
int32_t MStreamID
The stream under which the traces are emitted.
Definition: commands.hpp:329
sycl::_V1::detail::ReleaseCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1241
sycl::_V1::detail::AllocaCommandBase::MRequirement
Requirement MRequirement
Definition: commands.hpp:459
sycl::_V1::detail::AllocaCommandBase
Base class for memory allocation commands.
Definition: commands.hpp:420
sycl::_V1::detail::ArgDesc::MPtr
void * MPtr
Definition: cg_types.hpp:34
sycl::_V1::detail::SetKernelParamsAndLaunch
static pi_result SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector< RT::PiEvent > &RawEvents, RT::PiEvent *OutEvent, const ProgramManager::KernelArgMask &EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2091
sycl::_V1::detail::Command::emitEdgeEventForEventDependence
void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr)
Creates an edge event when the dependency is an event.
Definition: commands.cpp:570
sycl::_V1::interop_handle
Definition: interop_handle.hpp:36
sycl::_V1::detail::Command::getWorkerQueue
const QueueImplPtr & getWorkerQueue() const
Get the queue this command will be submitted to.
Definition: commands.cpp:710
sycl::_V1::access::mode::read_write
@ read_write
sycl::_V1::detail::KernelFusionCommand::FusionStatus
FusionStatus
Definition: commands.hpp:662
sycl::_V1::detail::MemCpyCommand::getWorkerContext
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1457
sycl::_V1::detail::Command::MType
CommandType MType
The type of the command.
Definition: commands.hpp:274
sycl::_V1::detail::KernelFusionCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:2808
sycl::_V1::detail::UnMapMemObject::UnMapMemObject
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
Definition: commands.cpp:1328
sycl::_V1::detail::enqueueImpKernel
pi_int32 enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle, std::vector< RT::PiEvent > &RawEvents, RT::PiEvent *OutEvent, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, RT::PiKernelCacheConfig KernelCacheConfig)
Definition: commands.cpp:2223
sycl::_V1::detail::enqueue_kernel_launch::handleErrorOrWarning
void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of piEnqueueKernelLaunch to emit user-friendly exception describing...
Definition: error_handling.cpp:296
sycl::_V1::detail::MemCpyCommandHost::getWorkerContext
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1628
sycl::_V1::detail::pi::PiEvent
::pi_event PiEvent
Definition: pi.hpp:136
sycl::_V1::detail::NDRDescT::Dims
size_t Dims
Definition: cg_types.hpp:123
sycl::_V1::detail::Command::producesPiEvent
virtual bool producesPiEvent() const
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:715
stream_impl.hpp
sycl::_V1::detail::Command::MEnqueueStatus
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
Definition: commands.hpp:317
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:388
sycl::_V1::detail::CG::UpdateHost
@ UpdateHost
Definition: cg.hpp:64
pi_int32
int32_t pi_int32
Definition: pi.h:128
sycl::_V1::access::mode::discard_write
@ discard_write
sycl::_V1::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: common.hpp:300
sampler_impl.hpp
sycl::_V1::detail::Command::ALLOCA_SUB_BUF
@ ALLOCA_SUB_BUF
Definition: commands.hpp:105
sycl::_V1::detail::ArgDesc::MIndex
int MIndex
Definition: cg_types.hpp:36
sycl::_V1::detail::Command::getWorkerContext
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:706
sycl::_V1::detail::ExecCGCommand::ExecCGCommand
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue)
Definition: commands.cpp:1840
memory_manager.hpp
sycl::_V1::detail::Command::enqueueImp
virtual pi_int32 enqueueImp()=0
Private interface. Derived classes should implement this method.