DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 
8 #include "ur_api.h"
10 
11 #include <detail/context_impl.hpp>
12 #include <detail/event_impl.hpp>
15 #include <detail/kernel_impl.hpp>
16 #include <detail/kernel_info.hpp>
19 #include <detail/queue_impl.hpp>
20 #include <detail/sampler_impl.hpp>
23 #include <detail/stream_impl.hpp>
24 #include <detail/xpti_registry.hpp>
25 #include <sycl/access/access.hpp>
26 #include <sycl/backend_types.hpp>
27 #include <sycl/detail/cg_types.hpp>
28 #include <sycl/detail/helpers.hpp>
30 #include <sycl/sampler.hpp>
31 
32 #include <cassert>
33 #include <optional>
34 #include <string>
35 #include <vector>
36 
37 #ifdef __has_include
38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
40 #include <cstdlib>
41 #include <cxxabi.h>
42 #include <memory>
43 #endif
44 #endif
45 
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
48 #include <detail/xpti_registry.hpp>
49 #endif
50 
51 namespace sycl {
52 inline namespace _V1 {
53 namespace detail {
54 
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
56 // Global graph for the application
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
58 
59 static bool CurrentCodeLocationValid() {
60  detail::tls_code_loc_t Tls;
61  auto CodeLoc = Tls.query();
62  auto FileName = CodeLoc.fileName();
63  auto FunctionName = CodeLoc.functionName();
64  return (FileName && FileName[0] != '\0') ||
65  (FunctionName && FunctionName[0] != '\0');
66 }
67 
68 void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
69  xpti_td *TraceEvent, uint16_t Type,
70  const void *Addr) {
71  if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
72  return;
73  // Trace event notifier that emits a Type event
74  xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
75  static_cast<xpti_td *>(TraceEvent), InstanceID, Addr);
76 }
77 
78 static size_t deviceToID(const device &Device) {
79  return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
80 }
81 
82 static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) {
83  xpti::addMetadata(TraceEvent, "sycl_device_type",
84  queueDeviceToString(Queue.get()));
85  if (Queue) {
86  xpti::addMetadata(TraceEvent, "sycl_device",
87  deviceToID(Queue->get_device()));
88  xpti::addMetadata(TraceEvent, "sycl_device_name",
89  getSyclObjImpl(Queue->get_device())->getDeviceName());
90  }
91 }
92 
93 static unsigned long long getQueueID(const QueueImplPtr &Queue) {
94  return Queue ? Queue->getQueueID() : 0;
95 }
96 #endif
97 
98 static ContextImplPtr getContext(const QueueImplPtr &Queue) {
99  if (Queue)
100  return Queue->getContextImplPtr();
101  return nullptr;
102 }
103 
104 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
105 struct DemangleHandle {
106  char *p;
107  DemangleHandle(char *ptr) : p(ptr) {}
108 
109  DemangleHandle(const DemangleHandle &) = delete;
110  DemangleHandle &operator=(const DemangleHandle &) = delete;
111 
112  ~DemangleHandle() { std::free(p); }
113 };
114 static std::string demangleKernelName(std::string Name) {
115  int Status = -1; // some arbitrary value to eliminate the compiler warning
116  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
117  return (Status == 0) ? result.p : Name;
118 }
119 #else
120 static std::string demangleKernelName(std::string Name) { return Name; }
121 #endif
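// Editorial note: a minimal standalone sketch of the demangling pattern used
// above. abi::__cxa_demangle returns a malloc'ed buffer that the caller must
// free, which is exactly what DemangleHandle's destructor does.
// \code
//   #include <cxxabi.h>
//   #include <cstdlib>
//   #include <string>
//
//   std::string demangle(const char *Mangled) {
//     int Status = -1;
//     char *Buf = abi::__cxa_demangle(Mangled, nullptr, nullptr, &Status);
//     std::string Result = (Status == 0 && Buf) ? Buf : Mangled;
//     std::free(Buf); // free(nullptr) is a no-op
//     return Result;
//   }
//   // demangle("_Z3addii") yields "add(int, int)".
// \endcode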
122 
123 void applyFuncOnFilteredArgs(
124  const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
125  std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
126  if (!EliminatedArgMask || EliminatedArgMask->size() == 0) {
127  for (ArgDesc &Arg : Args) {
128  Func(Arg, Arg.MIndex);
129  }
130  } else {
131  // TODO this is not necessary as long as we can guarantee that the
132  // arguments are already sorted (e. g. handle the sorting in handler
133  // if necessary due to set_arg(...) usage).
134  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
135  return A.MIndex < B.MIndex;
136  });
137  int LastIndex = -1;
138  size_t NextTrueIndex = 0;
139 
140  for (ArgDesc &Arg : Args) {
141  // Handle potential gaps in set arguments (e. g. if some of them are
142  // set on the user side).
143  for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
144  if (!(*EliminatedArgMask)[Idx])
145  ++NextTrueIndex;
146  LastIndex = Arg.MIndex;
147 
148  if ((*EliminatedArgMask)[Arg.MIndex])
149  continue;
150 
151  Func(Arg, NextTrueIndex);
152  ++NextTrueIndex;
153  }
154  }
155 }
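// Editorial note: a worked example for the index remapping above, assuming the
// helper is named as in the signature shown and that KernelArgMask behaves like
// std::vector<bool>, with a 'true' entry meaning "argument was eliminated".
// \code
//   // Three arguments were set via set_arg(...): MIndex 0, 1 and 2.
//   // Argument 1 was eliminated, so the mask is {false, true, false}.
//   KernelArgMask Mask{false, true, false};
//   applyFuncOnFilteredArgs(&Mask, Args,
//                           [](detail::ArgDesc &Arg, int NextTrueIndex) {
//     // Invoked with (Arg.MIndex == 0, NextTrueIndex == 0) and
//     //              (Arg.MIndex == 2, NextTrueIndex == 1);
//     // the eliminated argument 1 is skipped entirely.
//   });
// \endcode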
156 
157 static std::string accessModeToString(access::mode Mode) {
158  switch (Mode) {
159  case access::mode::read:
160  return "read";
161  case access::mode::write:
162  return "write";
163  case access::mode::read_write:
164  return "read_write";
165  case access::mode::discard_write:
166  return "discard_write";
167  case access::mode::discard_read_write:
168  return "discard_read_write";
169  default:
170  return "unknown";
171  }
172 }
173 
174 #ifdef XPTI_ENABLE_INSTRUMENTATION
175 // Using the command group type to create node types for the asynchronous task
176 // graph modeling
177 static std::string commandToNodeType(Command::CommandType Type) {
178  switch (Type) {
179  case Command::CommandType::RUN_CG:
180  return "command_group_node";
181  case Command::CommandType::COPY_MEMORY:
182  return "memory_transfer_node";
183  case Command::CommandType::ALLOCA:
184  return "memory_allocation_node";
185  case Command::CommandType::ALLOCA_SUB_BUF:
186  return "sub_buffer_creation_node";
187  case Command::CommandType::RELEASE:
188  return "memory_deallocation_node";
189  case Command::CommandType::MAP_MEM_OBJ:
190  return "memory_transfer_node";
191  case Command::CommandType::UNMAP_MEM_OBJ:
192  return "memory_transfer_node";
193  case Command::CommandType::UPDATE_REQUIREMENT:
194  return "host_acc_create_buffer_lock_node";
195  case Command::CommandType::EMPTY_TASK:
196  return "host_acc_destroy_buffer_release_node";
197  case Command::CommandType::FUSION:
198  return "kernel_fusion_placeholder_node";
199  default:
200  return "unknown_node";
201  }
202 }
203 
204 // The names generated here and their format are subject to change to
205 // something more meaningful to end-users, as these strings will be visible in
206 // analysis tools that subscribe to this data.
207 static std::string commandToName(Command::CommandType Type) {
208  switch (Type) {
209  case Command::CommandType::RUN_CG:
210  return "Command Group Action";
211  case Command::CommandType::COPY_MEMORY:
212  return "Memory Transfer (Copy)";
213  case Command::CommandType::ALLOCA:
214  return "Memory Allocation";
215  case Command::CommandType::ALLOCA_SUB_BUF:
216  return "Sub Buffer Creation";
217  case Command::CommandType::RELEASE:
218  return "Memory Deallocation";
219  case Command::CommandType::MAP_MEM_OBJ:
220  return "Memory Transfer (Map)";
221  case Command::CommandType::UNMAP_MEM_OBJ:
222  return "Memory Transfer (Unmap)";
223  case Command::CommandType::UPDATE_REQUIREMENT:
224  return "Host Accessor Creation/Buffer Lock";
225  case Command::CommandType::EMPTY_TASK:
226  return "Host Accessor Destruction/Buffer Lock Release";
227  case Command::CommandType::FUSION:
228  return "Kernel Fusion Placeholder";
229  default:
230  return "Unknown Action";
231  }
232 }
233 #endif
234 
235 std::vector<ur_event_handle_t>
236 Command::getUrEvents(const std::vector<EventImplPtr> &EventImpls) const {
237  std::vector<ur_event_handle_t> RetUrEvents;
238  for (auto &EventImpl : EventImpls) {
239  if (EventImpl->getHandleRef() == nullptr)
240  continue;
241 
242  // Do not add redundant event dependencies for in-order queues.
243  // At this stage the dependency is definitely a UR task; we only need to check
244  // whether the current command is a host task, in which case the UR event must
245  // not be skipped, since in-order queues synchronize different task types differently.
246  if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue &&
247  MWorkerQueue->isInOrder() && !isHostTask())
248  continue;
249 
250  RetUrEvents.push_back(EventImpl->getHandleRef());
251  }
252 
253  return RetUrEvents;
254 }
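// Editorial note: the in-order special case above corresponds to the following
// user-level situation (sketch using the public SYCL API):
// \code
//   sycl::queue Q{sycl::property::queue::in_order{}};
//   sycl::event E1 = Q.submit(
//       [&](sycl::handler &CGH) { CGH.single_task([] {}); });
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.depends_on(E1); // redundant: the in-order queue already orders this
//     CGH.single_task([] {});
//   });
// \endcode
// Because both commands target the same in-order queue and neither is a host
// task, getUrEvents() drops E1's native handle from the wait list.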
255 
256 // This function largely duplicates getUrEvents. It exists as a short-term
257 // solution for the issue that a barrier with a wait list could not
258 // handle empty UR event handles when a kernel is enqueued on host-task
259 // completion.
260 std::vector<ur_event_handle_t> Command::getUrEventsBlocking(
261  const std::vector<EventImplPtr> &EventImpls) const {
262  std::vector<ur_event_handle_t> RetUrEvents;
263  for (auto &EventImpl : EventImpls) {
264  // Throwaway events created with the empty constructor will not have a context
265  // (it is set lazily). Calling getContextImpl() would set that
266  // context, which we want to avoid because it is expensive.
267  // Also skip host-task and NOP events.
268  if (EventImpl->isDefaultConstructed() || EventImpl->isHost() ||
269  EventImpl->isNOP())
270  continue;
271  // In this path nullptr native event means that the command has not been
272  // enqueued. It may happen if async enqueue in a host task is involved.
273  if (!EventImpl->isEnqueued()) {
274  if (!EventImpl->getCommand() ||
275  !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
276  continue;
277  std::vector<Command *> AuxCmds;
278  Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
279  BLOCKING);
280  }
281  // Do not add redundant event dependencies for in-order queues.
282  // At this stage the dependency is definitely a UR task; we only need to check
283  // whether the current command is a host task, in which case the UR event must
284  // not be skipped, since in-order queues synchronize different task types differently.
285  if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue &&
286  MWorkerQueue->isInOrder() && !isHostTask())
287  continue;
288 
289  RetUrEvents.push_back(EventImpl->getHandleRef());
290  }
291 
292  return RetUrEvents;
293 }
294 
295 bool Command::isHostTask() const {
296  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
297  ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
298  CGType::CodeplayHostTask);
299 }
300 
301 bool Command::isFusable() const {
302  if ((MType != CommandType::RUN_CG)) {
303  return false;
304  }
305  const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
306  return (CG.getType() == CGType::Kernel) &&
307  (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative) &&
308  (!static_cast<const CGExecKernel &>(CG).MKernelUsesClusterLaunch);
309 }
310 
311 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
312  const QueueImplPtr &Queue) {
313  for (auto &EventImpl : EventImpls) {
314  EventImpl->flushIfNeeded(Queue);
315  }
316 }
317 
318 namespace {
319 
320 struct EnqueueNativeCommandData {
321  interop_handle ih;
322  std::function<void(interop_handle)> func;
323 };
324 
325 void InteropFreeFunc(ur_queue_handle_t, void *InteropData) {
326  auto *Data = reinterpret_cast<EnqueueNativeCommandData *>(InteropData);
327  return Data->func(Data->ih);
328 }
329 } // namespace
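// Editorial note: EnqueueNativeCommandData plus InteropFreeFunc is the usual
// C-callback "trampoline" pattern: a capturing std::function cannot be passed
// where a plain function pointer is expected, so a pointer to it travels
// through the void * user-data argument. A generic sketch of the same pattern
// (hypothetical names, not part of this file):
// \code
//   #include <functional>
//
//   using c_callback_t = void (*)(void *UserData);
//
//   void trampoline(void *UserData) {
//     auto *Fn = static_cast<std::function<void()> *>(UserData);
//     (*Fn)(); // forward to the captured callable
//   }
//
//   void register_callback(c_callback_t CB, void *UserData); // C-style API
//
//   void example() {
//     static std::function<void()> Work = [] { /* user code */ };
//     register_callback(&trampoline, &Work); // Work must outlive the callback
//   }
// \endcode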
330 
331 class DispatchHostTask {
332  ExecCGCommand *MThisCmd;
333  std::vector<interop_handle::ReqToMem> MReqToMem;
334  std::vector<ur_mem_handle_t> MReqUrMem;
335 
336  bool waitForEvents() const {
337  std::map<const PluginPtr, std::vector<EventImplPtr>>
338  RequiredEventsPerPlugin;
339 
340  for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
341  const PluginPtr &Plugin = Event->getPlugin();
342  RequiredEventsPerPlugin[Plugin].push_back(Event);
343  }
344 
345  // Wait for dependency device events.
346  // FIXME: The current implementation of waiting for events makes the thread
347  // 'sleep' until all dependency events are complete. We need a more
348  // sophisticated waiting mechanism that lets this thread pick up other
349  // available work and resume once all required events are ready.
350  for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
351  std::vector<ur_event_handle_t> RawEvents =
352  MThisCmd->getUrEvents(PluginWithEvents.second);
353  if (RawEvents.size() == 0)
354  continue;
355  try {
356  PluginWithEvents.first->call(urEventWait, RawEvents.size(),
357  RawEvents.data());
358  } catch (const sycl::exception &) {
359  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
360  std::current_exception());
361  return false;
362  } catch (...) {
363  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
364  std::current_exception());
365  return false;
366  }
367  }
368 
369  // Wait for dependency host events.
370  // Host events can't throw exceptions so don't try to catch it.
371  for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
372  Event->waitInternal();
373  }
374 
375  return true;
376  }
377 
378 public:
379  DispatchHostTask(ExecCGCommand *ThisCmd,
380  std::vector<interop_handle::ReqToMem> ReqToMem,
381  std::vector<ur_mem_handle_t> ReqUrMem)
382  : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)),
383  MReqUrMem(std::move(ReqUrMem)) {}
384 
385  void operator()() const {
386  assert(MThisCmd->getCG().getType() == CGType::CodeplayHostTask);
387 
388  CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
389 
390 #ifdef XPTI_ENABLE_INSTRUMENTATION
391  // The host task is executed asynchronously in a separate thread, which cannot
392  // use the code location data stored in TLS. So we keep the submission code
393  // location as a Command field and put it into TLS here so that a thrown
394  // exception can query and report it.
395  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
396  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
397  AsyncCodeLocationPtr.reset(
398  new detail::tls_code_loc_t(MThisCmd->MSubmissionCodeLocation));
399  }
400 #endif
401 
402  if (!waitForEvents()) {
403  std::exception_ptr EPtr = std::make_exception_ptr(sycl::exception(
404  make_error_code(errc::runtime),
405  std::string("Couldn't wait for host-task's dependencies")));
406 
407  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(EPtr);
408  // reset host-task's lambda and quit
409  HostTask.MHostTask.reset();
411  return;
412  }
413 
414  try {
415  // we're ready to call the user-defined lambda now
416  if (HostTask.MHostTask->isInteropTask()) {
417  assert(HostTask.MQueue &&
418  "Host task submissions should have an associated queue");
419  interop_handle IH{MReqToMem, HostTask.MQueue,
420  HostTask.MQueue->getDeviceImplPtr(),
421  HostTask.MQueue->getContextImplPtr()};
422  // TODO: should all the backends that support this entry point use this
423  // for host task?
424  auto &Queue = HostTask.MQueue;
425  bool NativeCommandSupport = false;
426  Queue->getPlugin()->call(
427  urDeviceGetInfo,
428  detail::getSyclObjImpl(Queue->get_device())->getHandleRef(),
429  UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
430  sizeof(NativeCommandSupport), &NativeCommandSupport, nullptr);
431  if (NativeCommandSupport) {
432  EnqueueNativeCommandData CustomOpData{
433  IH, HostTask.MHostTask->MInteropTask};
434 
435  // We are assuming that we have already synchronized with the HT's
436  // dependent events, and that the user will synchronize before the end
437  // of the HT lambda. As such we don't pass in any events, or ask for
438  // one back.
439  //
440  // This entry point is needed in order to migrate memory across
441  // devices in the same context for CUDA and HIP backends
442  Queue->getPlugin()->call(
443  urEnqueueNativeCommandExp, HostTask.MQueue->getHandleRef(),
444  InteropFreeFunc, &CustomOpData, MReqUrMem.size(),
445  MReqUrMem.data(), nullptr, 0, nullptr, nullptr);
446  } else {
447  HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(),
448  IH);
449  }
450  } else
451  HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
452  } catch (...) {
453  auto CurrentException = std::current_exception();
454 #ifdef XPTI_ENABLE_INSTRUMENTATION
455  // sycl::exception already emits tracing of the message with the code location
456  // if available. For other exception types we need to trigger tracing
457  // explicitly by calling TraceEventXPTI.
458  if (xptiTraceEnabled()) {
459  try {
460  rethrow_exception(CurrentException);
461  } catch (const sycl::exception &) {
462  // it is already traced, nothing to care about
463  } catch (const std::exception &StdException) {
464  GlobalHandler::instance().TraceEventXPTI(StdException.what());
465  } catch (...) {
466  GlobalHandler::instance().TraceEventXPTI(
467  "Host task lambda thrown non standard exception");
468  }
469  }
470 #endif
471  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
472  CurrentException);
473  }
474 
475  HostTask.MHostTask.reset();
476 
477 #ifdef XPTI_ENABLE_INSTRUMENTATION
478  // The host task is done; clear its submission location so it does not
479  // interfere with the submission of subsequent dependent kernels.
480  AsyncCodeLocationPtr.reset();
481 #endif
482 
483  try {
484  // If we enqueue blocked users, the UR level could throw an exception that
485  // should now be treated as asynchronous.
487  } catch (...) {
488  auto CurrentException = std::current_exception();
489  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
490  CurrentException);
491  }
492  }
493 };
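// Editorial note: the host-task dispatch functor above backs user code of
// roughly the following shape (sketch using the public SYCL API; names are
// illustrative):
// \code
//   sycl::queue Q;
//   sycl::buffer<int, 1> Buf{sycl::range<1>{16}};
//   Q.submit([&](sycl::handler &CGH) {
//     auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
//     CGH.host_task([=](sycl::interop_handle IH) {
//       // Runs on a host thread once the dependencies above are satisfied;
//       // IH exposes the native backend objects behind Acc.
//     });
//   });
// \endcode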
494 
495 void Command::waitForPreparedHostEvents() const {
496  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
497  HostEvent->waitInternal();
498 }
499 
500 void Command::waitForEvents(QueueImplPtr Queue,
501  std::vector<EventImplPtr> &EventImpls,
502  ur_event_handle_t &Event) {
503 #ifndef NDEBUG
504  for (const EventImplPtr &Event : EventImpls)
505  assert(!Event->isHost() &&
506  "Only non-host events are expected to be waited for here");
507 #endif
508  if (!EventImpls.empty()) {
509  if (!Queue) {
510  // The host queue can wait for events from different contexts, i.e. it may
511  // contain events with different contexts in its MPreparedDepsEvents.
512  // The OpenCL 2.1 spec says that clWaitForEvents will return
513  // CL_INVALID_CONTEXT if the events specified in the list do not belong to
514  // the same context. Thus we split all the events into a per-context map.
515  // An example: we have two queues for the same CPU device, Q1 and Q2, and
516  // therefore two different contexts for the same CPU device, C1 and C2.
517  // We also have the default host queue, which is accessible via the
518  // Scheduler. Now assume we have three different events: E1(C1),
519  // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
520  // three events (E1, E2, E3). If urEventWait were called for all three
521  // events, we would get a CL_INVALID_CONTEXT failure because these events
522  // refer to different contexts.
523  std::map<context_impl *, std::vector<EventImplPtr>>
524  RequiredEventsPerContext;
525 
526  for (const EventImplPtr &Event : EventImpls) {
527  ContextImplPtr Context = Event->getContextImpl();
528  assert(Context.get() &&
529  "Only non-host events are expected to be waited for here");
530  RequiredEventsPerContext[Context.get()].push_back(Event);
531  }
532 
533  for (auto &CtxWithEvents : RequiredEventsPerContext) {
534  std::vector<ur_event_handle_t> RawEvents =
535  getUrEvents(CtxWithEvents.second);
536  if (!RawEvents.empty()) {
537  CtxWithEvents.first->getPlugin()->call(urEventWait, RawEvents.size(),
538  RawEvents.data());
539  }
540  }
541  } else {
542  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
543  flushCrossQueueDeps(EventImpls, MWorkerQueue);
544  const PluginPtr &Plugin = Queue->getPlugin();
545 
546  if (MEvent != nullptr)
547  MEvent->setHostEnqueueTime();
548  Plugin->call(urEnqueueEventsWait, Queue->getHandleRef(), RawEvents.size(),
549  &RawEvents[0], &Event);
550  }
551  }
552 }
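// Editorial note: the per-context split above can be shown in isolation
// (sketch; ContextT/EventT stand in for the runtime's context and event types):
// \code
//   #include <map>
//   #include <utility>
//   #include <vector>
//
//   template <typename ContextT, typename EventT>
//   std::map<ContextT, std::vector<EventT>>
//   groupByContext(const std::vector<std::pair<ContextT, EventT>> &Events) {
//     std::map<ContextT, std::vector<EventT>> PerContext;
//     for (const auto &Entry : Events)
//       PerContext[Entry.first].push_back(Entry.second);
//     return PerContext;
//   }
// \endcode
// Each resulting wait list then goes to a single urEventWait call, so no call
// ever mixes events from different contexts.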
553 
557 Command::Command(
558  CommandType Type, QueueImplPtr Queue,
559  ur_exp_command_buffer_handle_t CommandBuffer,
560  const std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints)
561  : MQueue(std::move(Queue)),
562  MEvent(std::make_shared<detail::event_impl>(MQueue)),
563  MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
564  MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
565  MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
566  MWorkerQueue = MQueue;
567  MEvent->setWorkerQueue(MWorkerQueue);
568  MEvent->setSubmittedQueue(MWorkerQueue);
569  MEvent->setCommand(this);
570  if (MQueue)
571  MEvent->setContextImpl(MQueue->getContextImplPtr());
572  MEvent->setStateIncomplete();
574 
575 #ifdef XPTI_ENABLE_INSTRUMENTATION
576  if (!xptiTraceEnabled())
577  return;
578  // Obtain the stream ID so all commands can emit traces to that stream
579  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
580 #endif
581 }
582 
583 void Command::emitInstrumentationDataProxy() {
584 #ifdef XPTI_ENABLE_INSTRUMENTATION
585  emitInstrumentationData();
586 #endif
587 }
588 
599 void Command::emitEdgeEventForCommandDependence(
600  Command *Cmd, void *ObjAddr, bool IsCommand,
601  std::optional<access::mode> AccMode) {
602 #ifdef XPTI_ENABLE_INSTRUMENTATION
603  // Bail early if either the source or the target node for the given
604  // dependency is undefined or NULL
605  constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
606  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
607  MTraceEvent && Cmd && Cmd->MTraceEvent))
608  return;
609 
610  // If all the information we need for creating an edge event is available,
611  // then go ahead with creating it; if not, bail early!
612  xpti::utils::StringHelper SH;
613  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
614  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
615  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
616  // Create an edge with the dependent buffer address for which a command
617  // object has been created as one of the properties of the edge
618  xpti::payload_t Payload(TypeString.c_str(), MAddress);
619  uint64_t EdgeInstanceNo;
620  xpti_td *EdgeEvent =
621  xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
622  xpti_at::active, &EdgeInstanceNo);
623  if (EdgeEvent) {
624  xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
625  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
626  EdgeEvent->source_id = SrcEvent->unique_id;
627  EdgeEvent->target_id = TgtEvent->unique_id;
628  if (IsCommand) {
629  xpti::addMetadata(EdgeEvent, "access_mode",
630  static_cast<int>(AccMode.value()));
631  xpti::addMetadata(EdgeEvent, "memory_object",
632  reinterpret_cast<size_t>(ObjAddr));
633  } else {
634  xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
635  }
636  xptiNotifySubscribers(MStreamID, NotificationTraceType,
637  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
638  nullptr);
639  }
640  // General comment: none of these are serious errors, as the instrumentation
641  // layer MUST be tolerant of errors. If we ever need to let the end user know,
642  // we may throw exceptions here in the future.
643 #endif
644 }
645 
650 void Command::emitEdgeEventForEventDependence(Command *Cmd,
651  ur_event_handle_t &UrEventAddr) {
652 #ifdef XPTI_ENABLE_INSTRUMENTATION
653  // If we have failed to create an event to represent the Command, then we
654  // cannot emit an edge event. Bail early!
655  if (!(xptiCheckTraceEnabled(MStreamID) && MTraceEvent))
656  return;
657 
658  if (Cmd && Cmd->MTraceEvent) {
659  // If the event is associated with a command, we use this command's trace
660  // event as the source of edge, hence modeling the control flow
661  emitEdgeEventForCommandDependence(Cmd, (void *)UrEventAddr, false);
662  return;
663  }
664  if (UrEventAddr) {
665  xpti::utils::StringHelper SH;
666  std::string AddressStr = SH.addressAsString<ur_event_handle_t>(UrEventAddr);
667  // This is the case when it is an OCL event enqueued by the user or another
668  // event registered by the runtime as a dependency. The dependency in this
669  // case is an OCL event, so we build a virtual node in the graph
670  // with the event as the metadata for the node.
671  std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
672  // Node name is "virtual_node[<event_addr>]"
673  xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
674  uint64_t VNodeInstanceNo;
675  xpti_td *NodeEvent =
676  xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
677  xpti_at::active, &VNodeInstanceNo);
678  // Emit the virtual node first
679  xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
680  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
681  detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
682  nullptr);
683  // Create a new event for the edge
684  std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
685  xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
686  uint64_t EdgeInstanceNo;
687  xpti_td *EdgeEvent =
688  xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
689  xpti_at::active, &EdgeInstanceNo);
690  if (EdgeEvent && NodeEvent) {
691  // Source node represents the event and this event needs to be completed
692  // before target node can execute
693  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
694  EdgeEvent->source_id = NodeEvent->unique_id;
695  EdgeEvent->target_id = TgtEvent->unique_id;
696  xpti::addMetadata(EdgeEvent, "event",
697  reinterpret_cast<size_t>(UrEventAddr));
698  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
699  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
700  nullptr);
701  }
702  return;
703  }
704 #endif
705 }
706 
707 uint64_t Command::makeTraceEventProlog(void *MAddress) {
708  uint64_t CommandInstanceNo = 0;
709 #ifdef XPTI_ENABLE_INSTRUMENTATION
710  if (!xptiCheckTraceEnabled(MStreamID))
711  return CommandInstanceNo;
712 
714  // Setup the member variables with information needed for event notification
715  MCommandNodeType = commandToNodeType(MType);
716  MCommandName = commandToName(MType);
717  xpti::utils::StringHelper SH;
718  MAddressString = SH.addressAsString<void *>(MAddress);
719  std::string CommandString =
720  SH.nameWithAddressString(MCommandName, MAddressString);
721 
722  xpti::payload_t p(CommandString.c_str(), MAddress);
723  xpti_td *CmdTraceEvent =
724  xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
725  xpti_at::active, &CommandInstanceNo);
726  MInstanceID = CommandInstanceNo;
727  if (CmdTraceEvent) {
728  MTraceEvent = (void *)CmdTraceEvent;
729  // If we are seeing this event again, then the instance ID will be greater
730  // than 1; in this case, we must skip sending a notification to create a
731  // node as this node has already been created. We return this value so the
732  // epilog method can be called selectively.
733  MFirstInstance = (CommandInstanceNo == 1);
734  }
735 #endif
736  return CommandInstanceNo;
737 }
738 
739 void Command::makeTraceEventEpilog() {
740 #ifdef XPTI_ENABLE_INSTRUMENTATION
741  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
742  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) && MTraceEvent))
743  return;
745  xptiNotifySubscribers(MStreamID, NotificationTraceType,
746  detail::GSYCLGraphEvent,
747  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
748  static_cast<const void *>(MCommandNodeType.c_str()));
749 #endif
750 }
751 
752 Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
753  std::vector<Command *> &ToCleanUp) {
754  const ContextImplPtr &WorkerContext = getWorkerContext();
755 
756  // 1. Non-host events can be ignored if they are not fully initialized.
757  // 2. Some types of commands do not produce UR events after they are
758  // enqueued (e.g. alloca). Note that we can't check the ur event to make that
759  // distinction since the command might still be unenqueued at this point.
760  bool PiEventExpected =
761  (!DepEvent->isHost() && !DepEvent->isDefaultConstructed());
762  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
763  PiEventExpected &= DepCmd->producesPiEvent();
764 
765  if (!PiEventExpected) {
766  // The call to waitInternal() is done in waitForPreparedHostEvents(), which
767  // is invoked from the enqueue process functions.
768  MPreparedHostDepsEvents.push_back(DepEvent);
769  return nullptr;
770  }
771 
772  Command *ConnectionCmd = nullptr;
773 
774  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
775  // If the contexts don't match, we connect them using a host task.
776  if (DepEventContext != WorkerContext && WorkerContext) {
777  Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
778  ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
779  } else
780  MPreparedDepsEvents.push_back(std::move(DepEvent));
781 
782  return ConnectionCmd;
783 }
784 
785 ContextImplPtr Command::getWorkerContext() const {
786  if (!MQueue)
787  return nullptr;
788  return MQueue->getContextImplPtr();
789 }
790 
791 bool Command::producesPiEvent() const { return true; }
792 
793 bool Command::supportsPostEnqueueCleanup() const { return true; }
794 
795 bool Command::readyForCleanup() const {
796  return MLeafCounter == 0 &&
797  (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess);
798 }
799 
800 Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
801  Command *ConnectionCmd = nullptr;
802 
803  if (NewDep.MDepCommand) {
804  ConnectionCmd =
805  processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
806  }
807  // ConnectionCmd insertion builds the following dependency structure:
808  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep,
809  // which means that this and NewDep are already dependent.
810  if (!ConnectionCmd) {
811  MDeps.push_back(NewDep);
812  if (NewDep.MDepCommand)
813  NewDep.MDepCommand->addUser(this);
814  }
815 
816 #ifdef XPTI_ENABLE_INSTRUMENTATION
817  emitEdgeEventForCommandDependence(NewDep.MDepCommand,
818  (void *)NewDep.MDepRequirement->MSYCLMemObj,
819  true, NewDep.MDepRequirement->MAccessMode);
820 #endif
821 
822  return ConnectionCmd;
823 }
824 
825 Command *Command::addDep(EventImplPtr Event,
826  std::vector<Command *> &ToCleanUp) {
827 #ifdef XPTI_ENABLE_INSTRUMENTATION
828  // We need this for just the instrumentation, so guarding it will prevent
829  // unused variable warnings when instrumentation is turned off
830  Command *Cmd = (Command *)Event->getCommand();
831  ur_event_handle_t &UrEventAddr = Event->getHandleRef();
832  // Now make an edge for the dependent event
833  emitEdgeEventForEventDependence(Cmd, UrEventAddr);
834 #endif
835 
836  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
837  ToCleanUp);
838 }
839 
840 void Command::emitEnqueuedEventSignal(ur_event_handle_t &UrEventAddr) {
841 #ifdef XPTI_ENABLE_INSTRUMENTATION
842  emitInstrumentationGeneral(
843  MStreamID, MInstanceID, static_cast<xpti_td *>(MTraceEvent),
844  xpti::trace_signal, static_cast<const void *>(UrEventAddr));
845 #endif
846  std::ignore = UrEventAddr;
847 }
848 
849 void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
850 #ifdef XPTI_ENABLE_INSTRUMENTATION
851  return emitInstrumentationGeneral(MStreamID, MInstanceID,
852  static_cast<xpti_td *>(MTraceEvent), Type,
853  static_cast<const void *>(Txt));
854 #else
855  std::ignore = Type;
856  std::ignore = Txt;
857 #endif
858 }
859 
860 bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
861  std::vector<Command *> &ToCleanUp) {
862 #ifdef XPTI_ENABLE_INSTRUMENTATION
863  // If the command is enqueued from a host-task thread, it will not have a valid
864  // submission code location set. So we set it manually to properly trace
865  // failures if the UR level reports any.
866  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
867  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
868  AsyncCodeLocationPtr.reset(
869  new detail::tls_code_loc_t(MSubmissionCodeLocation));
870  }
871 #endif
872  // Exit if already enqueued
873  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
874  return true;
875 
876  // If the command is blocked from enqueueing
877  if (MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked) {
878  // Exit if enqueue type is not blocking
879  if (!Blocking) {
880  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
881  return false;
882  }
883 
884 #ifdef XPTI_ENABLE_INSTRUMENTATION
885  // Scoped trace event notifier that emits a barrier begin and barrier end
886  // event, which models the barrier while enqueuing along with the blocked
887  // reason, as determined by the scheduler
888  std::string Info = "enqueue.barrier[";
889  Info += std::string(getBlockReason()) + "]";
890  emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
891 #endif
892 
893  // Wait if blocking
894  while (MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked)
895  ;
896 #ifdef XPTI_ENABLE_INSTRUMENTATION
897  emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
898 #endif
899  }
900 
901  std::lock_guard<std::mutex> Lock(MEnqueueMtx);
902 
903  // Exit if the command is already enqueued
904  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
905  return true;
906 
907 #ifdef XPTI_ENABLE_INSTRUMENTATION
908  emitInstrumentation(xpti::trace_task_begin, nullptr);
909 #endif
910 
911  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) {
912  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
913  return false;
914  }
915 
916  // The command status is set to "failed" beforehand, so this command
917  // has already been marked as "failed" if enqueueImp throws an exception.
918  // This avoids executing the same failed command twice.
921  ur_result_t Res = enqueueImp();
922 
923  if (UR_RESULT_SUCCESS != Res)
924  EnqueueResult =
926  else {
927  MEvent->setEnqueued();
929  (MEvent->isHost() || MEvent->getHandleRef() == nullptr))
930  MEvent->setComplete();
931 
932  // Consider the command is successfully enqueued if return code is
933  // UR_RESULT_SUCCESS
938  assert(!MMarkedForCleanup);
939  MMarkedForCleanup = true;
940  ToCleanUp.push_back(this);
941  }
942  }
943 
944  // Emit this correlation signal before the task end
945  emitEnqueuedEventSignal(MEvent->getHandleRef());
946 #ifdef XPTI_ENABLE_INSTRUMENTATION
947  emitInstrumentation(xpti::trace_task_end, nullptr);
948 #endif
949  return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
950 }
951 
952 void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
953 #ifdef XPTI_ENABLE_INSTRUMENTATION
954  assert(MType == CommandType::RELEASE && "Expected release command");
955  if (!MTraceEvent)
956  return;
957  // The current command is the target node for all dependencies as the source
958  // nodes have to be completed first before the current node can begin to
959  // execute; these edges model control flow
960  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
961  // We have all the Commands that must be completed before the release
962  // command can be enqueued; here we'll find the command that is an Alloca
963  // with the same SYCLMemObject address and create a dependency line (edge)
964  // between them in our semantic modeling.
965  for (auto &Item : DepList) {
966  if (Item->MTraceEvent && Item->MAddress == MAddress) {
967  xpti::utils::StringHelper SH;
968  std::string AddressStr = SH.addressAsString<void *>(MAddress);
969  std::string TypeString =
970  "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
971 
972  // Create an edge with the dependent buffer address being one of the
973  // properties of the edge
974  xpti::payload_t p(TypeString.c_str(), MAddress);
975  uint64_t EdgeInstanceNo;
976  xpti_td *EdgeEvent =
977  xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
978  xpti_at::active, &EdgeInstanceNo);
979  if (EdgeEvent) {
980  xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
981  EdgeEvent->target_id = TgtTraceEvent->unique_id;
982  EdgeEvent->source_id = SrcTraceEvent->unique_id;
983  xpti::addMetadata(EdgeEvent, "memory_object",
984  reinterpret_cast<size_t>(MAddress));
985  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
986  detail::GSYCLGraphEvent, EdgeEvent,
987  EdgeInstanceNo, nullptr);
988  }
989  }
990  }
991 #endif
992 }
993 
994 const char *Command::getBlockReason() const {
995  switch (MBlockReason) {
996  case BlockReason::HostAccessor:
997  return "A Buffer is locked by the host accessor";
998  case BlockReason::HostTask:
999  return "Blocked by host task";
1000  }
1001 
1002  return "Unknown block reason";
1003 }
1004 
1005 void Command::copySubmissionCodeLocation() {
1006 #ifdef XPTI_ENABLE_INSTRUMENTATION
1007  if (!xptiTraceEnabled())
1008  return;
1009 
1010  detail::tls_code_loc_t Tls;
1011  auto TData = Tls.query();
1012  if (TData.fileName())
1013  MSubmissionFileName = TData.fileName();
1014  if (TData.functionName())
1015  MSubmissionFunctionName = TData.functionName();
1016  if (MSubmissionFileName.size() || MSubmissionFunctionName.size())
1017  MSubmissionCodeLocation = {MSubmissionFileName.c_str(),
1018  MSubmissionFunctionName.c_str(),
1019  (int)TData.lineNumber(), (int)TData.columnNumber()};
1020 #endif
1021 }
1022 
1023 AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
1024  Requirement Req,
1025  AllocaCommandBase *LinkedAllocaCmd,
1026  bool IsConst)
1027  : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
1028  MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
1029  MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
1032 }
1033 
1035 #ifdef XPTI_ENABLE_INSTRUMENTATION
1036  if (!xptiCheckTraceEnabled(MStreamID))
1037  return;
1038  // Create a payload with the command name and an event using this payload to
1039  // emit a node_create
1042  // Set the relevant meta data properties for this command
1043  if (MTraceEvent && MFirstInstance) {
1044  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1045  addDeviceMetadata(TE, MQueue);
1046  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1047  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1048  // as this data is mutable and the metadata is supposed to be invariant
1049  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1050  getQueueID(MQueue));
1051  }
1052 #endif
1053 }
1054 
1055 bool AllocaCommandBase::producesPiEvent() const { return false; }
1056 
1058 
1059 bool AllocaCommandBase::readyForCleanup() const { return false; }
1060 
1061 AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
1062  bool InitFromUserData,
1063  AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
1064  : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
1065  LinkedAllocaCmd, IsConst),
1066  MInitFromUserData(InitFromUserData) {
1067  // Node event must be created before the dependent edge is added to this
1068  // node, so this call must be before the addDep() call.
1069  emitInstrumentationDataProxy();
1070  // "Nothing to depend on"
1071  std::vector<Command *> ToCleanUp;
1072  Command *ConnectionCmd =
1073  addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
1074  assert(ConnectionCmd == nullptr);
1075  assert(ToCleanUp.empty());
1076  (void)ConnectionCmd;
1077 }
1078 
1080 #ifdef XPTI_ENABLE_INSTRUMENTATION
1081  if (!xptiCheckTraceEnabled(MStreamID))
1082  return;
1083 
1084  // Only for the first instance do we emit a node_create event.
1085  if (MFirstInstance) {
1087  }
1088 #endif
1089 }
1090 
1091 ur_result_t AllocaCommand::enqueueImp() {
1092  waitForPreparedHostEvents();
1093  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1094 
1095  ur_event_handle_t &Event = MEvent->getHandleRef();
1096 
1097  void *HostPtr = nullptr;
1098  if (!MIsLeaderAlloca) {
1099 
1100  if (!MQueue) {
1101  // Do not need to make allocation if we have a linked device allocation
1102  Command::waitForEvents(MQueue, EventImpls, Event);
1103 
1104  return UR_RESULT_SUCCESS;
1105  }
1106  HostPtr = MLinkedAllocaCmd->getMemAllocation();
1107  }
1108  // TODO: Check if it is correct to use std::move on a stack variable and
1109  // delete RawEvents below.
1111  MInitFromUserData, HostPtr,
1112  std::move(EventImpls), Event);
1113 
1114  return UR_RESULT_SUCCESS;
1115 }
1116 
1117 void AllocaCommand::printDot(std::ostream &Stream) const {
1118  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1119 
1120  Stream << "ID = " << this << "\\n";
1121  Stream << "ALLOCA ON " << queueDeviceToString(MQueue.get()) << "\\n";
1122  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1123  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
1124  Stream << "\"];" << std::endl;
1125 
1126  for (const auto &Dep : MDeps) {
1127  if (Dep.MDepCommand == nullptr)
1128  continue;
1129  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1130  << " [ label = \"Access mode: "
1131  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1132  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1133  << std::endl;
1134  }
1135 }
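// Editorial note: for a single alloca node, the printDot() method above emits
// a DOT fragment of roughly this shape (fabricated placeholder addresses):
// \code
//   "0x1f00" [style=filled, fillcolor="#FFD28A", label="ID = 0x1f00\nALLOCA ON Device\n MemObj : 0x2a00\n Link : 0\n"];
//   "0x1f00" -> "0x3b00" [ label = "Access mode: read_write\nMemObj: 0x2a00 " ]
// \endcode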
1136 
1137 AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
1138  AllocaCommandBase *ParentAlloca,
1139  std::vector<Command *> &ToEnqueue,
1140  std::vector<Command *> &ToCleanUp)
1141  : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
1142  std::move(Req),
1143  /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
1144  MParentAlloca(ParentAlloca) {
1145  // Node event must be created before the dependent edge
1146  // is added to this node, so this call must be before
1147  // the addDep() call.
1148  emitInstrumentationDataProxy();
1149  Command *ConnectionCmd = addDep(
1150  DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
1151  if (ConnectionCmd)
1152  ToEnqueue.push_back(ConnectionCmd);
1153 }
1154 
1156 #ifdef XPTI_ENABLE_INSTRUMENTATION
1157  if (!xptiCheckTraceEnabled(MStreamID))
1158  return;
1159 
1160  // Only for the first instance do we emit a node_create event and any
1161  // metadata that is available for the command.
1162  if (MFirstInstance) {
1163  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1164  xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
1165  xpti::addMetadata(TE, "access_range_start",
1166  this->MRequirement.MAccessRange[0]);
1167  xpti::addMetadata(TE, "access_range_end",
1168  this->MRequirement.MAccessRange[1]);
1169  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1170  getQueueID(MQueue));
1172  }
1173 #endif
1174 }
1175 
1177  // In some cases the parent's memory allocation might change (e.g., after
1178  // map/unmap operations). If the parent's memory allocation changes, the
1179  // sub-buffer memory allocation should be changed as well.
1180  if (!MQueue) {
1181  return static_cast<void *>(
1182  static_cast<char *>(MParentAlloca->getMemAllocation()) +
1184  }
1185  return MMemAllocation;
1186 }
1187 
1188 ur_result_t AllocaSubBufCommand::enqueueImp() {
1189  waitForPreparedHostEvents();
1190  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1191  ur_event_handle_t &Event = MEvent->getHandleRef();
1192 
1194  getContext(MQueue), MParentAlloca->getMemAllocation(),
1196  MRequirement.MAccessRange, std::move(EventImpls), Event);
1197 
1199  MMemAllocation);
1200  return UR_RESULT_SUCCESS;
1201 }
1202 
1203 void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
1204  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1205 
1206  Stream << "ID = " << this << "\\n";
1207  Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue.get()) << "\\n";
1208  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1209  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
1210  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
1211  Stream << "\"];" << std::endl;
1212 
1213  for (const auto &Dep : MDeps) {
1214  if (Dep.MDepCommand == nullptr)
1215  continue;
1216  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1217  << " [ label = \"Access mode: "
1218  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1219  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1220  << std::endl;
1221  }
1222 }
1223 
1224 ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
1225  : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
1227 }
1228 
1230 #ifdef XPTI_ENABLE_INSTRUMENTATION
1231  if (!xptiCheckTraceEnabled(MStreamID))
1232  return;
1233  // Create a payload with the command name and an event using this payload to
1234  // emit a node_create
1235  MAddress = MAllocaCmd->getSYCLMemObj();
1237 
1238  if (MFirstInstance) {
1239  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1240  addDeviceMetadata(TE, MQueue);
1241  xpti::addMetadata(TE, "allocation_type",
1242  commandToName(MAllocaCmd->getType()));
1243  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1244  // as this data is mutable and the metadata is supposed to be invariant
1245  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1246  getQueueID(MQueue));
1248  }
1249 #endif
1250 }
1251 
1252 ur_result_t ReleaseCommand::enqueueImp() {
1253  waitForPreparedHostEvents();
1254  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1255  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1256  bool SkipRelease = false;
1257 
1258  // On the host side we only allocate memory for full buffers.
1259  // Thus, deallocating sub-buffers would lead to double freeing.
1260  SkipRelease |= !MQueue && MAllocaCmd->getType() == ALLOCA_SUB_BUF;
1261 
1262  const bool CurAllocaIsHost = !MAllocaCmd->getQueue();
1263  bool NeedUnmap = false;
1264  if (MAllocaCmd->MLinkedAllocaCmd) {
1265 
1266  // When releasing one of the "linked" allocations special rules take
1267  // place:
1268  // 1. Device allocation should always be released.
1269  // 2. Host allocation should be released if host allocation is "leader".
1270  // 3. Device alloca in the pair should be in active state in order to be
1271  // correctly released.
1272 
1273  // There is no actual memory allocation if a host alloca command is
1274  // created already linked to a device allocation.
1275  SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;
1276 
1277  NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
1278  }
1279 
1280  if (NeedUnmap) {
1281  const QueueImplPtr &Queue = CurAllocaIsHost
1282  ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
1283  : MAllocaCmd->getQueue();
1284 
1285  EventImplPtr UnmapEventImpl(new event_impl(Queue));
1286  UnmapEventImpl->setContextImpl(getContext(Queue));
1287  UnmapEventImpl->setStateIncomplete();
1288  ur_event_handle_t &UnmapEvent = UnmapEventImpl->getHandleRef();
1289 
1290  void *Src = CurAllocaIsHost
1291  ? MAllocaCmd->getMemAllocation()
1292  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1293 
1294  void *Dst = !CurAllocaIsHost
1295  ? MAllocaCmd->getMemAllocation()
1296  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1297 
1298  MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
1299  RawEvents, UnmapEvent);
1300 
1301  std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
1302  EventImpls.clear();
1303  EventImpls.push_back(UnmapEventImpl);
1304  }
1305  ur_event_handle_t &Event = MEvent->getHandleRef();
1306  if (SkipRelease)
1307  Command::waitForEvents(MQueue, EventImpls, Event);
1308  else {
1309  MemoryManager::release(getContext(MQueue), MAllocaCmd->getSYCLMemObj(),
1310  MAllocaCmd->getMemAllocation(),
1311  std::move(EventImpls), Event);
1312  }
1313  return UR_RESULT_SUCCESS;
1314 }
1315 
1316 void ReleaseCommand::printDot(std::ostream &Stream) const {
1317  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1318 
1319  Stream << "ID = " << this << " ; ";
1320  Stream << "RELEASE ON " << queueDeviceToString(MQueue.get()) << "\\n";
1321  Stream << " Alloca : " << MAllocaCmd << "\\n";
1322  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1323  Stream << "\"];" << std::endl;
1324 
1325  for (const auto &Dep : MDeps) {
1326  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1327  << " [ label = \"Access mode: "
1328  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1329  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1330  << std::endl;
1331  }
1332 }
1333 
1334 bool ReleaseCommand::producesPiEvent() const { return false; }
1335 
1336 bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
1337 
1338 bool ReleaseCommand::readyForCleanup() const { return false; }
1339 
1340 MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
1341  void **DstPtr, QueueImplPtr Queue,
1342  access::mode MapMode)
1343  : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
1344  MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
1345  MMapMode(MapMode) {
1347 }
1348 
1350 #ifdef XPTI_ENABLE_INSTRUMENTATION
1351  if (!xptiCheckTraceEnabled(MStreamID))
1352  return;
1353  // Create a payload with the command name and an event using this payload to
1354  // emit a node_create
1355  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1357 
1358  if (MFirstInstance) {
1359  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1360  addDeviceMetadata(TE, MQueue);
1361  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1362  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1363  // as this data is mutable and the metadata is supposed to be invariant
1364  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1365  getQueueID(MQueue));
1367  }
1368 #endif
1369 }
1370 
1371 ur_result_t MapMemObject::enqueueImp() {
1372  waitForPreparedHostEvents();
1373  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1374  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1375  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1376 
1377  ur_event_handle_t &Event = MEvent->getHandleRef();
1378  *MDstPtr = MemoryManager::map(
1379  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
1380  MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1381  MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);
1382 
1383  return UR_RESULT_SUCCESS;
1384 }
1385 
1386 void MapMemObject::printDot(std::ostream &Stream) const {
1387  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1388 
1389  Stream << "ID = " << this << " ; ";
1390  Stream << "MAP ON " << queueDeviceToString(MQueue.get()) << "\\n";
1391 
1392  Stream << "\"];" << std::endl;
1393 
1394  for (const auto &Dep : MDeps) {
1395  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1396  << " [ label = \"Access mode: "
1397  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1398  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1399  << std::endl;
1400  }
1401 }
1402 
1403 UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req,
1404  void **SrcPtr, QueueImplPtr Queue)
1405  : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
1406  MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
1408 }
1409 
1411 #ifdef XPTI_ENABLE_INSTRUMENTATION
1412  if (!xptiCheckTraceEnabled(MStreamID))
1413  return;
1414  // Create a payload with the command name and an event using this payload to
1415  // emit a node_create
1416  MAddress = MDstAllocaCmd->getSYCLMemObj();
1418 
1419  if (MFirstInstance) {
1420  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1421  addDeviceMetadata(TE, MQueue);
1422  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1423  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1424  // as this data is mutable and the metadata is supposed to be invariant
1425  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1426  getQueueID(MQueue));
1428  }
1429 #endif
1430 }
1431 
1433  // TODO remove this workaround once the batching issue is addressed in Level
1434  // Zero plugin.
1435  // Consider the following scenario on Level Zero:
1436  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1437  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1438  // 3. queueA.wait().
1439  // 4. queueB.wait().
1440  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1441  // dependencies (i.e. they were waited for prior to enqueueing any command
1442  // that's dependent on them). This allowed Level Zero plugin to detect that
1443  // each queue is idle on steps 1/2 and submit the command list right away.
1444  // This is no longer the case since we started passing these dependencies in
1445  // an event waitlist and Level Zero plugin attempts to batch these commands,
1446  // so the execution of kernel B starts only on step 4. This workaround
1447  // restores the old behavior in this case until this is resolved.
1448  return MQueue && (MQueue->getDeviceImplPtr()->getBackend() !=
1449  backend::ext_oneapi_level_zero ||
1450  MEvent->getHandleRef() != nullptr);
1451 }
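// Editorial note: the four-step scenario in the comment above, written out at
// the user level (sketch; Dev, BufA and BufB are placeholders):
// \code
//   sycl::queue QA{Dev};
//   sycl::queue QB{Dev};
//   QA.submit([&](sycl::handler &CGH) {           // 1. kernel A, uses buffer A
//     auto A = BufA.get_access<sycl::access::mode::read_write>(CGH);
//     CGH.single_task([=] { (void)A; });
//   });
//   QB.submit([&](sycl::handler &CGH) {           // 2. kernel B, uses buffer B
//     auto B = BufB.get_access<sycl::access::mode::read_write>(CGH);
//     CGH.single_task([=] { (void)B; });
//   });
//   QA.wait();                                    // 3.
//   QB.wait();                                    // 4.
// \endcode
// Without the workaround, Level Zero batching could postpone the execution of
// kernel B until step 4.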
1452 
1453 ur_result_t UnMapMemObject::enqueueImp() {
1454  waitForPreparedHostEvents();
1455  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1456  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1457  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1458 
1459  ur_event_handle_t &Event = MEvent->getHandleRef();
1460  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
1461  MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
1462  std::move(RawEvents), Event);
1463 
1464  return UR_RESULT_SUCCESS;
1465 }
1466 
1467 void UnMapMemObject::printDot(std::ostream &Stream) const {
1468  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1469 
1470  Stream << "ID = " << this << " ; ";
1471  Stream << "UNMAP ON " << queueDeviceToString(MQueue.get()) << "\\n";
1472 
1473  Stream << "\"];" << std::endl;
1474 
1475  for (const auto &Dep : MDeps) {
1476  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1477  << " [ label = \"Access mode: "
1478  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1479  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1480  << std::endl;
1481  }
1482 }
1483 
1484 MemCpyCommand::MemCpyCommand(Requirement SrcReq,
1485  AllocaCommandBase *SrcAllocaCmd,
1486  Requirement DstReq,
1487  AllocaCommandBase *DstAllocaCmd,
1488  QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
1489  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1490  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1491  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
1492  MDstAllocaCmd(DstAllocaCmd) {
1493  if (MSrcQueue) {
1494  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1495  }
1496 
1497  MWorkerQueue = !MQueue ? MSrcQueue : MQueue;
1498  MEvent->setWorkerQueue(MWorkerQueue);
1499 
1501 }
1502 
1504 #ifdef XPTI_ENABLE_INSTRUMENTATION
1505  if (!xptiCheckTraceEnabled(MStreamID))
1506  return;
1507  // Create a payload with the command name and an event using this payload to
1508  // emit a node_create
1509  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1511 
1512  if (MFirstInstance) {
1513  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1514  addDeviceMetadata(CmdTraceEvent, MQueue);
1515  xpti::addMetadata(CmdTraceEvent, "memory_object",
1516  reinterpret_cast<size_t>(MAddress));
1517  xpti::addMetadata(CmdTraceEvent, "copy_from",
1518  MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1519  xpti::addMetadata(CmdTraceEvent, "copy_to",
1520  MQueue ? deviceToID(MQueue->get_device()) : 0);
1521  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1522  // as this data is mutable and the metadata is supposed to be invariant
1523  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1524  getQueueID(MQueue));
1526  }
1527 #endif
1528 }
1529 
1531  if (!MWorkerQueue)
1532  return nullptr;
1533  return MWorkerQueue->getContextImplPtr();
1534 }
1535 
1537  // TODO remove this workaround once the batching issue is addressed in Level
1538  // Zero plugin.
1539  // Consider the following scenario on Level Zero:
1540  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1541  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1542  // 3. queueA.wait().
1543  // 4. queueB.wait().
1544  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1545  // dependencies (i.e. they were waited for prior to enqueueing any command
1546  // that's dependent on them). This allowed Level Zero plugin to detect that
1547  // each queue is idle on steps 1/2 and submit the command list right away.
1548  // This is no longer the case since we started passing these dependencies in
1549  // an event waitlist and Level Zero plugin attempts to batch these commands,
1550  // so the execution of kernel B starts only on step 4. This workaround
1551  // restores the old behavior in this case until this is resolved.
1552  return !MQueue ||
1553  MQueue->getDeviceImplPtr()->getBackend() !=
1554  backend::ext_oneapi_level_zero ||
1555  MEvent->getHandleRef() != nullptr;
1556 }
1557 
1558 ur_result_t MemCpyCommand::enqueueImp() {
1559  waitForPreparedHostEvents();
1560  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1561 
1562  ur_event_handle_t &Event = MEvent->getHandleRef();
1563 
1564  auto RawEvents = getUrEvents(EventImpls);
1565  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1566 
1567  MemoryManager::copy(
1568  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1569  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1570  MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
1571  MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
1572  MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event, MEvent);
1573 
1574  return UR_RESULT_SUCCESS;
1575 }
1576 
1577 void MemCpyCommand::printDot(std::ostream &Stream) const {
1578  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1579 
1580  Stream << "ID = " << this << " ; ";
1581  Stream << "MEMCPY ON " << queueDeviceToString(MQueue.get()) << "\\n";
1582  Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n";
1583  Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n";
1584 
1585  Stream << "\"];" << std::endl;
1586 
1587  for (const auto &Dep : MDeps) {
1588  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1589  << " [ label = \"Access mode: "
1590  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1591  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1592  << std::endl;
1593  }
1594 }
1595 
1596 AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
1597  for (const DepDesc &Dep : MDeps) {
1598  if (Dep.MDepRequirement == Req)
1599  return Dep.MAllocaCmd;
1600  }
1601  // Default-constructed accessors do not add dependencies, but they can be
1602  // passed to commands. Simply return nullptr, since they are empty and don't
1603  // really require any memory.
1604  return nullptr;
1605 }
1606 
1607 std::vector<std::shared_ptr<const void>>
1609  if (MCommandGroup->getType() == CGType::Kernel)
1610  return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1611  return {};
1612 }
1613 
1615  if (MCommandGroup->getType() == CGType::Kernel)
1616  ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
1617 }
1618 
1619 ur_result_t UpdateHostRequirementCommand::enqueueImp() {
1621  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1622  ur_event_handle_t &Event = MEvent->getHandleRef();
1623  Command::waitForEvents(MQueue, EventImpls, Event);
1624 
1625  assert(MSrcAllocaCmd && "Expected valid alloca command");
1626  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1627  assert(MDstPtr && "Expected valid target pointer");
1628  *MDstPtr = MSrcAllocaCmd->getMemAllocation();
1629 
1630  return UR_RESULT_SUCCESS;
1631 }
1632 
1633 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1634  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1635 
1636  Stream << "ID = " << this << "\\n";
1637  Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue.get()) << "\\n";
1638  bool IsReqOnBuffer =
1639  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1640  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1641  if (IsReqOnBuffer)
1642  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1643  << "\\n";
1644 
1645  Stream << "\"];" << std::endl;
1646 
1647  for (const auto &Dep : MDeps) {
1648  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1649  << " [ label = \"Access mode: "
1650  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1651  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1652  << std::endl;
1653  }
1654 }
1655 
1657  AllocaCommandBase *SrcAllocaCmd,
1658  Requirement DstReq, void **DstPtr,
1659  QueueImplPtr SrcQueue,
1660  QueueImplPtr DstQueue)
1661  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1662  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1663  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1664  if (MSrcQueue) {
1665  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1666  }
1667 
1668  MWorkerQueue = !MQueue ? MSrcQueue : MQueue;
1669  MEvent->setWorkerQueue(MWorkerQueue);
1670 
1672 }
1673 
1675 #ifdef XPTI_ENABLE_INSTRUMENTATION
1676  if (!xptiCheckTraceEnabled(MStreamID))
1677  return;
1678  // Create a payload with the command name and an event using this payload to
1679  // emit a node_create
1680  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1682 
1683  if (MFirstInstance) {
1684  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1685  addDeviceMetadata(CmdTraceEvent, MQueue);
1686  xpti::addMetadata(CmdTraceEvent, "memory_object",
1687  reinterpret_cast<size_t>(MAddress));
1688  xpti::addMetadata(CmdTraceEvent, "copy_from",
1689  MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1690  xpti::addMetadata(CmdTraceEvent, "copy_to",
1691  MQueue ? deviceToID(MQueue->get_device()) : 0);
1692  // Since we do NOT add the queue_id value to the metadata, we stash it in TLS,
1693  // as this data is mutable and the metadata is supposed to be invariant.
1694  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1695  getQueueID(MQueue));
1697  }
1698 #endif
1699 }
1700 
1702  if (!MWorkerQueue)
1703  return nullptr;
1704  return MWorkerQueue->getContextImplPtr();
1705 }
1706 
1707 ur_result_t MemCpyCommandHost::enqueueImp() {
1708  const QueueImplPtr &Queue = MWorkerQueue;
1710  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1711  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1712 
1713  ur_event_handle_t &Event = MEvent->getHandleRef();
1714  // Omit copying if the access mode is a discard one.
1715  // TODO: Handle this at graph building time by, for example, creating an
1716  // empty node instead of a memcpy.
1719  Command::waitForEvents(Queue, EventImpls, Event);
1720 
1721  return UR_RESULT_SUCCESS;
1722  }
1723 
1724  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1725  MemoryManager::copy(
1726  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1727  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1728  MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
1729  MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
1730  MDstReq.MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
1731  return UR_RESULT_SUCCESS;
1732 }
1733 
1736 }
1737 
1738 ur_result_t EmptyCommand::enqueueImp() {
1740  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
1741 
1742  return UR_RESULT_SUCCESS;
1743 }
1744 
1746  const Requirement *Req) {
1747  const Requirement &ReqRef = *Req;
1748  MRequirements.emplace_back(ReqRef);
1749  const Requirement *const StoredReq = &MRequirements.back();
1750 
1751  // EmptyCommand is always a host command, so we expect the result of addDep
1752  // to be null.
1753  std::vector<Command *> ToCleanUp;
1754  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1755  assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
1756  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1757  "there's a connection command");
1758  (void)Cmd;
1759 }
1760 
1762 #ifdef XPTI_ENABLE_INSTRUMENTATION
1763  if (!xptiCheckTraceEnabled(MStreamID))
1764  return;
1765  // Create a payload with the command name and an event using this payload to
1766  // emit a node_create
1767  if (MRequirements.empty())
1768  return;
1769 
1770  Requirement &Req = *MRequirements.begin();
1771 
1772  MAddress = Req.MSYCLMemObj;
1774 
1775  if (MFirstInstance) {
1776  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1777  addDeviceMetadata(CmdTraceEvent, MQueue);
1778  xpti::addMetadata(CmdTraceEvent, "memory_object",
1779  reinterpret_cast<size_t>(MAddress));
1780  // Since we do NOT add the queue_id value to the metadata, we stash it in TLS,
1781  // as this data is mutable and the metadata is supposed to be invariant.
1782  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1783  getQueueID(MQueue));
1785  }
1786 #endif
1787 }
1788 
1789 void EmptyCommand::printDot(std::ostream &Stream) const {
1790  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1791 
1792  Stream << "ID = " << this << "\\n";
1793  Stream << "EMPTY NODE"
1794  << "\\n";
1795 
1796  Stream << "\"];" << std::endl;
1797 
1798  for (const auto &Dep : MDeps) {
1799  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1800  << " [ label = \"Access mode: "
1801  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1802  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1803  << std::endl;
1804  }
1805 }
1806 
1807 bool EmptyCommand::producesPiEvent() const { return false; }
1808 
1809 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1810  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1811 
1812  Stream << "ID = " << this << "\\n";
1813  Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue.get()) << "\\n";
1814 
1815  Stream << "\"];" << std::endl;
1816 
1817  for (const auto &Dep : MDeps) {
1818  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1819  << " [ label = \"Access mode: "
1820  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1821  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1822  << std::endl;
1823  }
1824 }
1825 
1827  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1828  void **DstPtr)
1829  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1830  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1831 
1833 }
1834 
1836 #ifdef XPTI_ENABLE_INSTRUMENTATION
1837  if (!xptiCheckTraceEnabled(MStreamID))
1838  return;
1839  // Create a payload with the command name and an event using this payload to
1840  // emit a node_create
1841  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1843 
1844  if (MFirstInstance) {
1845  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1846  addDeviceMetadata(CmdTraceEvent, MQueue);
1847  xpti::addMetadata(CmdTraceEvent, "memory_object",
1848  reinterpret_cast<size_t>(MAddress));
1849  // Since we do NOT add the queue_id value to the metadata, we stash it in TLS,
1850  // as this data is mutable and the metadata is supposed to be invariant.
1851  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1852  getQueueID(MQueue));
1854  }
1855 #endif
1856 }
1857 
1858 static std::string_view cgTypeToString(detail::CGType Type) {
1859  switch (Type) {
1861  return "Kernel";
1862  break;
1864  return "update_host";
1865  break;
1866  case detail::CGType::Fill:
1867  return "fill";
1868  break;
1870  return "copy acc to acc";
1871  break;
1873  return "copy acc to ptr";
1874  break;
1876  return "copy ptr to acc";
1877  break;
1879  return "barrier";
1881  return "barrier waitlist";
1883  return "copy usm";
1884  break;
1886  return "fill usm";
1887  break;
1889  return "prefetch usm";
1890  break;
1892  return "host task";
1893  break;
1895  return "copy 2d usm";
1896  break;
1898  return "fill 2d usm";
1899  break;
1901  return "advise usm";
1903  return "memset 2d usm";
1904  break;
1906  return "copy to device_global";
1907  break;
1909  return "copy from device_global";
1910  break;
1912  return "read_write host pipe";
1914  return "exec command buffer";
1916  return "copy image";
1918  return "semaphore wait";
1920  return "semaphore signal";
1921  default:
1922  return "unknown";
1923  break;
1924  }
1925 }
1926 
1928  std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
1929  bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer,
1930  const std::vector<ur_exp_command_buffer_sync_point_t> &Dependencies)
1931  : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer,
1932  Dependencies),
1933  MEventNeeded(EventNeeded), MCommandGroup(std::move(CommandGroup)) {
1934  if (MCommandGroup->getType() == detail::CGType::CodeplayHostTask) {
1935  MEvent->setSubmittedQueue(
1936  static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
1937  }
1938  if (MCommandGroup->getType() == detail::CGType::ProfilingTag)
1939  MEvent->markAsProfilingTagEvent();
1940 
1942 }
1943 
1944 #ifdef XPTI_ENABLE_INSTRUMENTATION
1945 std::string instrumentationGetKernelName(
1946  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1947  const std::string &FunctionName, const std::string &SyclKernelName,
1948  void *&Address, std::optional<bool> &FromSource) {
1949  std::string KernelName;
1950  if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1951  FromSource = true;
1952  ur_kernel_handle_t KernelHandle = SyclKernel->getHandleRef();
1953  Address = KernelHandle;
1954  KernelName = FunctionName;
1955  } else {
1956  FromSource = false;
1957  KernelName = demangleKernelName(SyclKernelName);
1958  }
1959  return KernelName;
1960 }
1961 
1962 void instrumentationAddExtraKernelMetadata(
1963  xpti_td *&CmdTraceEvent, const NDRDescT &NDRDesc,
1964  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
1965  const std::string &KernelName,
1966  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1967  const QueueImplPtr &Queue,
1968  std::vector<ArgDesc> &CGArgs) // CGArgs are not const since they could be
1969  // sorted in this function
1970 {
1971  std::vector<ArgDesc> Args;
1972 
1973  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1974  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1975  };
1976  ur_program_handle_t Program = nullptr;
1977  ur_kernel_handle_t Kernel = nullptr;
1978  std::mutex *KernelMutex = nullptr;
1979  const KernelArgMask *EliminatedArgMask = nullptr;
1980 
1981  std::shared_ptr<kernel_impl> SyclKernelImpl;
1982  std::shared_ptr<device_image_impl> DeviceImageImpl;
1983 
1984  // Use kernel_bundle if available unless it is interop.
1985  // Interop bundles can't be used in the first branch, because the
1986  // kernels in interop kernel bundles (if any) do not have kernel_id and
1987  // can therefore not be looked up, but since they are self-contained
1988  // they can simply be launched directly.
1989  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1990  kernel_id KernelID =
1992  kernel SyclKernel =
1993  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1994  std::shared_ptr<kernel_impl> KernelImpl =
1995  detail::getSyclObjImpl(SyclKernel);
1996 
1997  EliminatedArgMask = KernelImpl->getKernelArgMask();
1998  Program = KernelImpl->getDeviceImage()->get_ur_program_ref();
1999  } else if (nullptr != SyclKernel) {
2000  Program = SyclKernel->getProgramRef();
2001  if (!SyclKernel->isCreatedFromSource())
2002  EliminatedArgMask = SyclKernel->getKernelArgMask();
2003  } else {
2004  assert(Queue && "Kernel submissions should have an associated queue");
2005  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2007  Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
2008  }
2009 
2010  applyFuncOnFilteredArgs(EliminatedArgMask, CGArgs, FilterArgs);
2011 
2012  xpti::offload_kernel_enqueue_data_t KernelData{
2013  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
2014  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
2015  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
2016  NDRDesc.GlobalOffset[2]},
2017  Args.size()};
2018  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
2019  for (size_t i = 0; i < Args.size(); i++) {
2020  std::string Prefix("arg");
2021  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
2022  Args[i].MSize, Args[i].MIndex};
2023  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
2024  }
2025 }
2026 
2027 void instrumentationFillCommonData(const std::string &KernelName,
2028  const std::string &FileName, uint64_t Line,
2029  uint64_t Column, const void *const Address,
2030  const QueueImplPtr &Queue,
2031  std::optional<bool> &FromSource,
2032  uint64_t &OutInstanceID,
2033  xpti_td *&OutTraceEvent) {
2034  // Get source file, line number information from the CommandGroup object
2035  // and create payload using name, address, and source info
2036  //
2037  // On Windows, since MSVC does not support the relevant builtin functions,
2038  // MFileName and MLine will be set to nullptr and "0" respectively.
2039  // Handle this condition explicitly here.
2040  bool HasSourceInfo = false;
2041  xpti::payload_t Payload;
2042  if (!FileName.empty()) {
2043  // File name has a valid string
2044  Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2045  Column, Address);
2046  HasSourceInfo = true;
2047  } else if (Address) {
2048  // We have a valid function name and an address
2049  Payload = xpti::payload_t(KernelName.c_str(), Address);
2050  } else {
2051  // In any case, we will have a valid function name and we'll use that to
2052  // create the hash
2053  Payload = xpti::payload_t(KernelName.c_str());
2054  }
2055  uint64_t CGKernelInstanceNo;
2056  // Create event using the payload
2057  xpti_td *CmdTraceEvent =
2058  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
2059  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2060  if (CmdTraceEvent) {
2061  OutInstanceID = CGKernelInstanceNo;
2062  OutTraceEvent = CmdTraceEvent;
2063  // If we are seeing this event again, then the instance ID will be greater
2064  // than 1; in this case, we will skip sending a notification to create a
2065  // node as this node has already been created.
2066  if (CGKernelInstanceNo > 1)
2067  return;
2068 
2069  addDeviceMetadata(CmdTraceEvent, Queue);
2070  if (!KernelName.empty()) {
2071  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
2072  }
2073  if (FromSource.has_value()) {
2074  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
2075  }
2076  if (HasSourceInfo) {
2077  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
2078  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName);
2079  xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast<int>(Line));
2080  xpti::addMetadata(CmdTraceEvent, "sym_column_no",
2081  static_cast<int>(Column));
2082  }
2083  // We no longer set the 'queue_id' in the metadata structure, as it is a
2084  // mutable value and multiple threads using the same queue created at the
2085  // same location would overwrite the metadata values, creating inconsistencies.
2086  }
2087 }
2088 #endif
2089 
2090 #ifdef XPTI_ENABLE_INSTRUMENTATION
2091 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2092  int32_t StreamID, const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2093  const detail::code_location &CodeLoc, const std::string &SyclKernelName,
2094  const QueueImplPtr &Queue, const NDRDescT &NDRDesc,
2095  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2096  std::vector<ArgDesc> &CGArgs) {
2097 
2098  auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(nullptr, -1);
2099  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2100  if (!xptiCheckTraceEnabled(StreamID))
2101  return XptiObjects;
2102 
2103  void *Address = nullptr;
2104  std::optional<bool> FromSource;
2105  std::string KernelName = instrumentationGetKernelName(
2106  SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2107  FromSource);
2108 
2109  auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2110 
2111  std::string FileName =
2112  CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2113  instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2114  CodeLoc.columnNumber(), Address, Queue,
2115  FromSource, InstanceID, CmdTraceEvent);
2116 
2117  if (CmdTraceEvent) {
2118  // Stash the queue_id mutable metadata in TLS
2119  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue));
2120  instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2121  KernelBundleImplPtr, SyclKernelName,
2122  SyclKernel, Queue, CGArgs);
2123 
2124  xptiNotifySubscribers(
2125  StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2126  InstanceID,
2127  static_cast<const void *>(
2128  commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2129  }
2130 
2131  return XptiObjects;
2132 }
2133 #endif
2134 
2136 #ifdef XPTI_ENABLE_INSTRUMENTATION
2137  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2138  if (!xptiCheckTraceEnabled(MStreamID))
2139  return;
2140 
2141  std::string KernelName;
2142  std::optional<bool> FromSource;
2143  switch (MCommandGroup->getType()) {
2144  case detail::CGType::Kernel: {
2145  auto KernelCG =
2146  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2147  KernelName = instrumentationGetKernelName(
2148  KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2149  KernelCG->getKernelName(), MAddress, FromSource);
2150  } break;
2151  default:
2152  KernelName = getTypeString();
2153  break;
2154  }
2155 
2156  xpti_td *CmdTraceEvent = nullptr;
2157  instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2158  MCommandGroup->MLine, MCommandGroup->MColumn,
2159  MAddress, MQueue, FromSource, MInstanceID,
2160  CmdTraceEvent);
2161 
2162  if (CmdTraceEvent) {
2163  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2164  getQueueID(MQueue));
2165  MTraceEvent = static_cast<void *>(CmdTraceEvent);
2166  if (MCommandGroup->getType() == detail::CGType::Kernel) {
2167  auto KernelCG =
2168  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2169  instrumentationAddExtraKernelMetadata(
2170  CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2171  KernelCG->MKernelName, KernelCG->MSyclKernel, MQueue,
2172  KernelCG->MArgs);
2173  }
2174 
2175  xptiNotifySubscribers(
2176  MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2177  CmdTraceEvent, MInstanceID,
2178  static_cast<const void *>(commandToNodeType(MType).c_str()));
2179  }
2180 #endif
2181 }
2182 
2183 void ExecCGCommand::printDot(std::ostream &Stream) const {
2184  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2185 
2186  Stream << "ID = " << this << "\\n";
2187  Stream << "EXEC CG ON " << queueDeviceToString(MQueue.get()) << "\\n";
2188 
2189  switch (MCommandGroup->getType()) {
2190  case detail::CGType::Kernel: {
2191  auto KernelCG =
2192  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2193  Stream << "Kernel name: ";
2194  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2195  Stream << "created from source";
2196  else
2197  Stream << demangleKernelName(KernelCG->getKernelName());
2198  Stream << "\\n";
2199  break;
2200  }
2201  default:
2202  Stream << "CG type: " << getTypeString() << "\\n";
2203  break;
2204  }
2205 
2206  Stream << "\"];" << std::endl;
2207 
2208  for (const auto &Dep : MDeps) {
2209  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2210  << " [ label = \"Access mode: "
2211  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2212  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2213  << std::endl;
2214  }
2215 }
2216 
2217 std::string_view ExecCGCommand::getTypeString() const {
2218  return cgTypeToString(MCommandGroup->getType());
2219 }
2220 
2221 // SYCL has a parallel_for_work_group variant where the only NDRange
2222 // characteristic set by the user is the number of work-groups. This does not
2223 // map to the OpenCL clEnqueueNDRangeKernel API, which requires the global
2224 // work size to be set as well. This function determines the local work size
2225 // based on the device characteristics and the number of work-groups requested
2226 // by the user, then calculates the global work size. SYCL specification
2227 // (from 4.8.5.3): The member function handler::parallel_for_work_group is
2228 // parameterized by the number of work-groups, such that the size of each
2229 // group is chosen by the runtime, or by the number of work-groups and the
2230 // number of work-items for users who need more control.
2231 static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel,
2232  const device_impl &DeviceImpl) {
2233  if (NDR.GlobalSize[0] != 0)
2234  return; // GlobalSize is set - no need to adjust
2235  // check the prerequisites:
2236  assert(NDR.LocalSize[0] == 0);
2237  // TODO might be good to cache this info together with the kernel info to
2238  // avoid get_kernel_work_group_info on every kernel run
2240  sycl::info::kernel_device_specific::compile_work_group_size>(
2241  Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
2242 
2243  if (WGSize[0] == 0) {
2244  WGSize = {1, 1, 1};
2245  }
2246  NDR = sycl::detail::NDRDescT{nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize),
2247  static_cast<int>(NDR.Dims)};
2248 }
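// A short worked example of the adjustment above, assuming a hypothetical
// kernel whose compiled work-group size is {64, 1, 1} (the values are
// illustrative only):
//
//   // The user requested 8 work-groups in 1D with no explicit sizes:
//   //   NDR.NumWorkGroups == {8, 1, 1}, NDR.GlobalSize[0] == 0.
//   // The query returns WGSize == {64, 1, 1}, so the ND-range becomes
//   //   nd_range<3>{NDR.NumWorkGroups * WGSize, WGSize}
//   //   == nd_range<3>{{512, 1, 1}, {64, 1, 1}}, with Dims kept at 1.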
2249 
2250 // We have the following mapping between dimensions with SPIR-V builtins:
2251 // 1D: id[0] -> x
2252 // 2D: id[0] -> y, id[1] -> x
2253 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
2254 // So, in order to ensure correctness, we update all the kernel
2255 // parameters accordingly.
2256 // Initially we keep the order of NDRDescT as provided by the user; this
2257 // simplifies overall handling, and we do the reversal only when
2258 // the kernel is enqueued.
2260  if (NDR.Dims > 1) {
2261  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
2262  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
2263  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
2264  }
2265 }
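// A worked example of the reversal above for a 3D range (values are
// illustrative only):
//
//   // User-provided order: GlobalSize == {2, 4, 8}, LocalSize == {1, 2, 4}.
//   // After swapping elements 0 and Dims - 1:
//   //   GlobalSize == {8, 4, 2}, LocalSize == {4, 2, 1},
//   // so element 0 now corresponds to the SPIR-V x dimension, matching the
//   // builtin mapping listed before the function.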
2266 
2267 ur_mem_flags_t AccessModeToUr(access::mode AccessorMode) {
2268  switch (AccessorMode) {
2269  case access::mode::read:
2270  return UR_MEM_FLAG_READ_ONLY;
2271  case access::mode::write:
2273  return UR_MEM_FLAG_WRITE_ONLY;
2274  default:
2275  return UR_MEM_FLAG_READ_WRITE;
2276  }
2277 }
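// Examples of the mapping implemented above; any access mode without an
// explicit case falls back to UR_MEM_FLAG_READ_WRITE:
//
//   AccessModeToUr(access::mode::read)       == UR_MEM_FLAG_READ_ONLY
//   AccessModeToUr(access::mode::write)      == UR_MEM_FLAG_WRITE_ONLY
//   AccessModeToUr(access::mode::read_write) == UR_MEM_FLAG_READ_WRITE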
2278 
2280  const PluginPtr &Plugin, ur_kernel_handle_t Kernel,
2281  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2282  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2283  const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) {
2284  switch (Arg.MType) {
2286  break;
2288  Requirement *Req = (Requirement *)(Arg.MPtr);
2289 
2290  // getMemAllocationFunc is nullptr when there are no requirements. However,
2291  // we may pass default-constructed accessors to a command, which don't add
2292  // requirements. In such a case, getMemAllocationFunc is nullptr, but it's a
2293  // valid case, so we need to handle it properly.
2294  ur_mem_handle_t MemArg =
2295  getMemAllocationFunc
2296  ? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
2297  : nullptr;
2298  ur_kernel_arg_mem_obj_properties_t MemObjData{};
2299  MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2300  MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
2301  Plugin->call(urKernelSetArgMemObj, Kernel, NextTrueIndex, &MemObjData,
2302  MemArg);
2303  break;
2304  }
2306  if (Arg.MPtr) {
2307  Plugin->call(urKernelSetArgValue, Kernel, NextTrueIndex, Arg.MSize,
2308  nullptr, Arg.MPtr);
2309  } else {
2310  Plugin->call(urKernelSetArgLocal, Kernel, NextTrueIndex, Arg.MSize,
2311  nullptr);
2312  }
2313 
2314  break;
2315  }
2317  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2318  ur_sampler_handle_t Sampler =
2319  (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
2320  ->getOrCreateSampler(Context);
2321  Plugin->call(urKernelSetArgSampler, Kernel, NextTrueIndex, nullptr,
2322  Sampler);
2323  break;
2324  }
2326  // We need to dereference this to get the actual USM allocation - that's the
2327  // pointer UR is expecting.
2328  const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
2329  Plugin->call(urKernelSetArgPointer, Kernel, NextTrueIndex, nullptr, Ptr);
2330  break;
2331  }
2333  assert(DeviceImageImpl != nullptr);
2334  ur_mem_handle_t SpecConstsBuffer =
2335  DeviceImageImpl->get_spec_const_buffer_ref();
2336 
2337  ur_kernel_arg_mem_obj_properties_t MemObjProps{};
2338  MemObjProps.pNext = nullptr;
2339  MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2340  MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
2341  Plugin->call(urKernelSetArgMemObj, Kernel, NextTrueIndex, &MemObjProps,
2342  SpecConstsBuffer);
2343  break;
2344  }
2346  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2347  "Invalid kernel param kind " +
2348  codeToString(UR_RESULT_ERROR_INVALID_VALUE));
2349  break;
2350  }
2351 }
2352 
2353 static ur_result_t SetKernelParamsAndLaunch(
2354  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
2355  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2356  ur_kernel_handle_t Kernel, NDRDescT &NDRDesc,
2357  std::vector<ur_event_handle_t> &RawEvents,
2358  const detail::EventImplPtr &OutEventImpl,
2359  const KernelArgMask *EliminatedArgMask,
2360  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2361  bool IsCooperative, bool KernelUsesClusterLaunch,
2362  const RTDeviceBinaryImage *BinImage, const std::string &KernelName) {
2363  assert(Queue && "Kernel submissions should have an associated queue");
2364  const PluginPtr &Plugin = Queue->getPlugin();
2365 
2367  std::vector<unsigned char> Empty;
2369  Queue, BinImage, KernelName,
2370  DeviceImageImpl.get() ? DeviceImageImpl->get_spec_const_blob_ref()
2371  : Empty);
2372  }
2373 
2374  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2375  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
2376  SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
2377  Queue->get_context(), Arg, NextTrueIndex);
2378  };
2379 
2380  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2381 
2382  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2383 
2384  // Remember this information before the range dimensions are reversed
2385  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2386 
2388 
2389  size_t RequiredWGSize[3] = {0, 0, 0};
2390  size_t *LocalSize = nullptr;
2391 
2392  if (HasLocalSize)
2393  LocalSize = &NDRDesc.LocalSize[0];
2394  else {
2395  Plugin->call(urKernelGetGroupInfo, Kernel,
2396  Queue->getDeviceImplPtr()->getHandleRef(),
2397  UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
2398  sizeof(RequiredWGSize), RequiredWGSize,
2399  /* pPropSizeRet = */ nullptr);
2400 
2401  const bool EnforcedLocalSize =
2402  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2403  RequiredWGSize[2] != 0);
2404  if (EnforcedLocalSize)
2405  LocalSize = RequiredWGSize;
2406  }
2407  if (OutEventImpl != nullptr)
2408  OutEventImpl->setHostEnqueueTime();
2409  if (KernelUsesClusterLaunch) {
2410  std::vector<ur_exp_launch_property_t> property_list;
2411 
2412  ur_exp_launch_property_value_t launch_property_value_cluster_range;
2413  launch_property_value_cluster_range.clusterDim[0] =
2414  NDRDesc.ClusterDimensions[0];
2415  launch_property_value_cluster_range.clusterDim[1] =
2416  NDRDesc.ClusterDimensions[1];
2417  launch_property_value_cluster_range.clusterDim[2] =
2418  NDRDesc.ClusterDimensions[2];
2419 
2420  property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
2421  launch_property_value_cluster_range});
2422 
2423  if (IsCooperative) {
2424  ur_exp_launch_property_value_t launch_property_value_cooperative;
2425  launch_property_value_cooperative.cooperative = 1;
2426  property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE,
2427  launch_property_value_cooperative});
2428  }
2429 
2430  return Plugin->call_nocheck(
2431  urEnqueueKernelLaunchCustomExp, Queue->getHandleRef(), Kernel,
2432  NDRDesc.Dims, &NDRDesc.GlobalSize[0], LocalSize, property_list.size(),
2433  property_list.data(), RawEvents.size(),
2434  RawEvents.empty() ? nullptr : &RawEvents[0],
2435  OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
2436  }
2437  ur_result_t Error =
2438  [&](auto... Args) {
2439  if (IsCooperative) {
2440  return Plugin->call_nocheck(urEnqueueCooperativeKernelLaunchExp,
2441  Args...);
2442  }
2443  return Plugin->call_nocheck(urEnqueueKernelLaunch, Args...);
2444  }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2445  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2446  RawEvents.empty() ? nullptr : &RawEvents[0],
2447  OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
2448  return Error;
2449 }
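// A short worked example of the local-size selection above (the attribute and
// the sizes are illustrative only):
//
//   // Case 1: the user supplied a local size, e.g. nd_range{{1024}, {128}}:
//   //   HasLocalSize is true and LocalSize points at NDRDesc.LocalSize.
//   // Case 2: no local size was supplied, but the kernel was compiled with
//   //   reqd_work_group_size(32, 1, 1): the COMPILE_WORK_GROUP_SIZE query
//   //   returns {32, 1, 1}, EnforcedLocalSize is true, and LocalSize points
//   //   at RequiredWGSize.
//   // Case 3: neither is present: LocalSize stays nullptr and the backend
//   //   picks the work-group size at launch.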
2450 
2452  context Ctx, DeviceImplPtr DeviceImpl,
2453  ur_exp_command_buffer_handle_t CommandBuffer,
2454  const CGExecKernel &CommandGroup,
2455  std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints,
2456  ur_exp_command_buffer_sync_point_t *OutSyncPoint,
2457  ur_exp_command_buffer_command_handle_t *OutCommand,
2458  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2459  auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
2460  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
2461  ur_kernel_handle_t UrKernel = nullptr;
2462  ur_program_handle_t UrProgram = nullptr;
2463  std::shared_ptr<kernel_impl> SyclKernelImpl = nullptr;
2464  std::shared_ptr<device_image_impl> DeviceImageImpl = nullptr;
2465 
2466  auto Kernel = CommandGroup.MSyclKernel;
2467  auto KernelBundleImplPtr = CommandGroup.MKernelBundle;
2468  const KernelArgMask *EliminatedArgMask = nullptr;
2469 
2470  // Use kernel_bundle if available unless it is interop.
2471  // Interop bundles can't be used in the first branch, because the kernels
2472  // in interop kernel bundles (if any) do not have kernel_id
2473  // and can therefore not be looked up, but since they are self-contained
2474  // they can simply be launched directly.
2475  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2476  auto KernelName = CommandGroup.MKernelName;
2477  kernel_id KernelID =
2479  kernel SyclKernel =
2480  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2481  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2482  UrKernel = SyclKernelImpl->getHandleRef();
2483  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2484  UrProgram = DeviceImageImpl->get_ur_program_ref();
2485  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2486  } else if (Kernel != nullptr) {
2487  UrKernel = Kernel->getHandleRef();
2488  UrProgram = Kernel->getProgramRef();
2489  EliminatedArgMask = Kernel->getKernelArgMask();
2490  } else {
2491  std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =
2492  sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2493  ContextImpl, DeviceImpl, CommandGroup.MKernelName);
2494  }
2495 
2496  auto SetFunc = [&Plugin, &UrKernel, &DeviceImageImpl, &Ctx,
2497  &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2498  size_t NextTrueIndex) {
2499  sycl::detail::SetArgBasedOnType(Plugin, UrKernel, DeviceImageImpl,
2500  getMemAllocationFunc, Ctx, Arg,
2501  NextTrueIndex);
2502  };
2503  // Copy args for modification
2504  auto Args = CommandGroup.MArgs;
2505  sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc);
2506 
2507  // Remember this information before the range dimensions are reversed
2508  const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0);
2509 
2510  // Copy NDRDesc for modification
2511  auto NDRDesc = CommandGroup.MNDRDesc;
2512  // Reverse kernel dims
2514 
2515  size_t RequiredWGSize[3] = {0, 0, 0};
2516  size_t *LocalSize = nullptr;
2517 
2518  if (HasLocalSize)
2519  LocalSize = &NDRDesc.LocalSize[0];
2520  else {
2521  Plugin->call(urKernelGetGroupInfo, UrKernel, DeviceImpl->getHandleRef(),
2522  UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
2523  sizeof(RequiredWGSize), RequiredWGSize,
2524  /* pPropSizeRet = */ nullptr);
2525 
2526  const bool EnforcedLocalSize =
2527  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2528  RequiredWGSize[2] != 0);
2529  if (EnforcedLocalSize)
2530  LocalSize = RequiredWGSize;
2531  }
2532 
2533  ur_result_t Res = Plugin->call_nocheck(
2534  urCommandBufferAppendKernelLaunchExp, CommandBuffer, UrKernel,
2535  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize,
2536  SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr,
2537  OutSyncPoint, OutCommand);
2538 
2539  if (!SyclKernelImpl && !Kernel) {
2540  Plugin->call(urKernelRelease, UrKernel);
2541  Plugin->call(urProgramRelease, UrProgram);
2542  }
2543 
2544  if (Res != UR_RESULT_SUCCESS) {
2545  const device_impl &DeviceImplem = *(DeviceImpl);
2547  UrKernel, NDRDesc);
2548  }
2549 
2550  return Res;
2551 }
2552 
2554  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2555  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2556  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2557  const std::string &KernelName, std::vector<ur_event_handle_t> &RawEvents,
2558  const detail::EventImplPtr &OutEventImpl,
2559  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2560  ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative,
2561  const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage) {
2562  assert(Queue && "Kernel submissions should have an associated queue");
2564  // Run the kernel through the backend plugin.
2564  auto ContextImpl = Queue->getContextImplPtr();
2565  auto DeviceImpl = Queue->getDeviceImplPtr();
2566  ur_kernel_handle_t Kernel = nullptr;
2567  std::mutex *KernelMutex = nullptr;
2568  ur_program_handle_t Program = nullptr;
2569  const KernelArgMask *EliminatedArgMask;
2570 
2571  std::shared_ptr<kernel_impl> SyclKernelImpl;
2572  std::shared_ptr<device_image_impl> DeviceImageImpl;
2573 
2574  // Use kernel_bundle if available unless it is interop.
2575  // Interop bundles can't be used in the first branch, because the kernels
2576  // in interop kernel bundles (if any) do not have kernel_id
2577  // and can therefore not be looked up, but since they are self-contained
2578  // they can simply be launched directly.
2579  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2580  kernel_id KernelID =
2582  kernel SyclKernel =
2583  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2584 
2585  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2586 
2587  Kernel = SyclKernelImpl->getHandleRef();
2588  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2589 
2590  Program = DeviceImageImpl->get_ur_program_ref();
2591 
2592  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2593  KernelMutex = SyclKernelImpl->getCacheMutex();
2594  } else if (nullptr != MSyclKernel) {
2595  assert(MSyclKernel->get_info<info::kernel::context>() ==
2596  Queue->get_context());
2597  Kernel = MSyclKernel->getHandleRef();
2598  Program = MSyclKernel->getProgramRef();
2599 
2600  // Non-cacheable kernels use mutexes from kernel_impls.
2601  // TODO this can still result in a race condition if multiple SYCL
2602  // kernels are created with the same native handle. To address this,
2603  // we need to either store and use a ur_native_handle_t -> mutex map or
2604  // reuse and return existing SYCL kernels from make_native to avoid
2605  // their duplication in such cases.
2606  KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2607  EliminatedArgMask = MSyclKernel->getKernelArgMask();
2608  } else {
2609  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2611  ContextImpl, DeviceImpl, KernelName, NDRDesc);
2612  }
2613 
2614  // We may need more events for the launch, so we make another reference.
2615  std::vector<ur_event_handle_t> &EventsWaitList = RawEvents;
2616 
2617  // Initialize device globals associated with this program.
2618  std::vector<ur_event_handle_t> DeviceGlobalInitEvents =
2619  ContextImpl->initializeDeviceGlobals(Program, Queue);
2620  std::vector<ur_event_handle_t> EventsWithDeviceGlobalInits;
2621  if (!DeviceGlobalInitEvents.empty()) {
2622  EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2623  DeviceGlobalInitEvents.size());
2624  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2625  RawEvents.begin(), RawEvents.end());
2626  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2627  DeviceGlobalInitEvents.begin(),
2628  DeviceGlobalInitEvents.end());
2629  EventsWaitList = EventsWithDeviceGlobalInits;
2630  }
2631 
2632  ur_result_t Error = UR_RESULT_SUCCESS;
2633  {
2634  // When KernelMutex is null, this means that in-memory caching is
2635  // disabled, which means that the kernel object is not shared, so no locking
2636  // is necessary.
2637  using LockT = std::unique_lock<std::mutex>;
2638  auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2639 
2640  // Set the SLM/cache configuration for the kernel if a non-default value is
2641  // provided.
2642  if (KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM ||
2643  KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) {
2644  const PluginPtr &Plugin = Queue->getPlugin();
2645  Plugin->call(
2646  urKernelSetExecInfo, Kernel, UR_KERNEL_EXEC_INFO_CACHE_CONFIG,
2647  sizeof(ur_kernel_cache_config_t), nullptr, &KernelCacheConfig);
2648  }
2649 
2650  Error = SetKernelParamsAndLaunch(
2651  Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
2652  OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
2653  KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName);
2654 
2655  const PluginPtr &Plugin = Queue->getPlugin();
2656  if (!SyclKernelImpl && !MSyclKernel) {
2657  Plugin->call(urKernelRelease, Kernel);
2658  Plugin->call(urProgramRelease, Program);
2659  }
2660  }
2661  if (UR_RESULT_SUCCESS != Error) {
2662  // If we got a non-success error code, analyze it to emit a nice
2663  // exception explaining what went wrong.
2664  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2666  Kernel, NDRDesc);
2667  }
2668 }
2669 
2670 ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue,
2671  const std::string &PipeName, bool blocking,
2672  void *ptr, size_t size,
2673  std::vector<ur_event_handle_t> &RawEvents,
2674  const detail::EventImplPtr &OutEventImpl,
2675  bool read) {
2676  assert(Queue &&
2677  "ReadWrite host pipe submissions should have an associated queue");
2678  detail::HostPipeMapEntry *hostPipeEntry =
2680 
2681  ur_program_handle_t Program = nullptr;
2682  device Device = Queue->get_device();
2683  ContextImplPtr ContextImpl = Queue->getContextImplPtr();
2684  std::optional<ur_program_handle_t> CachedProgram =
2685  ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2686  if (CachedProgram)
2687  Program = *CachedProgram;
2688  else {
2689  // If there was no cached program, build one.
2690  device_image_plain devImgPlain =
2692  hostPipeEntry->getDevBinImage(), Queue->get_context(),
2693  Queue->get_device());
2694  device_image_plain BuiltImage =
2695  ProgramManager::getInstance().build(devImgPlain, {Device}, {});
2696  Program = getSyclObjImpl(BuiltImage)->get_ur_program_ref();
2697  }
2698  assert(Program && "Program for this hostpipe is not compiled.");
2699 
2700  const PluginPtr &Plugin = Queue->getPlugin();
2701 
2702  ur_queue_handle_t ur_q = Queue->getHandleRef();
2703  ur_result_t Error;
2704 
2705  auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr;
2706  if (OutEventImpl != nullptr)
2707  OutEventImpl->setHostEnqueueTime();
2708  if (read) {
2709  Error = Plugin->call_nocheck(
2710  urEnqueueReadHostPipe, ur_q, Program, PipeName.c_str(), blocking, ptr,
2711  size, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2712  OutEvent);
2713  } else {
2714  Error = Plugin->call_nocheck(
2715  urEnqueueWriteHostPipe, ur_q, Program, PipeName.c_str(), blocking, ptr,
2716  size, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2717  OutEvent);
2718  }
2719 
2720  return Error;
2721 }
2722 
2723 ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
2724  assert(MQueue && "Command buffer enqueue should have an associated queue");
2725  // Wait on host command dependencies
2727 
2728  // Any device dependencies need to be waited on here since subsequent
2729  // submissions of the command buffer itself will not receive dependencies on
2730  // them, e.g. initial copies from host to device
2731  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2732  flushCrossQueueDeps(EventImpls, MWorkerQueue);
2733  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
2734  if (!RawEvents.empty()) {
2735  MQueue->getPlugin()->call(urEventWait, RawEvents.size(), &RawEvents[0]);
2736  }
2737 
2738  // We can omit creating a UR event and create a "discarded" event if either
2739  // the queue has the discard property or the command has been explicitly
2740  // marked as not needing an event (e.g. the user did not ask for one), and
2741  // if the queue supports discarding UR events and there are no requirements.
2742  bool DiscardUrEvent = (MQueue->MDiscardEvents || !MEventNeeded) &&
2743  MQueue->supportsDiscardingPiEvents() &&
2744  MCommandGroup->getRequirements().size() == 0;
2745  ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &MEvent->getHandleRef();
2746  ur_exp_command_buffer_sync_point_t OutSyncPoint;
2747  ur_exp_command_buffer_command_handle_t OutCommand = nullptr;
2748  switch (MCommandGroup->getType()) {
2749  case CGType::Kernel: {
2750  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2751 
2752  auto getMemAllocationFunc = [this](Requirement *Req) {
2753  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2754  return AllocaCmd->getMemAllocation();
2755  };
2756 
2757  if (!Event) {
2758  // The kernel only uses assert if it's a non-interop one.
2759  bool KernelUsesAssert =
2760  !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2762  ExecKernel->MKernelName);
2763  if (KernelUsesAssert) {
2764  Event = &MEvent->getHandleRef();
2765  }
2766  }
2767  auto result = enqueueImpCommandBufferKernel(
2768  MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
2769  *ExecKernel, MSyncPointDeps, &OutSyncPoint, &OutCommand,
2770  getMemAllocationFunc);
2771  MEvent->setSyncPoint(OutSyncPoint);
2772  MEvent->setCommandBufferCommand(OutCommand);
2773  return result;
2774  }
2775  case CGType::CopyUSM: {
2776  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2778  MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
2779  Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2780  MEvent->setSyncPoint(OutSyncPoint);
2781  return UR_RESULT_SUCCESS;
2782  }
2783  case CGType::CopyAccToAcc: {
2784  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2785  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2786  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2787 
2788  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2789  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2790 
2792  MQueue->getContextImplPtr(), MCommandBuffer,
2793  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2794  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2795  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2796  ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2797  ReqDst->MOffset, ReqDst->MElemSize, std::move(MSyncPointDeps),
2798  &OutSyncPoint);
2799  MEvent->setSyncPoint(OutSyncPoint);
2800  return UR_RESULT_SUCCESS;
2801  }
2802  case CGType::CopyAccToPtr: {
2803  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2804  Requirement *Req = (Requirement *)Copy->getSrc();
2805  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2806 
2808  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2809  AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2810  Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(),
2811  Req->MDims, Req->MAccessRange,
2812  /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(MSyncPointDeps),
2813  &OutSyncPoint);
2814  MEvent->setSyncPoint(OutSyncPoint);
2815  return UR_RESULT_SUCCESS;
2816  }
2817  case CGType::CopyPtrToAcc: {
2818  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2819  Requirement *Req = (Requirement *)(Copy->getDst());
2820  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2821 
2823  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2824  (char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2825  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2826  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2827  Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2828  MEvent->setSyncPoint(OutSyncPoint);
2829  return UR_RESULT_SUCCESS;
2830  }
2831  case CGType::Fill: {
2832  CGFill *Fill = (CGFill *)MCommandGroup.get();
2833  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2834  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2835 
2837  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2838  AllocaCmd->getMemAllocation(), Fill->MPattern.size(),
2839  Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2840  Req->MOffset, Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2841  MEvent->setSyncPoint(OutSyncPoint);
2842  return UR_RESULT_SUCCESS;
2843  }
2844  case CGType::FillUSM: {
2845  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2847  MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(),
2848  Fill->getLength(), Fill->getPattern(), std::move(MSyncPointDeps),
2849  &OutSyncPoint);
2850  MEvent->setSyncPoint(OutSyncPoint);
2851  return UR_RESULT_SUCCESS;
2852  }
2853  case CGType::PrefetchUSM: {
2854  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2856  MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(),
2857  Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint);
2858  MEvent->setSyncPoint(OutSyncPoint);
2859  return UR_RESULT_SUCCESS;
2860  }
2861  case CGType::AdviseUSM: {
2862  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2864  MQueue->getContextImplPtr(), MCommandBuffer, Advise->getDst(),
2865  Advise->getLength(), Advise->getAdvice(), std::move(MSyncPointDeps),
2866  &OutSyncPoint);
2867  MEvent->setSyncPoint(OutSyncPoint);
2868  return UR_RESULT_SUCCESS;
2869  }
2870 
2871  default:
2872  throw exception(make_error_code(errc::runtime),
2873  "CG type not implemented for command buffers.");
2874  }
2875 }
2876 
2877 ur_result_t ExecCGCommand::enqueueImp() {
2878  if (MCommandBuffer) {
2879  return enqueueImpCommandBuffer();
2880  } else {
2881  return enqueueImpQueue();
2882  }
2883 }
2884 
2885 ur_result_t ExecCGCommand::enqueueImpQueue() {
2888  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2889  auto RawEvents = getUrEvents(EventImpls);
2890  flushCrossQueueDeps(EventImpls, MWorkerQueue);
2891 
2892  // We can omit creating a UR event and create a "discarded" event if either
2893  // the queue has the discard property or the command has been explicitly
2894  // marked as not needing an event (e.g. the user did not ask for one), and
2895  // if the queue supports discarding UR events and there are no requirements.
2896  bool DiscardUrEvent = MQueue && (MQueue->MDiscardEvents || !MEventNeeded) &&
2897  MQueue->supportsDiscardingPiEvents() &&
2898  MCommandGroup->getRequirements().size() == 0;
2899 
2900  ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &MEvent->getHandleRef();
2901  detail::EventImplPtr EventImpl = DiscardUrEvent ? nullptr : MEvent;
2902 
2903  switch (MCommandGroup->getType()) {
2904 
2905  case CGType::UpdateHost: {
2906  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2907  "Update host should be handled by the Scheduler. " +
2908  codeToString(UR_RESULT_ERROR_INVALID_VALUE));
2909  }
2910  case CGType::CopyAccToPtr: {
2911  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2912  Requirement *Req = (Requirement *)Copy->getSrc();
2913  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2914 
2916  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2917  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2918  Req->MElemSize, Copy->getDst(), nullptr, Req->MDims, Req->MAccessRange,
2919  Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize,
2920  std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2921 
2922  return UR_RESULT_SUCCESS;
2923  }
2924  case CGType::CopyPtrToAcc: {
2925  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2926  Requirement *Req = (Requirement *)(Copy->getDst());
2927  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2928 
2930  AllocaCmd->getSYCLMemObj(), Copy->getSrc(), nullptr, Req->MDims,
2931  Req->MAccessRange, Req->MAccessRange,
2932  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2933  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2934  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2935 
2936  return UR_RESULT_SUCCESS;
2937  }
2938  case CGType::CopyAccToAcc: {
2939  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2940  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2941  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2942 
2943  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2944  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2945 
2947  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2948  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2949  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2950  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2951  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2952  MEvent->getHandleRef(), MEvent);
2953 
2954  return UR_RESULT_SUCCESS;
2955  }
2956  case CGType::Fill: {
2957  CGFill *Fill = (CGFill *)MCommandGroup.get();
2958  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2959  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2960 
2962  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2963  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2964  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2965  std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2966 
2967  return UR_RESULT_SUCCESS;
2968  }
2969  case CGType::Kernel: {
2970  assert(MQueue && "Kernel submissions should have an associated queue");
2971  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2972 
2973  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2974  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2975 
2976  auto getMemAllocationFunc = [this](Requirement *Req) {
2977  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2978  // getAllocaForReq may return nullptr if Req is a default-constructed
2979  // accessor. Simply return nullptr in such a case.
2980  return AllocaCmd ? AllocaCmd->getMemAllocation() : nullptr;
2981  };
2982 
2983  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2984  ExecKernel->MSyclKernel;
2985  const std::string &KernelName = ExecKernel->MKernelName;
2986 
2987  if (!EventImpl) {
2988  // The kernel only uses assert if it's a non-interop one.
2989  bool KernelUsesAssert =
2990  !(SyclKernel && SyclKernel->isInterop()) &&
2992  if (KernelUsesAssert) {
2993  EventImpl = MEvent;
2994  }
2995  }
2996 
2997  const RTDeviceBinaryImage *BinImage = nullptr;
2999  std::tie(BinImage, std::ignore) =
3000  retrieveKernelBinary(MQueue, KernelName.c_str());
3001  assert(BinImage && "Failed to obtain a binary image.");
3002  }
3003  enqueueImpKernel(MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(),
3004  SyclKernel, KernelName, RawEvents, EventImpl,
3005  getMemAllocationFunc, ExecKernel->MKernelCacheConfig,
3006  ExecKernel->MKernelIsCooperative,
3007  ExecKernel->MKernelUsesClusterLaunch, BinImage);
3008 
3009  return UR_RESULT_SUCCESS;
3010  }
3011  case CGType::CopyUSM: {
3012  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
3013  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
3014  Copy->getDst(), std::move(RawEvents), Event,
3015  MEvent);
3016 
3017  return UR_RESULT_SUCCESS;
3018  }
3019  case CGType::FillUSM: {
3020  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
3021  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
3022  Fill->getPattern(), std::move(RawEvents), Event,
3023  MEvent);
3024 
3025  return UR_RESULT_SUCCESS;
3026  }
3027  case CGType::PrefetchUSM: {
3028  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3029  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
3030  Prefetch->getLength(), std::move(RawEvents),
3031  Event, MEvent);
3032 
3033  return UR_RESULT_SUCCESS;
3034  }
3035  case CGType::AdviseUSM: {
3036  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3037  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
3038  Advise->getAdvice(), std::move(RawEvents), Event,
3039  MEvent);
3040 
3041  return UR_RESULT_SUCCESS;
3042  }
3043  case CGType::Copy2DUSM: {
3044  CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3045  MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
3046  Copy->getDst(), Copy->getDstPitch(),
3047  Copy->getWidth(), Copy->getHeight(),
3048  std::move(RawEvents), Event, MEvent);
3049  return UR_RESULT_SUCCESS;
3050  }
3051  case CGType::Fill2DUSM: {
3052  CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
3053  MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
3054  Fill->getWidth(), Fill->getHeight(),
3055  Fill->getPattern(), std::move(RawEvents), Event,
3056  MEvent);
3057  return UR_RESULT_SUCCESS;
3058  }
3059  case CGType::Memset2DUSM: {
3060  CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3061  MemoryManager::memset_2d_usm(Memset->getDst(), MQueue, Memset->getPitch(),
3062  Memset->getWidth(), Memset->getHeight(),
3063  Memset->getValue(), std::move(RawEvents),
3064  Event, MEvent);
3065  return UR_RESULT_SUCCESS;
3066  }
3067  case CGType::CodeplayHostTask: {
3068  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
3069 
3070  for (ArgDesc &Arg : HostTask->MArgs) {
3071  switch (Arg.MType) {
3073  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
3074  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3075 
3076  if (AllocaCmd)
3077  Req->MData = AllocaCmd->getMemAllocation();
3078  break;
3079  }
3080  default:
3081  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3082  "Unsupported arg type " +
3083  codeToString(UR_RESULT_ERROR_INVALID_VALUE));
3084  }
3085  }
3086 
3087  std::vector<interop_handle::ReqToMem> ReqToMem;
3088  std::vector<ur_mem_handle_t> ReqUrMem;
3089 
3090  if (HostTask->MHostTask->isInteropTask()) {
3091  // Extract the Mem Objects for all Requirements, to ensure they are
3092  // available if a user asks for them inside the interop task scope
3093  const std::vector<Requirement *> &HandlerReq =
3094  HostTask->getRequirements();
3095  auto ReqToMemConv = [&ReqToMem, &ReqUrMem, HostTask](Requirement *Req) {
3096  const std::vector<AllocaCommandBase *> &AllocaCmds =
3097  Req->MSYCLMemObj->MRecord->MAllocaCommands;
3098 
3099  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3100  if (getContext(HostTask->MQueue) ==
3101  getContext(AllocaCmd->getQueue())) {
3102  auto MemArg = reinterpret_cast<ur_mem_handle_t>(
3103  AllocaCmd->getMemAllocation());
3104  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3105  ReqUrMem.emplace_back(MemArg);
3106 
3107  return;
3108  }
3109 
3110  assert(false &&
3111  "Can't get memory object due to no allocation available");
3112 
3113  throw sycl::exception(
3114  sycl::make_error_code(sycl::errc::runtime),
3115  "Can't get memory object due to no allocation available " +
3116  codeToString(UR_RESULT_ERROR_INVALID_MEM_OBJECT));
3117  };
3118  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3119  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3120  }
3121 
3122  // Host task is executed asynchronously so we should record where it was
3123  // submitted to report exception origin properly.
3124  copySubmissionCodeLocation();
3125 
3126  MQueue->getThreadPool().submit<DispatchHostTask>(
3127  DispatchHostTask(this, std::move(ReqToMem), std::move(ReqUrMem)));
3128 
3129  MShouldCompleteEventIfPossible = false;
3130 
3131  return UR_RESULT_SUCCESS;
3132  }
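// User-side sketch of the host-task path handled above, assuming a buffer
// `buf` and queue `q` (illustrative names): a host task runs on a
// runtime-managed host thread, and its interop form receives an
// interop_handle for querying native objects behind its accessors.
//
//   q.submit([&](sycl::handler &cgh) {
//     auto acc = buf.get_access(cgh);
//     cgh.host_task([=](sycl::interop_handle ih) {
//       // e.g. ih.get_native_mem<sycl::backend::opencl>(acc);
//     });
//   });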
3133  case CGType::EnqueueNativeCommand: {
3134  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
3135 
3136  for (ArgDesc &Arg : HostTask->MArgs) {
3137  switch (Arg.MType) {
3138  case kernel_param_kind_t::kind_accessor: {
3139  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
3140  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3141 
3142  if (AllocaCmd)
3143  Req->MData = AllocaCmd->getMemAllocation();
3144  break;
3145  }
3146  default:
3147  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3148  "Unsupported arg type ");
3149  }
3150  }
3151 
3152  std::vector<interop_handle::ReqToMem> ReqToMem;
3153  std::vector<ur_mem_handle_t> ReqMems;
3154 
3155  if (HostTask->MHostTask->isInteropTask()) {
3156  // Extract the Mem Objects for all Requirements, to ensure they are
3157  // available if a user asks for them inside the interop task scope
3158  const std::vector<Requirement *> &HandlerReq =
3159  HostTask->getRequirements();
3160  auto ReqToMemConv = [&ReqToMem, &ReqMems, HostTask](Requirement *Req) {
3161  const std::vector<AllocaCommandBase *> &AllocaCmds =
3162  Req->MSYCLMemObj->MRecord->MAllocaCommands;
3163 
3164  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3165  if (HostTask->MQueue->getContextImplPtr() ==
3166  AllocaCmd->getQueue()->getContextImplPtr()) {
3167  auto MemArg = reinterpret_cast<ur_mem_handle_t>(
3168  AllocaCmd->getMemAllocation());
3169  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3170  ReqMems.emplace_back(MemArg);
3171 
3172  return;
3173  }
3174 
3175  assert(false &&
3176  "Can't get memory object due to no allocation available");
3177 
3178  throw sycl::exception(
3179  sycl::make_error_code(sycl::errc::runtime),
3180  "Can't get memory object due to no allocation available ");
3181  };
3182  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3183  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3184  }
3185 
3186  EnqueueNativeCommandData CustomOpData{
3187  interop_handle{ReqToMem, HostTask->MQueue,
3188  HostTask->MQueue->getDeviceImplPtr(),
3189  HostTask->MQueue->getContextImplPtr()},
3190  HostTask->MHostTask->MInteropTask};
3191 
3192  ur_bool_t NativeCommandSupport = false;
3193  MQueue->getPlugin()->call(
3194  urDeviceGetInfo,
3195  detail::getSyclObjImpl(MQueue->get_device())->getHandleRef(),
3196  UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
3197  sizeof(NativeCommandSupport), &NativeCommandSupport, nullptr);
3198  assert(NativeCommandSupport && "ext_codeplay_enqueue_native_command is not "
3199  "supported on this device");
3200  MQueue->getPlugin()->call(urEnqueueNativeCommandExp, MQueue->getHandleRef(),
3201  InteropFreeFunc, &CustomOpData, ReqMems.size(),
3202  ReqMems.data(), nullptr, RawEvents.size(),
3203  RawEvents.data(), Event);
3204 
3205  return UR_RESULT_SUCCESS;
3206  }
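// Sketch of the native-command path above: the
// sycl_ext_codeplay_enqueue_native_command extension lets a command group
// enqueue work directly on the backend's native queue (illustrative, assuming
// a CUDA-backend queue `q`):
//
//   q.submit([&](sycl::handler &cgh) {
//     cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
//       auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
//       // launch native work on `stream` here
//     });
//   });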
3207  case CGType::Barrier: {
3208  assert(MQueue && "Barrier submission should have an associated queue");
3209  const PluginPtr &Plugin = MQueue->getPlugin();
3210  if (MEvent != nullptr)
3211  MEvent->setHostEnqueueTime();
3212  Plugin->call(urEnqueueEventsWaitWithBarrier, MQueue->getHandleRef(), 0,
3213  nullptr, Event);
3214 
3215  return UR_RESULT_SUCCESS;
3216  }
3217  case CGType::BarrierWaitlist: {
3218  assert(MQueue && "Barrier submission should have an associated queue");
3219  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
3220  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3221  std::vector<ur_event_handle_t> UrEvents = getUrEventsBlocking(Events);
3222  if (UrEvents.empty()) {
3223  // If Events is empty, then the barrier has no effect.
3224  return UR_RESULT_SUCCESS;
3225  }
3226  const PluginPtr &Plugin = MQueue->getPlugin();
3227  if (MEvent != nullptr)
3228  MEvent->setHostEnqueueTime();
3229  Plugin->call(urEnqueueEventsWaitWithBarrier, MQueue->getHandleRef(),
3230  UrEvents.size(), &UrEvents[0], Event);
3231 
3232  return UR_RESULT_SUCCESS;
3233  }
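// Both barrier cases above are reached through the
// sycl_ext_oneapi_enqueue_barrier extension (illustrative sketch):
//
//   q.ext_oneapi_submit_barrier();            // CGType::Barrier
//   q.ext_oneapi_submit_barrier({e1, e2});    // CGType::BarrierWaitlist
//
// As the code above shows, an empty wait list makes the barrier a no-op and
// the runtime returns early without calling into the plugin.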
3234  case CGType::ProfilingTag: {
3235  const auto &Plugin = MQueue->getPlugin();
3236  // If the queue is not in-order, we need to insert a barrier. This barrier
3237  // does not need output events as it will implicitly enforce the following
3238  // enqueue is blocked until it finishes.
3239  if (!MQueue->isInOrder())
3240  Plugin->call(urEnqueueEventsWaitWithBarrier, MQueue->getHandleRef(),
3241  /*num_events_in_wait_list=*/0,
3242  /*event_wait_list=*/nullptr, /*event=*/nullptr);
3243 
3244  Plugin->call(urEnqueueTimestampRecordingExp, MQueue->getHandleRef(),
3245  /*blocking=*/false,
3246  /*num_events_in_wait_list=*/0, /*event_wait_list=*/nullptr,
3247  Event);
3248 
3249  return UR_RESULT_SUCCESS;
3250  }
3251  case CGType::CopyToDeviceGlobal: {
3252  CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3253  MemoryManager::copy_to_device_global(
3254  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3255  Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3256  std::move(RawEvents), Event, MEvent);
3257 
3258  return UR_RESULT_SUCCESS;
3259  }
3260  case CGType::CopyFromDeviceGlobal: {
3261  CGCopyFromDeviceGlobal *Copy =
3262  (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3263  MemoryManager::copy_from_device_global(
3264  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3265  Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3266  std::move(RawEvents), Event, MEvent);
3267 
3268  return UR_RESULT_SUCCESS;
3269  }
3270  case CGType::ReadWriteHostPipe: {
3271  CGReadWriteHostPipe *ExecReadWriteHostPipe =
3272  (CGReadWriteHostPipe *)MCommandGroup.get();
3273  std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3274  void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3275  size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3276  bool blocking = ExecReadWriteHostPipe->isBlocking();
3277  bool read = ExecReadWriteHostPipe->isReadHostPipe();
3278 
3279  if (!EventImpl) {
3280  EventImpl = MEvent;
3281  }
3282  return enqueueReadWriteHostPipe(MQueue, pipeName, blocking, hostPtr,
3283  typeSize, RawEvents, EventImpl, read);
3284  }
3285  case CGType::ExecCommandBuffer: {
3286  assert(MQueue &&
3287  "Command buffer submissions should have an associated queue");
3288  CGExecCommandBuffer *CmdBufferCG =
3289  static_cast<CGExecCommandBuffer *>(MCommandGroup.get());
3290  if (MEvent != nullptr)
3291  MEvent->setHostEnqueueTime();
3292  return MQueue->getPlugin()->call_nocheck(
3293  urCommandBufferEnqueueExp, CmdBufferCG->MCommandBuffer,
3294  MQueue->getHandleRef(), RawEvents.size(),
3295  RawEvents.empty() ? nullptr : &RawEvents[0], Event);
3296  }
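// Command-buffer sketch: the ExecCommandBuffer case is reached when an
// executable graph from the sycl_ext_oneapi_graph extension is submitted to a
// queue (a minimal sketch, assuming queue `q`; names follow the extension):
//
//   namespace sycl_exp = sycl::ext::oneapi::experimental;
//   sycl_exp::command_graph g{q.get_context(), q.get_device()};
//   g.add([&](sycl::handler &cgh) { /* record work */ });
//   auto exec = g.finalize();
//   q.ext_oneapi_graph(exec);   // enqueues the underlying UR command buffer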
3297  case CGType::CopyImage: {
3298  CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3299 
3300  MemoryManager::copy_image_bindless(
3301  MQueue, Copy->getSrc(), Copy->getDst(), Copy->getSrcDesc(),
3302  Copy->getDstDesc(), Copy->getSrcFormat(), Copy->getDstFormat(),
3303  Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3304  Copy->getCopyExtent(), std::move(RawEvents), Event);
3305  return UR_RESULT_SUCCESS;
3306  }
3307  case CGType::SemaphoreWait: {
3308  assert(MQueue &&
3309  "Semaphore wait submissions should have an associated queue");
3310  CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3311  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3312  auto OptWaitValue = SemWait->getWaitValue();
3313  uint64_t WaitValue = OptWaitValue.has_value() ? OptWaitValue.value() : 0;
3314  Plugin->call(urBindlessImagesWaitExternalSemaphoreExp,
3315  MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(),
3316  OptWaitValue.has_value(), WaitValue, 0, nullptr, nullptr);
3317 
3318  return UR_RESULT_SUCCESS;
3319  }
3320  case CGType::SemaphoreSignal: {
3321  assert(MQueue &&
3322  "Semaphore signal submissions should have an associated queue");
3323  CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3324  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3325  auto OptSignalValue = SemSignal->getSignalValue();
3326  uint64_t SignalValue =
3327  OptSignalValue.has_value() ? OptSignalValue.value() : 0;
3328  Plugin->call(urBindlessImagesSignalExternalSemaphoreExp,
3329  MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(),
3330  OptSignalValue.has_value(), SignalValue, 0, nullptr, nullptr);
3331 
3332  return UR_RESULT_SUCCESS;
3333  }
3334  case CGType::None:
3335  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3336  "CG type not implemented. " +
3337  codeToString(UR_RESULT_ERROR_INVALID_OPERATION));
3338  }
3339  return UR_RESULT_ERROR_INVALID_OPERATION;
3340 }
3341 
3342 bool ExecCGCommand::producesPiEvent() const {
3343  return !MCommandBuffer &&
3344  MCommandGroup->getType() != CGType::CodeplayHostTask;
3345 }
3346 
3347 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
3348  // Host tasks are cleaned up upon completion instead.
3349  return Command::supportsPostEnqueueCleanup() &&
3350  (MCommandGroup->getType() != CGType::CodeplayHostTask);
3351 }
3352 
3353 bool ExecCGCommand::readyForCleanup() const {
3354  if (MCommandGroup->getType() == CGType::CodeplayHostTask)
3355  return MLeafCounter == 0 && MEvent->isCompleted();
3356  return Command::readyForCleanup();
3357 }
3358 
3359 KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue)
3360  : Command(Command::CommandType::FUSION, Queue),
3361  MStatus(FusionStatus::ACTIVE) {
3362  emitInstrumentationDataProxy();
3363 }
3364 
3365 std::vector<Command *> &KernelFusionCommand::auxiliaryCommands() {
3366  return MAuxiliaryCommands;
3367 }
3368 
3369 void KernelFusionCommand::addToFusionList(ExecCGCommand *Kernel) {
3370  MFusionList.push_back(Kernel);
3371 }
3372 
3373 std::vector<ExecCGCommand *> &KernelFusionCommand::getFusionList() {
3374  return MFusionList;
3375 }
3376 
3377 bool KernelFusionCommand::producesPiEvent() const { return false; }
3378 
3379 ur_result_t KernelFusionCommand::enqueueImp() {
3380  waitForPreparedHostEvents();
3381  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
3382 
3383  // We need to release the queue here because KernelFusionCommands are
3384  // held back by the scheduler thus prevent the deallocation of the queue.
3385  resetQueue();
3386  return UR_RESULT_SUCCESS;
3387 }
3388 
3389 void KernelFusionCommand::setFusionStatus(FusionStatus Status) {
3390  MStatus = Status;
3391 }
3392 
3393 void KernelFusionCommand::resetQueue() {
3394  assert(MStatus != FusionStatus::ACTIVE &&
3395  "Cannot release the queue attached to the KernelFusionCommand if it "
3396  "is active.");
3397  MQueue.reset();
3398  MWorkerQueue.reset();
3399 }
3400 
3401 void KernelFusionCommand::emitInstrumentationData() {
3402 #ifdef XPTI_ENABLE_INSTRUMENTATION
3403  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3404  if (!xptiCheckTraceEnabled(MStreamID)) {
3405  return;
3406  }
3407  // Create a payload with the command name and an event using this payload to
3408  // emit a node_create
3409  MCommandNodeType = commandToNodeType(MType);
3410  MCommandName = commandToName(MType);
3411 
3412  static unsigned FusionNodeCount = 0;
3413  std::stringstream PayloadStr;
3414  PayloadStr << "Fusion command #" << FusionNodeCount++;
3415  xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3416 
3417  uint64_t CommandInstanceNo = 0;
3418  xpti_td *CmdTraceEvent =
3419  xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3420  xpti_at::active, &CommandInstanceNo);
3421 
3422  MInstanceID = CommandInstanceNo;
3423  if (CmdTraceEvent) {
3424  MTraceEvent = static_cast<void *>(CmdTraceEvent);
3425  // If we are seeing this event again, then the instance ID
3426  // will be greater
3427  // than 1; in this case, we must skip sending a
3428  // notification to create a node as this node has already
3429  // been created. We return this value so the epilog method
3430  // can be called selectively.
3431  // See makeTraceEventProlog.
3432  MFirstInstance = (CommandInstanceNo == 1);
3433  }
3434 
3435  // This function is called in the constructor of the command. At this point
3436  // the kernel fusion list is still empty, so we don't have a terrible lot of
3437  // information we could attach to this node here.
3438  if (MFirstInstance && CmdTraceEvent)
3439  addDeviceMetadata(CmdTraceEvent, MQueue);
3440 
3441  if (MFirstInstance) {
3442  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
3443  // as this data is mutable and the metadata is supposed to be invariant
3444  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
3445  getQueueID(MQueue));
3446  xptiNotifySubscribers(MStreamID, NotificationTraceType,
3447  detail::GSYCLGraphEvent,
3448  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
3449  static_cast<const void *>(MCommandNodeType.c_str()));
3450  }
3451 
3452 #endif
3453 }
3454 
3455 void KernelFusionCommand::printDot(std::ostream &Stream) const {
3456  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3457 
3458  Stream << "ID = " << this << "\\n";
3459  Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue.get()) << "\\n"
3460  << "FUSION LIST: {";
3461  bool Initial = true;
3462  for (auto *Cmd : MFusionList) {
3463  if (!Initial) {
3464  Stream << ",\\n";
3465  }
3466  Initial = false;
3467  auto *KernelCG = static_cast<detail::CGExecKernel *>(&Cmd->getCG());
3468  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3469  Stream << "created from source";
3470  } else {
3471  Stream << demangleKernelName(KernelCG->getKernelName());
3472  }
3473  }
3474  Stream << "}\\n";
3475 
3476  Stream << "\"];" << std::endl;
3477 
3478  for (const auto &Dep : MDeps) {
3479  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3480  << " [ label = \"Access mode: "
3481  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3482  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3483  << std::endl;
3484  }
3485 }
3486 
3487 UpdateCommandBufferCommand::UpdateCommandBufferCommand(
3488  QueueImplPtr Queue,
3489  ext::oneapi::experimental::detail::exec_graph_impl *Graph,
3490  std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
3491  Nodes)
3492  : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph),
3493  MNodes(Nodes) {}
3494 
3495 ur_result_t UpdateCommandBufferCommand::enqueueImp() {
3496  waitForPreparedHostEvents();
3497  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
3498  ur_event_handle_t &Event = MEvent->getHandleRef();
3499  Command::waitForEvents(MQueue, EventImpls, Event);
3500 
3501  for (auto &Node : MNodes) {
3502  auto CG = static_cast<CGExecKernel *>(Node->MCommandGroup.get());
3503  for (auto &Arg : CG->MArgs) {
3504  if (Arg.MType != kernel_param_kind_t::kind_accessor) {
3505  continue;
3506  }
3507  // Search through deps to get actual allocation for accessor args.
3508  for (const DepDesc &Dep : MDeps) {
3509  Requirement *Req = static_cast<AccessorImplHost *>(Arg.MPtr);
3510  if (Dep.MDepRequirement == Req) {
3511  if (Dep.MAllocaCmd) {
3512  Req->MData = Dep.MAllocaCmd->getMemAllocation();
3513  } else {
3514  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
3515  "No allocation available for accessor when "
3516  "updating command buffer!");
3517  }
3518  }
3519  }
3520  }
3521  MGraph->updateImpl(Node);
3522  }
3523 
3524  return UR_RESULT_SUCCESS;
3525 }
3526 
3527 void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const {
3528  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
3529 
3530  Stream << "ID = " << this << "\\n";
3531  Stream << "CommandBuffer Command Update"
3532  << "\\n";
3533 
3534  Stream << "\"];" << std::endl;
3535 
3536  for (const auto &Dep : MDeps) {
3537  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3538  << " [ label = \"Access mode: "
3539  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3540  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3541  << std::endl;
3542  }
3543 }
3544 
3545 void UpdateCommandBufferCommand::emitInstrumentationData() {}
3546 bool UpdateCommandBufferCommand::producesPiEvent() const { return false; }
3547 
3548 } // namespace detail
3549 } // namespace _V1
3550 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:50
Base class for memory allocation commands.
Definition: commands.hpp:457
const Requirement * getRequirement() const final
Definition: commands.hpp:468
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:485
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1059
virtual void * getMemAllocation() const =0
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1055
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:487
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
Definition: commands.hpp:491
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
Definition: commands.cpp:1023
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1057
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1034
SYCLMemObjI * getSYCLMemObj() const
Definition: commands.hpp:464
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1117
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1079
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
Definition: commands.cpp:1061
void * getMemAllocation() const final
Definition: commands.cpp:1176
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1155
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1203
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:1137
sycl::detail::kernel_param_kind_t MType
Definition: cg.hpp:55
"Execute kernel" command group class.
Definition: cg.hpp:246
std::shared_ptr< detail::kernel_bundle_impl > MKernelBundle
Definition: cg.hpp:252
std::vector< ArgDesc > MArgs
Definition: cg.hpp:253
NDRDescT MNDRDesc
Stores ND-range description.
Definition: cg.hpp:249
std::shared_ptr< detail::kernel_impl > MSyclKernel
Definition: cg.hpp:251
std::shared_ptr< detail::queue_impl > MQueue
Definition: cg.hpp:673
Base class for all types of command groups.
Definition: cg.hpp:160
CGType getType() const
Definition: cg.hpp:207
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:109
virtual ur_result_t enqueueImp()=0
Private interface. Derived classes should implement this method.
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
Definition: commands.cpp:752
void * MTraceEvent
The event for node_create and task_begin.
Definition: commands.hpp:350
CommandType MType
The type of the command.
Definition: commands.hpp:288
virtual bool producesPiEvent() const
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:791
int32_t MStreamID
The stream under which the traces are emitted.
Definition: commands.hpp:354
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
Definition: commands.cpp:849
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
Definition: commands.cpp:952
std::string MCommandName
Buffer to build the command end-user understandable name.
Definition: commands.hpp:363
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
Definition: commands.hpp:390
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:325
std::vector< ur_exp_command_buffer_sync_point_t > MSyncPointDeps
List of sync points for submissions to a command buffer.
Definition: commands.hpp:410
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
Definition: commands.cpp:860
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, ur_event_handle_t &Event)
Definition: commands.cpp:500
void waitForPreparedHostEvents() const
Definition: commands.cpp:495
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
Definition: commands.hpp:376
std::vector< ur_event_handle_t > getUrEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:236
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
Definition: commands.hpp:290
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
Definition: commands.cpp:583
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
Definition: commands.hpp:373
void emitEnqueuedEventSignal(ur_event_handle_t &UrEventAddr)
Creates a signal event with the enqueued kernel event handle.
Definition: commands.cpp:840
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:739
std::vector< EventImplPtr > & MPreparedHostDepsEvents
Definition: commands.hpp:262
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Definition: commands.hpp:261
std::string MSubmissionFunctionName
Definition: commands.hpp:377
uint64_t MInstanceID
Instance ID tracked for the command.
Definition: commands.hpp:369
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:319
const char * getBlockReason() const
Definition: commands.cpp:994
std::vector< ur_event_handle_t > getUrEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:260
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:795
void addUser(Command *NewUser)
Definition: commands.hpp:144
virtual ContextImplPtr getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:785
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
Definition: commands.hpp:342
const EventImplPtr & getEvent() const
Definition: commands.hpp:183
ur_exp_command_buffer_handle_t MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
Definition: commands.hpp:408
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:707
friend class DispatchHostTask
Definition: commands.hpp:292
CommandType getType() const
Definition: commands.hpp:147
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
Definition: commands.hpp:357
std::string MAddressString
Buffer to build the address string.
Definition: commands.hpp:359
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
Definition: commands.hpp:323
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:793
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
Definition: commands.hpp:365
std::string MCommandNodeType
Buffer to build the command node type.
Definition: commands.hpp:361
void emitEdgeEventForEventDependence(Command *Cmd, ur_event_handle_t &EventAddr)
Creates an edge event when the dependency is an event.
Definition: commands.cpp:650
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:800
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
Definition: commands.cpp:599
const QueueImplPtr & getQueue() const
Definition: commands.hpp:181
Command(CommandType Type, QueueImplPtr Queue, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
Definition: commands.cpp:557
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
Definition: commands.hpp:367
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem, std::vector< ur_mem_handle_t > ReqUrMem)
Definition: commands.cpp:379
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1807
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1789
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
Definition: commands.cpp:1745
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1761
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:647
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:3342
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &Dependencies={})
Definition: commands.cpp:1927
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:3347
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:3353
std::string_view getTypeString() const
Definition: commands.cpp:2217
detail::CG & getCG() const
Definition: commands.hpp:662
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:2183
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
Definition: commands.cpp:1608
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:2135
void TraceEventXPTI(const char *Message)
static GlobalHandler & instance()
void call(HostProfilingInfo *HPI)
Definition: host_task.hpp:35
void addToFusionList(ExecCGCommand *Kernel)
Definition: commands.cpp:3369
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:3377
void setFusionStatus(FusionStatus Status)
Set the status of this fusion command to Status.
Definition: commands.cpp:3389
std::vector< ExecCGCommand * > & getFusionList()
Definition: commands.cpp:3373
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:3455
KernelFusionCommand(QueueImplPtr Queue)
Definition: commands.cpp:3359
void resetQueue()
Reset the queue.
Definition: commands.cpp:3393
std::vector< Command * > & auxiliaryCommands()
Definition: commands.cpp:3365
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:3401
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1349
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
Definition: commands.cpp:1340
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1386
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1701
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1656
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1674
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1809
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1530
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1484
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1503
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1536
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1577
static void ext_oneapi_fill_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *DstMem, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_image_bindless(QueueImplPtr Queue, const void *Src, void *Dst, const ur_image_desc_t &SrcDesc, const ur_image_desc_t &DstDesc, const ur_image_format_t &SrcFormat, const ur_image_format_t &DstFormat, const ur_exp_image_copy_flags_t Flags, ur_rect_offset_t SrcOffset, ur_rect_offset_t DstOffset, ur_rect_region_t CopyExtent, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent)
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_advise_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Length, ur_usm_advice_flags_t Advice, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *Mem, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, ur_usm_advice_flags_t Advice, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, ur_exp_command_buffer_handle_t CommandBuffer, size_t Len, void *DstMem, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_prefetch_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
sycl::range< 3 > GlobalSize
Definition: cg.hpp:148
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
Definition: cg.hpp:154
sycl::id< 3 > GlobalOffset
Definition: cg.hpp:150
sycl::range< 3 > LocalSize
Definition: cg.hpp:149
sycl::range< 3 > ClusterDimensions
Definition: cg.hpp:155
std::tuple< ur_kernel_handle_t, std::mutex *, const KernelArgMask *, ur_program_handle_t > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc={})
static ProgramManager & getInstance()
kernel_id getSYCLKernelID(const std::string &KernelName)
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
bool kernelUsesAssert(const std::string &KernelName) const
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1334
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1336
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
Definition: commands.cpp:1224
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1316
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1229
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1338
static const char * get()
Definition: config.hpp:115
virtual MemObjType getType() const =0
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
Definition: scheduler.cpp:151
ur_kernel_handle_t completeSpecConstMaterialization(QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, const std::string &KernelName, std::vector< unsigned char > &SpecConstBlob)
Definition: scheduler.cpp:610
static Scheduler & getInstance()
Definition: scheduler.cpp:248
void NotifyHostTaskCompletion(Command *Cmd)
Definition: scheduler.cpp:447
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1467
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
Definition: commands.cpp:1403
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1410
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1432
UpdateCommandBufferCommand(QueueImplPtr Queue, ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector< std::shared_ptr< ext::oneapi::experimental::detail::node_impl >> Nodes)
Definition: commands.cpp:3487
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:3527
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:3545
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:3546
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
Definition: commands.cpp:1826
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1835
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1633
static void bufferAssociateNotification(const void *, const void *)
const PluginPtr & getPlugin() const
ur_device_handle_t & getHandleRef()
Get reference to UR device.
Definition: device_impl.hpp:65
static ThreadPool & getThreadPool()
Definition: queue_impl.hpp:698
Data type that manages the code_location information in TLS.
Definition: common.hpp:131
const detail::code_location & query()
Query the information in the TLS slot.
Definition: common.cpp:66
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
Class representing the implementation of command_graph<executable>.
void updateImpl(std::shared_ptr< node_impl > NodeImpl)
Objects of the class identify a kernel in some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:71
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
Objects of the property_list class are containers for the SYCL properties.
std::function< void(interop_handle)> func
Definition: commands.cpp:322
sycl::interop_handle ih
Definition: commands.cpp:321
void handleErrorOrWarning(ur_result_t Error, const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of urEnqueueKernelLaunch to emit user-friendly exception describing...
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:266
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:31
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
Definition: commands.cpp:311
static std::string demangleKernelName(std::string Name)
Definition: commands.cpp:120
static ur_result_t SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, ur_kernel_handle_t Kernel, NDRDescT &NDRDesc, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage, const std::string &KernelName)
Definition: commands.cpp:2353
std::vector< bool > KernelArgMask
std::string codeToString(int32_t code)
Definition: exception.hpp:57
constexpr const char * SYCL_STREAM_NAME
ur_mem_flags_t AccessModeToUr(access::mode AccessorMode)
Definition: commands.cpp:2267
static std::string_view cgTypeToString(detail::CGType Type)
Definition: commands.cpp:1858
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
Definition: commands.cpp:2259
void enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage)
Definition: commands.cpp:2553
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
void SetArgBasedOnType(const PluginPtr &Plugin, ur_kernel_handle_t Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex)
Definition: commands.cpp:2279
std::shared_ptr< event_impl > EventImplPtr
Definition: handler.hpp:183
std::shared_ptr< plugin > PluginPtr
Definition: ur.hpp:60
AccessorImplHost Requirement
ur_result_t enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, ur_exp_command_buffer_handle_t CommandBuffer, const CGExecKernel &CommandGroup, std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints, ur_exp_command_buffer_sync_point_t *OutSyncPoint, ur_exp_command_buffer_command_handle_t *OutCommand, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2451
std::shared_ptr< device_impl > DeviceImplPtr
CGType
Type of the command group.
Definition: cg_types.hpp:42
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel, const device_impl &DeviceImpl)
Definition: commands.cpp:2231
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
Definition: commands.cpp:123
ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
Definition: commands.cpp:2670
auto tie(Ts &...Args)
Definition: tuple.hpp:39
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: helpers.hpp:45
static ContextImplPtr getContext(const QueueImplPtr &Queue)
Definition: commands.cpp:98
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(ur_kernel_handle_t Kernel, ur_device_handle_t Device, const PluginPtr &Plugin)
Definition: kernel_info.hpp:97
static std::string accessModeToString(access::mode Mode)
Definition: commands.cpp:157
std::string queueDeviceToString(const queue_impl *const &Queue)
std::tuple< const RTDeviceBinaryImage *, ur_program_handle_t > retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, CGExecKernel *CGKernel=nullptr)
Definition: helpers.cpp:38
Function for_each(Group g, Ptr first, Ptr last, Function f)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, _Tp > arg(const complex< _Tp > &__c)
PropertyListT int access::address_space multi_ptr & operator=(multi_ptr &&)=default
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:65
Definition: access.hpp:18
Dependency between two commands.
Definition: commands.hpp:83
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:96
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:94
Result of command enqueueing.
Definition: commands.hpp:64