DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 
8 #include "ur_api.h"
10 
11 #include <detail/context_impl.hpp>
12 #include <detail/event_impl.hpp>
15 #include <detail/kernel_impl.hpp>
16 #include <detail/kernel_info.hpp>
19 #include <detail/queue_impl.hpp>
20 #include <detail/sampler_impl.hpp>
23 #include <detail/stream_impl.hpp>
24 #include <detail/xpti_registry.hpp>
25 #include <sycl/access/access.hpp>
26 #include <sycl/backend_types.hpp>
27 #include <sycl/detail/cg_types.hpp>
28 #include <sycl/detail/helpers.hpp>
30 #include <sycl/sampler.hpp>
31 
32 #include <cassert>
33 #include <optional>
34 #include <string>
35 #include <vector>
36 
37 #ifdef __has_include
38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
40 #include <cstdlib>
41 #include <cxxabi.h>
42 #include <memory>
43 #endif
44 #endif
45 
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
48 #include <detail/xpti_registry.hpp>
49 #endif
50 
51 namespace sycl {
52 inline namespace _V1 {
53 namespace detail {
54 
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
56 // Global graph for the application
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
58 
59 static bool CurrentCodeLocationValid() {
60  detail::tls_code_loc_t Tls;
61  auto CodeLoc = Tls.query();
62  auto FileName = CodeLoc.fileName();
63  auto FunctionName = CodeLoc.functionName();
64  return (FileName && FileName[0] != '\0') ||
65  (FunctionName && FunctionName[0] != '\0');
66 }
67 
68 void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
69  xpti_td *TraceEvent, uint16_t Type,
70  const void *Addr) {
71  if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
72  return;
73  // Trace event notifier that emits a Type event
74  xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
75  static_cast<xpti_td *>(TraceEvent), InstanceID, Addr);
76 }
77 
78 static size_t deviceToID(const device &Device) {
79  return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
80 }
81 
82 static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) {
83  xpti::addMetadata(TraceEvent, "sycl_device_type",
84  queueDeviceToString(Queue.get()));
85  if (Queue) {
86  xpti::addMetadata(TraceEvent, "sycl_device",
87  deviceToID(Queue->get_device()));
88  xpti::addMetadata(TraceEvent, "sycl_device_name",
89  getSyclObjImpl(Queue->get_device())->getDeviceName());
90  }
91 }
92 
93 static unsigned long long getQueueID(const QueueImplPtr &Queue) {
94  return Queue ? Queue->getQueueID() : 0;
95 }
96 #endif
97 
98 static ContextImplPtr getContext(const QueueImplPtr &Queue) {
99  if (Queue)
100  return Queue->getContextImplPtr();
101  return nullptr;
102 }
103 
104 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
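// RAII wrapper for the buffer returned by abi::__cxa_demangle, which is
// allocated with malloc and must be freed by the caller.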
105 struct DemangleHandle {
106  char *p;
107  DemangleHandle(char *ptr) : p(ptr) {}
108 
109  DemangleHandle(const DemangleHandle &) = delete;
110  DemangleHandle &operator=(const DemangleHandle &) = delete;
111 
112  ~DemangleHandle() { std::free(p); }
113 };
114 static std::string demangleKernelName(std::string Name) {
115  int Status = -1; // some arbitrary value to eliminate the compiler warning
116  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
117  return (Status == 0) ? result.p : Name;
118 }
119 #else
120 static std::string demangleKernelName(std::string Name) { return Name; }
121 #endif
122 
123 void applyFuncOnFilteredArgs(
124  const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
125  std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
126  if (!EliminatedArgMask || EliminatedArgMask->size() == 0) {
127  for (ArgDesc &Arg : Args) {
128  Func(Arg, Arg.MIndex);
129  }
130  } else {
131  // TODO this is not necessary as long as we can guarantee that the
132  // arguments are already sorted (e. g. handle the sorting in handler
133  // if necessary due to set_arg(...) usage).
134  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
135  return A.MIndex < B.MIndex;
136  });
137  int LastIndex = -1;
138  size_t NextTrueIndex = 0;
139 
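// Example: with EliminatedArgMask = {true, false, true, false} (args 0 and 2
// eliminated), the kept args 1 and 3 are passed to Func with NextTrueIndex
// values 0 and 1 respectively.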
140  for (ArgDesc &Arg : Args) {
141  // Handle potential gaps in set arguments (e. g. if some of them are
142  // set on the user side).
143  for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
144  if (!(*EliminatedArgMask)[Idx])
145  ++NextTrueIndex;
146  LastIndex = Arg.MIndex;
147 
148  if ((*EliminatedArgMask)[Arg.MIndex])
149  continue;
150 
151  Func(Arg, NextTrueIndex);
152  ++NextTrueIndex;
153  }
154  }
155 }
156 
157 static std::string accessModeToString(access::mode Mode) {
158  switch (Mode) {
159  case access::mode::read:
160  return "read";
161  case access::mode::write:
162  return "write";
163  case access::mode::read_write:
164  return "read_write";
165  case access::mode::discard_write:
166  return "discard_write";
167  case access::mode::discard_read_write:
168  return "discard_read_write";
169  default:
170  return "unknown";
171  }
172 }
173 
174 #ifdef XPTI_ENABLE_INSTRUMENTATION
175 // Using the command group type to create node types for the asynchronous task
176 // graph modeling
177 static std::string commandToNodeType(Command::CommandType Type) {
178  switch (Type) {
179  case Command::CommandType::RUN_CG:
180  return "command_group_node";
181  case Command::CommandType::COPY_MEMORY:
182  return "memory_transfer_node";
183  case Command::CommandType::ALLOCA:
184  return "memory_allocation_node";
185  case Command::CommandType::ALLOCA_SUB_BUF:
186  return "sub_buffer_creation_node";
187  case Command::CommandType::RELEASE:
188  return "memory_deallocation_node";
189  case Command::CommandType::MAP_MEM_OBJ:
190  return "memory_transfer_node";
191  case Command::CommandType::UNMAP_MEM_OBJ:
192  return "memory_transfer_node";
193  case Command::CommandType::UPDATE_REQUIREMENT:
194  return "host_acc_create_buffer_lock_node";
195  case Command::CommandType::EMPTY_TASK:
196  return "host_acc_destroy_buffer_release_node";
197  default:
198  return "unknown_node";
199  }
200 }
201 
202 // The generated names and strings are subject to change to something more
203 // meaningful to end users, as they will be visible in analysis tools that
204 // subscribe to this data.
205 static std::string commandToName(Command::CommandType Type) {
206  switch (Type) {
207  case Command::CommandType::RUN_CG:
208  return "Command Group Action";
209  case Command::CommandType::COPY_MEMORY:
210  return "Memory Transfer (Copy)";
211  case Command::CommandType::ALLOCA:
212  return "Memory Allocation";
213  case Command::CommandType::ALLOCA_SUB_BUF:
214  return "Sub Buffer Creation";
215  case Command::CommandType::RELEASE:
216  return "Memory Deallocation";
217  case Command::CommandType::MAP_MEM_OBJ:
218  return "Memory Transfer (Map)";
219  case Command::CommandType::UNMAP_MEM_OBJ:
220  return "Memory Transfer (Unmap)";
221  case Command::CommandType::UPDATE_REQUIREMENT:
222  return "Host Accessor Creation/Buffer Lock";
223  case Command::CommandType::EMPTY_TASK:
224  return "Host Accessor Destruction/Buffer Lock Release";
225  default:
226  return "Unknown Action";
227  }
228 }
229 #endif
230 
231 std::vector<ur_event_handle_t>
232 Command::getUrEvents(const std::vector<EventImplPtr> &EventImpls) const {
233  std::vector<ur_event_handle_t> RetUrEvents;
234  for (auto &EventImpl : EventImpls) {
235  auto Handle = EventImpl->getHandle();
236  if (Handle == nullptr)
237  continue;
238 
239  // Do not add redundant event dependencies for in-order queues.
240  // At this stage the dependency is guaranteed to be a UR task, so we need to
241  // check whether the current command is a host task; if it is, the UR event
242  // must not be skipped, because task types synchronize differently on an in-order queue.
243  if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue &&
244  MWorkerQueue->isInOrder() && !isHostTask())
245  continue;
246 
247  RetUrEvents.push_back(Handle);
248  }
249 
250  return RetUrEvents;
251 }
252 
253 // This function duplicates much of getUrEvents. It exists as a short-term
254 // solution for the issue that a barrier with a wait list could not handle
255 // empty UR event handles when a kernel was enqueued on host-task
256 // completion.
257 std::vector<ur_event_handle_t> Command::getUrEventsBlocking(
258  const std::vector<EventImplPtr> &EventImpls) const {
259  std::vector<ur_event_handle_t> RetUrEvents;
260  for (auto &EventImpl : EventImpls) {
261  // Throwaway events created with the empty constructor will not have a
262  // context (it is set lazily); calling getContextImpl() would set that
263  // context, which we wish to avoid as it is expensive.
264  // Also skip host-task and NOP events.
265  if (EventImpl->isDefaultConstructed() || EventImpl->isHost() ||
266  EventImpl->isNOP())
267  continue;
268 
269  // If the command has not been enqueued then we have to enqueue it here.
270  // This may happen if asynchronous enqueue in a host task is involved.
271  // Interoperability events are a special case and are not enqueued, as
272  // they don't have an associated queue or command.
273  if (!EventImpl->isInterop() && !EventImpl->isEnqueued()) {
274  if (!EventImpl->getCommand() ||
275  !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
276  continue;
277  std::vector<Command *> AuxCmds;
278  Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
279  BLOCKING);
280  }
281  // Do not add redundant event dependencies for in-order queues.
282  // At this stage the dependency is guaranteed to be a UR task, so we need to
283  // check whether the current command is a host task; if it is, the UR event
284  // must not be skipped, because task types synchronize differently on an in-order queue.
285  if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue &&
286  MWorkerQueue->isInOrder() && !isHostTask())
287  continue;
288 
289  RetUrEvents.push_back(EventImpl->getHandle());
290  }
291 
292  return RetUrEvents;
293 }
294 
295 bool Command::isHostTask() const {
296  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
297  ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
298  CGType::CodeplayHostTask);
299 }
300 
301 bool Command::isFusable() const {
302  if ((MType != CommandType::RUN_CG)) {
303  return false;
304  }
305  const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
306  return (CG.getType() == CGType::Kernel) &&
307  (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative) &&
308  (!static_cast<const CGExecKernel &>(CG).MKernelUsesClusterLaunch);
309 }
310 
311 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
312  const QueueImplPtr &Queue) {
313  for (auto &EventImpl : EventImpls) {
314  EventImpl->flushIfNeeded(Queue);
315  }
316 }
317 
318 namespace {
319 
320 struct EnqueueNativeCommandData {
321  interop_handle ih;
322  std::function<void(interop_handle)> func;
323 };
324 
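// Trampoline passed to the UR native-command entry point below: it unpacks the
// captured interop_handle and invokes the user's interop lambda with it.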
325 void InteropFreeFunc(ur_queue_handle_t, void *InteropData) {
326  auto *Data = reinterpret_cast<EnqueueNativeCommandData *>(InteropData);
327  return Data->func(Data->ih);
328 }
329 } // namespace
330 
331 class DispatchHostTask {
332  ExecCGCommand *MThisCmd;
333  std::vector<interop_handle::ReqToMem> MReqToMem;
334  std::vector<ur_mem_handle_t> MReqUrMem;
335 
336  bool waitForEvents() const {
337  std::map<const PluginPtr, std::vector<EventImplPtr>>
338  RequiredEventsPerPlugin;
339 
340  for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
341  const PluginPtr &Plugin = Event->getPlugin();
342  RequiredEventsPerPlugin[Plugin].push_back(Event);
343  }
344 
345  // Wait for dependency device events.
346  // FIXME The current implementation of waiting for events makes the thread
347  // 'sleep' until all dependency events are complete. We need a more
348  // sophisticated waiting mechanism that allows this thread to be used for
349  // other available work and resumes once all required events are ready.
350  for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
351  std::vector<ur_event_handle_t> RawEvents =
352  MThisCmd->getUrEvents(PluginWithEvents.second);
353  if (RawEvents.size() == 0)
354  continue;
355  try {
356  PluginWithEvents.first->call<UrApiKind::urEventWait>(RawEvents.size(),
357  RawEvents.data());
358  } catch (const sycl::exception &) {
359  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
360  std::current_exception());
361  return false;
362  } catch (...) {
363  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
364  std::current_exception());
365  return false;
366  }
367  }
368 
369  // Wait for dependency host events.
370  // Host events can't throw exceptions, so don't try to catch them.
371  for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
372  Event->waitInternal();
373  }
374 
375  return true;
376  }
377 
378 public:
379  DispatchHostTask(ExecCGCommand *ThisCmd,
380  std::vector<interop_handle::ReqToMem> ReqToMem,
381  std::vector<ur_mem_handle_t> ReqUrMem)
382  : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)),
383  MReqUrMem(std::move(ReqUrMem)) {}
384 
385  void operator()() const {
386  assert(MThisCmd->getCG().getType() == CGType::CodeplayHostTask);
387 
388  CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
389 
390 #ifdef XPTI_ENABLE_INSTRUMENTATION
391  // The host task is executed asynchronously in a separate thread, which
392  // cannot use the code location data stored in TLS. So we keep the submission
393  // code location as a Command field and put it into TLS here so that a thrown
394  // exception can query and report it.
395  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
396  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
397  AsyncCodeLocationPtr.reset(
399  }
400 #endif
401 
402  if (!waitForEvents()) {
403  std::exception_ptr EPtr = std::make_exception_ptr(sycl::exception(
405  std::string("Couldn't wait for host-task's dependencies")));
406 
407  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(EPtr);
408  // reset host-task's lambda and quit
409  HostTask.MHostTask.reset();
411  return;
412  }
413 
414  try {
415  // we're ready to call the user-defined lambda now
416  if (HostTask.MHostTask->isInteropTask()) {
417  assert(HostTask.MQueue &&
418  "Host task submissions should have an associated queue");
419  interop_handle IH{MReqToMem, HostTask.MQueue,
420  HostTask.MQueue->getDeviceImplPtr(),
421  HostTask.MQueue->getContextImplPtr()};
422  // TODO: should all the backends that support this entry point use this
423  // for host task?
424  auto &Queue = HostTask.MQueue;
425  bool NativeCommandSupport = false;
426  Queue->getPlugin()->call<UrApiKind::urDeviceGetInfo>(
427  detail::getSyclObjImpl(Queue->get_device())->getHandleRef(),
428  UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
429  sizeof(NativeCommandSupport), &NativeCommandSupport, nullptr);
430  if (NativeCommandSupport) {
431  EnqueueNativeCommandData CustomOpData{
432  IH, HostTask.MHostTask->MInteropTask};
433 
434  // We are assuming that we have already synchronized with the HT's
435  // dependent events, and that the user will synchronize before the end
436  // of the HT lambda. As such we don't pass in any events, or ask for
437  // one back.
438  //
439  // This entry point is needed in order to migrate memory across
440  // devices in the same context for CUDA and HIP backends
441  Queue->getPlugin()->call<UrApiKind::urEnqueueNativeCommandExp>(
442  HostTask.MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData,
443  MReqUrMem.size(), MReqUrMem.data(), nullptr, 0, nullptr, nullptr);
444  } else {
445  HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(),
446  IH);
447  }
448  } else
449  HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
450  } catch (...) {
451  auto CurrentException = std::current_exception();
452 #ifdef XPTI_ENABLE_INSTRUMENTATION
453  // sycl::exception emits message tracing with the code location if
454  // available. For other exception types we need to explicitly trigger
455  // tracing by calling TraceEventXPTI.
456  if (xptiTraceEnabled()) {
457  try {
458  rethrow_exception(CurrentException);
459  } catch (const sycl::exception &) {
460  // it is already traced, nothing to care about
461  } catch (const std::exception &StdException) {
462  GlobalHandler::instance().TraceEventXPTI(StdException.what());
463  } catch (...) {
464  GlobalHandler::instance().TraceEventXPTI(
465  "Host task lambda thrown non standard exception");
466  }
467  }
468 #endif
469  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
470  CurrentException);
471  }
472 
473  HostTask.MHostTask.reset();
474 
475 #ifdef XPTI_ENABLE_INSTRUMENTATION
476  // The host task is done; clear its submission location so it does not
477  // interfere with the submission of subsequent dependent kernels.
478  AsyncCodeLocationPtr.reset();
479 #endif
480 
481  try {
482  // If we enqueue blocked users, the UR level could throw an exception that
483  // should now be treated as asynchronous.
485  } catch (...) {
486  auto CurrentException = std::current_exception();
487  MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(
488  CurrentException);
489  }
490  }
491 };
492 
493 void Command::waitForPreparedHostEvents() const {
494  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
495  HostEvent->waitInternal();
496 }
497 
498 void Command::waitForEvents(QueueImplPtr Queue,
499  std::vector<EventImplPtr> &EventImpls,
500  ur_event_handle_t &Event) {
501 #ifndef NDEBUG
502  for (const EventImplPtr &Event : EventImpls)
503  assert(!Event->isHost() &&
504  "Only non-host events are expected to be waited for here");
505 #endif
506  if (!EventImpls.empty()) {
507  if (!Queue) {
508  // Host queue can wait for events from different contexts, i.e. it may
509  // contain events with different contexts in its MPreparedDepsEvents.
510  // OpenCL 2.1 spec says that clWaitForEvents will return
511  // CL_INVALID_CONTEXT if events specified in the list do not belong to
512  // the same context. Thus we split all the events into a per-context map.
513  // An example. We have two queues for the same CPU device: Q1, Q2. Thus
514  // we will have two different contexts for the same CPU device: C1, C2.
515  // Also we have default host queue. This queue is accessible via
516  // Scheduler. Now, let's assume we have three different events: E1(C1),
517  // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
518  // three events (E1, E2, E3). Now, if urEventWait is called for all
519  // three events we'll experience failure with CL_INVALID_CONTEXT because
520  // these events refer to different contexts.
521  std::map<context_impl *, std::vector<EventImplPtr>>
522  RequiredEventsPerContext;
523 
524  for (const EventImplPtr &Event : EventImpls) {
525  ContextImplPtr Context = Event->getContextImpl();
526  assert(Context.get() &&
527  "Only non-host events are expected to be waited for here");
528  RequiredEventsPerContext[Context.get()].push_back(Event);
529  }
530 
531  for (auto &CtxWithEvents : RequiredEventsPerContext) {
532  std::vector<ur_event_handle_t> RawEvents =
533  getUrEvents(CtxWithEvents.second);
534  if (!RawEvents.empty()) {
535  CtxWithEvents.first->getPlugin()->call<UrApiKind::urEventWait>(
536  RawEvents.size(), RawEvents.data());
537  }
538  }
539  } else {
540  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
541  flushCrossQueueDeps(EventImpls, MWorkerQueue);
542  const PluginPtr &Plugin = Queue->getPlugin();
543 
544  if (MEvent != nullptr)
545  MEvent->setHostEnqueueTime();
546  Plugin->call<UrApiKind::urEnqueueEventsWait>(
547  Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
548  }
549  }
550 }
551 
555 Command::Command(
556  CommandType Type, QueueImplPtr Queue,
557  ur_exp_command_buffer_handle_t CommandBuffer,
558  const std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints)
559  : MQueue(std::move(Queue)),
560  MEvent(std::make_shared<detail::event_impl>(MQueue)),
561  MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
562  MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
563  MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
564  MWorkerQueue = MQueue;
565  MEvent->setWorkerQueue(MWorkerQueue);
566  MEvent->setSubmittedQueue(MWorkerQueue);
567  MEvent->setCommand(this);
568  if (MQueue)
569  MEvent->setContextImpl(MQueue->getContextImplPtr());
570  MEvent->setStateIncomplete();
572 
573 #ifdef XPTI_ENABLE_INSTRUMENTATION
574  if (!xptiTraceEnabled())
575  return;
576  // Obtain the stream ID so all commands can emit traces to that stream
577  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
578 #endif
579 }
580 
581 void Command::emitInstrumentationDataProxy() {
582 #ifdef XPTI_ENABLE_INSTRUMENTATION
583  emitInstrumentationData();
584 #endif
585 }
586 
597 void Command::emitEdgeEventForCommandDependence(
598  Command *Cmd, void *ObjAddr, bool IsCommand,
599  std::optional<access::mode> AccMode) {
600 #ifdef XPTI_ENABLE_INSTRUMENTATION
601  // Bail early if either the source or the target node for the given
602  // dependency is undefined or NULL
603  constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
604  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
605  MTraceEvent && Cmd && Cmd->MTraceEvent))
606  return;
607 
608  // If all the information we need for creating an edge event is available,
609  // then go ahead with creating it; if not, bail early!
610  xpti::utils::StringHelper SH;
611  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
612  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
613  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
614  // Create an edge with the dependent buffer address for which a command
615  // object has been created as one of the properties of the edge
616  xpti::payload_t Payload(TypeString.c_str(), MAddress);
617  uint64_t EdgeInstanceNo;
618  xpti_td *EdgeEvent =
619  xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
620  xpti_at::active, &EdgeInstanceNo);
621  if (EdgeEvent) {
622  xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
623  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
624  EdgeEvent->source_id = SrcEvent->unique_id;
625  EdgeEvent->target_id = TgtEvent->unique_id;
626  if (IsCommand) {
627  xpti::addMetadata(EdgeEvent, "access_mode",
628  static_cast<int>(AccMode.value()));
629  xpti::addMetadata(EdgeEvent, "memory_object",
630  reinterpret_cast<size_t>(ObjAddr));
631  } else {
632  xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
633  }
634  xptiNotifySubscribers(MStreamID, NotificationTraceType,
635  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
636  nullptr);
637  }
638  // General comment - none of these are serious errors, as the instrumentation
639  // layer MUST be tolerant of errors. If we need to let the end user know, we
640  // can throw exceptions in the future.
641 #endif
642 }
643 
648 void Command::emitEdgeEventForEventDependence(Command *Cmd,
649  ur_event_handle_t &UrEventAddr) {
650 #ifdef XPTI_ENABLE_INSTRUMENTATION
651  // If we have failed to create an event to represent the Command, then we
652  // cannot emit an edge event. Bail early!
653  if (!(xptiCheckTraceEnabled(MStreamID) && MTraceEvent))
654  return;
655 
656  if (Cmd && Cmd->MTraceEvent) {
657  // If the event is associated with a command, we use this command's trace
658  // event as the source of edge, hence modeling the control flow
659  emitEdgeEventForCommandDependence(Cmd, (void *)UrEventAddr, false);
660  return;
661  }
662  if (UrEventAddr) {
663  xpti::utils::StringHelper SH;
664  std::string AddressStr = SH.addressAsString<ur_event_handle_t>(UrEventAddr);
665  // This is the case when it is an OCL event enqueued by the user, or another
666  // event is registered by the runtime as a dependency. The dependency on
667  // this occasion is an OCL event, so we build a virtual node in the graph
668  // with the event as the metadata for the node.
669  std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
670 
671  // Node name is "virtual_node[<event_addr>]"
672  xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
673  uint64_t VNodeInstanceNo;
674  xpti_td *NodeEvent =
675  xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
676  xpti_at::active, &VNodeInstanceNo);
677  // Emit the virtual node first
678  xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
679  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
680  detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
681  nullptr);
682  // Create a new event for the edge
683  std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
684  xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
685  uint64_t EdgeInstanceNo;
686  xpti_td *EdgeEvent =
687  xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
688  xpti_at::active, &EdgeInstanceNo);
689  if (EdgeEvent && NodeEvent) {
690  // Source node represents the event and this event needs to be completed
691  // before target node can execute
692  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
693  EdgeEvent->source_id = NodeEvent->unique_id;
694  EdgeEvent->target_id = TgtEvent->unique_id;
695  xpti::addMetadata(EdgeEvent, "event",
696  reinterpret_cast<size_t>(UrEventAddr));
697  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
698  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
699  nullptr);
700  }
701  return;
702  }
703 #endif
704 }
705 
706 uint64_t Command::makeTraceEventProlog(void *MAddress) {
707  uint64_t CommandInstanceNo = 0;
708 #ifdef XPTI_ENABLE_INSTRUMENTATION
709  if (!xptiCheckTraceEnabled(MStreamID))
710  return CommandInstanceNo;
711 
713  // Setup the member variables with information needed for event notification
714  MCommandNodeType = commandToNodeType(MType);
715  MCommandName = commandToName(MType);
716  xpti::utils::StringHelper SH;
717  MAddressString = SH.addressAsString<void *>(MAddress);
718  std::string CommandString =
719  SH.nameWithAddressString(MCommandName, MAddressString);
720 
721  xpti::payload_t p(CommandString.c_str(), MAddress);
722  xpti_td *CmdTraceEvent =
723  xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
724  xpti_at::active, &CommandInstanceNo);
725  MInstanceID = CommandInstanceNo;
726  if (CmdTraceEvent) {
727  MTraceEvent = (void *)CmdTraceEvent;
728  // If we are seeing this event again, then the instance ID will be greater
729  // than 1; in the previous implementation, we would skip sending
730  // notifications for subsequent instances. With the new implementation, we
731  // will send a notification for each instance as this allows for mutable
732  // metadata entries for multiple visits to the same code location and
733  // maintaining data integrity.
734  }
735 #endif
736  return CommandInstanceNo;
737 }
738 
739 void Command::makeTraceEventEpilog() {
740 #ifdef XPTI_ENABLE_INSTRUMENTATION
741  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
742  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) && MTraceEvent))
743  return;
745  xptiNotifySubscribers(MStreamID, NotificationTraceType,
746  detail::GSYCLGraphEvent,
747  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
748  static_cast<const void *>(MCommandNodeType.c_str()));
749 #endif
750 }
751 
752 Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
753  std::vector<Command *> &ToCleanUp) {
754  const ContextImplPtr &WorkerContext = getWorkerContext();
755 
756  // 1. Non-host events can be ignored if they are not fully initialized.
757  // 2. Some types of commands do not produce UR events after they are
758  // enqueued (e.g. alloca). Note that we can't check the ur event to make that
759  // distinction since the command might still be unenqueued at this point.
760  bool PiEventExpected =
761  (!DepEvent->isHost() && !DepEvent->isDefaultConstructed());
762  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
763  PiEventExpected &= DepCmd->producesPiEvent();
764 
765  if (!PiEventExpected) {
766  // call to waitInternal() is in waitForPreparedHostEvents() as it's called
767  // from enqueue process functions
768  MPreparedHostDepsEvents.push_back(DepEvent);
769  return nullptr;
770  }
771 
772  Command *ConnectionCmd = nullptr;
773 
774  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
775  // If contexts don't match we'll connect them using host task
776  if (DepEventContext != WorkerContext && WorkerContext) {
777  Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
778  ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
779  } else
780  MPreparedDepsEvents.push_back(std::move(DepEvent));
781 
782  return ConnectionCmd;
783 }
784 
785 ContextImplPtr Command::getWorkerContext() const {
786  if (!MQueue)
787  return nullptr;
788  return MQueue->getContextImplPtr();
789 }
790 
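// Base-class defaults: most commands produce a UR event and can be cleaned up
// after enqueue; allocation and release commands override these below to
// return false.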
791 bool Command::producesPiEvent() const { return true; }
792 
793 bool Command::supportsPostEnqueueCleanup() const { return true; }
794 
795 bool Command::readyForCleanup() const {
796  return MLeafCounter == 0 &&
797  MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
798 }
799 
800 Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
801  Command *ConnectionCmd = nullptr;
802 
803  if (NewDep.MDepCommand) {
804  ConnectionCmd =
805  processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
806  }
807  // ConnectionCmd insertion builds the following dependency structure:
808  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep,
809  // which means that this and NewDep are already dependent.
810  if (!ConnectionCmd) {
811  MDeps.push_back(NewDep);
812  if (NewDep.MDepCommand)
813  NewDep.MDepCommand->addUser(this);
814  }
815 
816 #ifdef XPTI_ENABLE_INSTRUMENTATION
817  emitEdgeEventForCommandDependence(NewDep.MDepCommand,
818  (void *)NewDep.MDepRequirement->MSYCLMemObj,
819  true, NewDep.MDepRequirement->MAccessMode);
820 #endif
821 
822  return ConnectionCmd;
823 }
824 
826  std::vector<Command *> &ToCleanUp) {
827 #ifdef XPTI_ENABLE_INSTRUMENTATION
828  // We need this only for instrumentation, so guarding it prevents
829  // unused-variable warnings when instrumentation is turned off.
830  Command *Cmd = (Command *)Event->getCommand();
831  ur_event_handle_t UrEventAddr = Event->getHandle();
832  // Now make an edge for the dependent event
833  emitEdgeEventForEventDependence(Cmd, UrEventAddr);
834 #endif
835 
836  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
837  ToCleanUp);
838 }
839 
840 void Command::emitEnqueuedEventSignal(const ur_event_handle_t UrEventAddr) {
841 #ifdef XPTI_ENABLE_INSTRUMENTATION
842  emitInstrumentationGeneral(
843  MStreamID, MInstanceID, static_cast<xpti_td *>(MTraceEvent),
844  xpti::trace_signal, static_cast<const void *>(UrEventAddr));
845 #endif
846  std::ignore = UrEventAddr;
847 }
848 
849 void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
850 #ifdef XPTI_ENABLE_INSTRUMENTATION
851  return emitInstrumentationGeneral(MStreamID, MInstanceID,
852  static_cast<xpti_td *>(MTraceEvent), Type,
853  static_cast<const void *>(Txt));
854 #else
855  std::ignore = Type;
856  std::ignore = Txt;
857 #endif
858 }
859 
860 bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
861  std::vector<Command *> &ToCleanUp) {
862 #ifdef XPTI_ENABLE_INSTRUMENTATION
863  // If the command is enqueued from a host-task thread, it will not have a
864  // valid submission code location set. So we set it manually to properly
865  // trace failures if the UR level reports any.
866  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
867  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
868  AsyncCodeLocationPtr.reset(
870  }
871 #endif
872  // Exit if already enqueued
874  return true;
875 
876  // If the command is blocked from enqueueing
878  // Exit if enqueue type is not blocking
879  if (!Blocking) {
880  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
881  return false;
882  }
883 
884 #ifdef XPTI_ENABLE_INSTRUMENTATION
885  // Scoped trace event notifier that emits a barrier begin and barrier end
886  // event, which models the barrier while enqueuing along with the blocked
887  // reason, as determined by the scheduler
888  std::string Info = "enqueue.barrier[";
889  Info += std::string(getBlockReason()) + "]";
890  emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
891 #endif
892 
893  // Wait if blocking
895  ;
896 #ifdef XPTI_ENABLE_INSTRUMENTATION
897  emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
898 #endif
899  }
900 
901  std::lock_guard<std::mutex> Lock(MEnqueueMtx);
902 
903  // Exit if the command is already enqueued
905  return true;
906 
907 #ifdef XPTI_ENABLE_INSTRUMENTATION
908  emitInstrumentation(xpti::trace_task_begin, nullptr);
909 #endif
910 
912  EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
913  return false;
914  }
915 
916  // Command status set to "failed" beforehand, so this command
917  // has already been marked as "failed" if enqueueImp throws an exception.
918  // This will avoid execution of the same failed command twice.
921  ur_result_t Res = enqueueImp();
922 
923  if (UR_RESULT_SUCCESS != Res)
924  EnqueueResult =
926  else {
927  MEvent->setEnqueued();
929  (MEvent->isHost() || MEvent->getHandle() == nullptr))
930  MEvent->setComplete();
931 
932  // Consider the command is successfully enqueued if return code is
933  // UR_RESULT_SUCCESS
938  assert(!MMarkedForCleanup);
939  MMarkedForCleanup = true;
940  ToCleanUp.push_back(this);
941  }
942  }
943 
944  // Emit this correlation signal before the task end
945  emitEnqueuedEventSignal(MEvent->getHandle());
946 #ifdef XPTI_ENABLE_INSTRUMENTATION
947  emitInstrumentation(xpti::trace_task_end, nullptr);
948 #endif
950 }
951 
952 void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
953 #ifdef XPTI_ENABLE_INSTRUMENTATION
954  assert(MType == CommandType::RELEASE && "Expected release command");
955  if (!MTraceEvent)
956  return;
957  // The current command is the target node for all dependencies as the source
958  // nodes have to be completed first before the current node can begin to
959  // execute; these edges model control flow
960  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
961  // We have all the Commands that must be completed before the release
962  // command can be enqueued; here we'll find the command that is an Alloca
963  // with the same SYCLMemObject address and create a dependency line (edge)
964  // between them in our semantic modeling.
965  for (auto &Item : DepList) {
966  if (Item->MTraceEvent && Item->MAddress == MAddress) {
967  xpti::utils::StringHelper SH;
968  std::string AddressStr = SH.addressAsString<void *>(MAddress);
969  std::string TypeString =
970  "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
971 
972  // Create an edge with the dependent buffer address being one of the
973  // properties of the edge
974  xpti::payload_t p(TypeString.c_str(), MAddress);
975  uint64_t EdgeInstanceNo;
976  xpti_td *EdgeEvent =
977  xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
978  xpti_at::active, &EdgeInstanceNo);
979  if (EdgeEvent) {
980  xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
981  EdgeEvent->target_id = TgtTraceEvent->unique_id;
982  EdgeEvent->source_id = SrcTraceEvent->unique_id;
983  xpti::addMetadata(EdgeEvent, "memory_object",
984  reinterpret_cast<size_t>(MAddress));
985  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
986  detail::GSYCLGraphEvent, EdgeEvent,
987  EdgeInstanceNo, nullptr);
988  }
989  }
990  }
991 #endif
992 }
993 
994 const char *Command::getBlockReason() const {
995  switch (MBlockReason) {
996  case BlockReason::HostAccessor:
997  return "A Buffer is locked by the host accessor";
998  case BlockReason::HostTask:
999  return "Blocked by host task";
1000  default:
1001  return "Unknown block reason";
1002  }
1003 }
1004 
1006 #ifdef XPTI_ENABLE_INSTRUMENTATION
1007  if (!xptiTraceEnabled())
1008  return;
1009 
1010  detail::tls_code_loc_t Tls;
1011  auto TData = Tls.query();
1012  if (TData.fileName())
1013  MSubmissionFileName = TData.fileName();
1014  if (TData.functionName())
1015  MSubmissionFunctionName = TData.functionName();
1016  if (MSubmissionFileName.size() || MSubmissionFunctionName.size())
1019  (int)TData.lineNumber(), (int)TData.columnNumber()};
1020 #endif
1021 }
1022 
1023 AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
1024  Requirement Req,
1025  AllocaCommandBase *LinkedAllocaCmd,
1026  bool IsConst)
1027  : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
1028  MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
1029  MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
1032 }
1033 
1035 #ifdef XPTI_ENABLE_INSTRUMENTATION
1036  if (!xptiCheckTraceEnabled(MStreamID))
1037  return;
1038  // Create a payload with the command name and an event using this payload to
1039  // emit a node_create
1042  // Set the relevant meta data properties for this command; in the 64-bit key
1043  // based implementation, we would notify the graph events only for the first
1044  // instance as the trace event structure was invariant across all instances.
1045  // Due to mutable metadata requirements, we now create and notify them for all
1046  // instances. In addition to this, we have moved to 128-bit keys in the XPTI
1047  // internal infrastructure to guarantee collision free universal IDs.
1048  if (MTraceEvent) {
1049  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1050  addDeviceMetadata(TE, MQueue);
1051  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1052  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1053  // as this data is mutable and the metadata is supposed to be invariant
1054  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1055  getQueueID(MQueue));
1056  }
1057 #endif
1058 }
1059 
1060 bool AllocaCommandBase::producesPiEvent() const { return false; }
1061 
1062 bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }
1063 
1064 bool AllocaCommandBase::readyForCleanup() const { return false; }
1065 
1066 AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
1067  bool InitFromUserData,
1068  AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
1069  : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
1070  LinkedAllocaCmd, IsConst),
1071  MInitFromUserData(InitFromUserData) {
1072  // Node event must be created before the dependent edge is added to this
1073  // node, so this call must be before the addDep() call.
1075  // "Nothing to depend on"
1076  std::vector<Command *> ToCleanUp;
1077  Command *ConnectionCmd =
1078  addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
1079  assert(ConnectionCmd == nullptr);
1080  assert(ToCleanUp.empty());
1081  (void)ConnectionCmd;
1082 }
1083 
1085 #ifdef XPTI_ENABLE_INSTRUMENTATION
1086  if (!xptiCheckTraceEnabled(MStreamID))
1087  return;
1088 
1090 #endif
1091 }
1092 
1093 ur_result_t AllocaCommand::enqueueImp() {
1094  waitForPreparedHostEvents();
1095  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1096 
1097  ur_event_handle_t UREvent = nullptr;
1098 
1099  void *HostPtr = nullptr;
1100  if (!MIsLeaderAlloca) {
1101 
1102  if (!MQueue) {
1103  // Do not need to make allocation if we have a linked device allocation
1104  Command::waitForEvents(MQueue, EventImpls, UREvent);
1105  MEvent->setHandle(UREvent);
1106 
1107  return UR_RESULT_SUCCESS;
1108  }
1109  HostPtr = MLinkedAllocaCmd->getMemAllocation();
1110  }
1111  // TODO: Check whether it is correct to use std::move on a stack variable
1112  // and delete the RawEvents below.
1114  MInitFromUserData, HostPtr,
1115  std::move(EventImpls), UREvent);
1116  MEvent->setHandle(UREvent);
1117  return UR_RESULT_SUCCESS;
1118 }
1119 
1120 void AllocaCommand::printDot(std::ostream &Stream) const {
1121  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1122 
1123  Stream << "ID = " << this << "\\n";
1124  Stream << "ALLOCA ON " << queueDeviceToString(MQueue.get()) << "\\n";
1125  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1126  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
1127  Stream << "\"];" << std::endl;
1128 
1129  for (const auto &Dep : MDeps) {
1130  if (Dep.MDepCommand == nullptr)
1131  continue;
1132  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1133  << " [ label = \"Access mode: "
1134  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1135  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1136  << std::endl;
1137  }
1138 }
1139 
1140 AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
1141  AllocaCommandBase *ParentAlloca,
1142  std::vector<Command *> &ToEnqueue,
1143  std::vector<Command *> &ToCleanUp)
1144  : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
1145  std::move(Req),
1146  /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
1147  MParentAlloca(ParentAlloca) {
1148  // Node event must be created before the dependent edge
1149  // is added to this node, so this call must be before
1150  // the addDep() call.
1152  Command *ConnectionCmd = addDep(
1153  DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
1154  if (ConnectionCmd)
1155  ToEnqueue.push_back(ConnectionCmd);
1156 }
1157 
1159 #ifdef XPTI_ENABLE_INSTRUMENTATION
1160  if (!xptiCheckTraceEnabled(MStreamID))
1161  return;
1162 
1163  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1164  xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
1165  xpti::addMetadata(TE, "access_range_start",
1166  this->MRequirement.MAccessRange[0]);
1167  xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]);
1168  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1170 #endif
1171 }
1172 
1173 void *AllocaSubBufCommand::getMemAllocation() const {
1174  // In some cases the parent's memory allocation might change (e.g., after
1175  // map/unmap operations). If the parent's memory allocation changes, the
1176  // sub-buffer memory allocation should be changed as well.
1177  if (!MQueue) {
1178  return static_cast<void *>(
1179  static_cast<char *>(MParentAlloca->getMemAllocation()) +
1180  MRequirement.MOffsetInBytes);
1181  }
1182  return MMemAllocation;
1183 }
1184 
1185 ur_result_t AllocaSubBufCommand::enqueueImp() {
1186  waitForPreparedHostEvents();
1187  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1188  ur_event_handle_t UREvent = nullptr;
1189 
1191  getContext(MQueue), MParentAlloca->getMemAllocation(),
1193  MRequirement.MAccessRange, std::move(EventImpls), UREvent);
1194  MEvent->setHandle(UREvent);
1195 
1197  MMemAllocation);
1198  return UR_RESULT_SUCCESS;
1199 }
1200 
1201 void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
1202  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1203 
1204  Stream << "ID = " << this << "\\n";
1205  Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue.get()) << "\\n";
1206  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1207  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
1208  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
1209  Stream << "\"];" << std::endl;
1210 
1211  for (const auto &Dep : MDeps) {
1212  if (Dep.MDepCommand == nullptr)
1213  continue;
1214  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1215  << " [ label = \"Access mode: "
1216  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1217  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1218  << std::endl;
1219  }
1220 }
1221 
1222 ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
1223  : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
1225 }
1226 
1228 #ifdef XPTI_ENABLE_INSTRUMENTATION
1229  if (!xptiCheckTraceEnabled(MStreamID))
1230  return;
1231  // Create a payload with the command name and an event using this payload to
1232  // emit a node_create
1233  MAddress = MAllocaCmd->getSYCLMemObj();
1235 
1236  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1237  addDeviceMetadata(TE, MQueue);
1238  xpti::addMetadata(TE, "allocation_type",
1239  commandToName(MAllocaCmd->getType()));
1240  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1241  // as this data is mutable and the metadata is supposed to be invariant
1242  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1244 #endif
1245 }
1246 
1247 ur_result_t ReleaseCommand::enqueueImp() {
1248  waitForPreparedHostEvents();
1249  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1250  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1251  bool SkipRelease = false;
1252 
1253  // On host side we only allocate memory for full buffers.
1254  // Thus, deallocating sub buffers leads to double memory freeing.
1255  SkipRelease |= !MQueue && MAllocaCmd->getType() == ALLOCA_SUB_BUF;
1256 
1257  const bool CurAllocaIsHost = !MAllocaCmd->getQueue();
1258  bool NeedUnmap = false;
1259  if (MAllocaCmd->MLinkedAllocaCmd) {
1260 
1261  // When releasing one of the "linked" allocations special rules take
1262  // place:
1263  // 1. Device allocation should always be released.
1264  // 2. Host allocation should be released if host allocation is "leader".
1265  // 3. Device alloca in the pair should be in active state in order to be
1266  // correctly released.
1267 
1268  // There is no actual memory allocation if a host alloca command is
1269  // created being linked to a device allocation.
1270  SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;
1271 
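// An unmap is needed whenever the host allocation of the linked pair is the
// active one: either the host alloca being released is active, or the device
// alloca being released is inactive (its linked host alloca is active).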
1272  NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
1273  }
1274 
1275  if (NeedUnmap) {
1276  const QueueImplPtr &Queue = CurAllocaIsHost
1277  ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
1278  : MAllocaCmd->getQueue();
1279 
1280  EventImplPtr UnmapEventImpl(new event_impl(Queue));
1281  UnmapEventImpl->setContextImpl(getContext(Queue));
1282  UnmapEventImpl->setStateIncomplete();
1283  ur_event_handle_t UREvent = nullptr;
1284 
1285  void *Src = CurAllocaIsHost
1286  ? MAllocaCmd->getMemAllocation()
1287  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1288 
1289  void *Dst = !CurAllocaIsHost
1290  ? MAllocaCmd->getMemAllocation()
1291  : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();
1292 
1293  MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
1294  RawEvents, UREvent);
1295  UnmapEventImpl->setHandle(UREvent);
1296  std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
1297  EventImpls.clear();
1298  EventImpls.push_back(UnmapEventImpl);
1299  }
1300  ur_event_handle_t UREvent = nullptr;
1301  if (SkipRelease)
1302  Command::waitForEvents(MQueue, EventImpls, UREvent);
1303  else {
1304  MemoryManager::release(getContext(MQueue), MAllocaCmd->getSYCLMemObj(),
1305  MAllocaCmd->getMemAllocation(),
1306  std::move(EventImpls), UREvent);
1307  }
1308  MEvent->setHandle(UREvent);
1309  return UR_RESULT_SUCCESS;
1310 }
1311 
1312 void ReleaseCommand::printDot(std::ostream &Stream) const {
1313  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1314 
1315  Stream << "ID = " << this << " ; ";
1316  Stream << "RELEASE ON " << queueDeviceToString(MQueue.get()) << "\\n";
1317  Stream << " Alloca : " << MAllocaCmd << "\\n";
1318  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1319  Stream << "\"];" << std::endl;
1320 
1321  for (const auto &Dep : MDeps) {
1322  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1323  << " [ label = \"Access mode: "
1324  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1325  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1326  << std::endl;
1327  }
1328 }
1329 
1330 bool ReleaseCommand::producesPiEvent() const { return false; }
1331 
1332 bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
1333 
1334 bool ReleaseCommand::readyForCleanup() const { return false; }
1335 
1336 MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
1337  void **DstPtr, QueueImplPtr Queue,
1338  access::mode MapMode)
1339  : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
1340  MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
1341  MMapMode(MapMode) {
1343 }
1344 
1346 #ifdef XPTI_ENABLE_INSTRUMENTATION
1347  if (!xptiCheckTraceEnabled(MStreamID))
1348  return;
1349  // Create a payload with the command name and an event using this payload to
1350  // emit a node_create
1351  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1353 
1354  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1355  addDeviceMetadata(TE, MQueue);
1356  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1357  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1358  // as this data is mutable and the metadata is supposed to be invariant
1359  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1361 #endif
1362 }
1363 
1364 ur_result_t MapMemObject::enqueueImp() {
1365  waitForPreparedHostEvents();
1366  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1367  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1368  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1369 
1370  ur_event_handle_t UREvent = nullptr;
1371  *MDstPtr = MemoryManager::map(
1372  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
1373  MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1374  MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), UREvent);
1375  MEvent->setHandle(UREvent);
1376  return UR_RESULT_SUCCESS;
1377 }
1378 
1379 void MapMemObject::printDot(std::ostream &Stream) const {
1380  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1381 
1382  Stream << "ID = " << this << " ; ";
1383  Stream << "MAP ON " << queueDeviceToString(MQueue.get()) << "\\n";
1384 
1385  Stream << "\"];" << std::endl;
1386 
1387  for (const auto &Dep : MDeps) {
1388  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1389  << " [ label = \"Access mode: "
1390  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1391  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1392  << std::endl;
1393  }
1394 }
1395 
1396 UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req,
1397  void **SrcPtr, QueueImplPtr Queue)
1398  : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
1399  MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
1401 }
1402 
1404 #ifdef XPTI_ENABLE_INSTRUMENTATION
1405  if (!xptiCheckTraceEnabled(MStreamID))
1406  return;
1407  // Create a payload with the command name and an event using this payload to
1408  // emit a node_create
1409  MAddress = MDstAllocaCmd->getSYCLMemObj();
1411 
1412  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1413  addDeviceMetadata(TE, MQueue);
1414  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1415  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1416  // as this data is mutable and the metadata is supposed to be invariant
1417  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1419 #endif
1420 }
1421 
1422 bool UnMapMemObject::producesPiEvent() const {
1423  // TODO remove this workaround once the batching issue is addressed in Level
1424  // Zero plugin.
1425  // Consider the following scenario on Level Zero:
1426  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1427  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1428  // 3. queueA.wait().
1429  // 4. queueB.wait().
1430  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1431  // dependencies (i.e. they were waited for prior to enqueueing any command
1432  // that's dependent on them). This allowed Level Zero plugin to detect that
1433  // each queue is idle on steps 1/2 and submit the command list right away.
1434  // This is no longer the case since we started passing these dependencies in
1435  // an event waitlist and Level Zero plugin attempts to batch these commands,
1436  // so the execution of kernel B starts only on step 4. This workaround
1437  // restores the old behavior in this case until this is resolved.
1438  return MQueue && (MQueue->getDeviceImplPtr()->getBackend() !=
1439  backend::ext_oneapi_level_zero ||
1440  MEvent->getHandle() != nullptr);
1441 }
1442 
1443 ur_result_t UnMapMemObject::enqueueImp() {
1444  waitForPreparedHostEvents();
1445  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1446  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1447  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1448 
1449  ur_event_handle_t UREvent = nullptr;
1450  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
1451  MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
1452  std::move(RawEvents), UREvent);
1453  MEvent->setHandle(UREvent);
1454 
1455  return UR_RESULT_SUCCESS;
1456 }
1457 
1458 void UnMapMemObject::printDot(std::ostream &Stream) const {
1459  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1460 
1461  Stream << "ID = " << this << " ; ";
1462  Stream << "UNMAP ON " << queueDeviceToString(MQueue.get()) << "\\n";
1463 
1464  Stream << "\"];" << std::endl;
1465 
1466  for (const auto &Dep : MDeps) {
1467  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1468  << " [ label = \"Access mode: "
1469  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1470  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1471  << std::endl;
1472  }
1473 }
1474 
1475 MemCpyCommand::MemCpyCommand(Requirement SrcReq,
1476  AllocaCommandBase *SrcAllocaCmd,
1477  Requirement DstReq,
1478  AllocaCommandBase *DstAllocaCmd,
1479  QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
1480  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1481  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1482  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
1483  MDstAllocaCmd(DstAllocaCmd) {
1484  if (MSrcQueue) {
1485  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1486  }
1487 
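// Prefer the destination queue as the worker; if the destination is the host
// (no queue), fall back to the source queue so the copy is still submitted to
// a device queue.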
1488  MWorkerQueue = !MQueue ? MSrcQueue : MQueue;
1489  MEvent->setWorkerQueue(MWorkerQueue);
1490 
1492 }
1493 
1495 #ifdef XPTI_ENABLE_INSTRUMENTATION
1496  if (!xptiCheckTraceEnabled(MStreamID))
1497  return;
1498  // Create a payload with the command name and an event using this payload to
1499  // emit a node_create
1500  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1502 
1503  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1504  addDeviceMetadata(CmdTraceEvent, MQueue);
1505  xpti::addMetadata(CmdTraceEvent, "memory_object",
1506  reinterpret_cast<size_t>(MAddress));
1507  xpti::addMetadata(CmdTraceEvent, "copy_from",
1508  MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1509  xpti::addMetadata(CmdTraceEvent, "copy_to",
1510  MQueue ? deviceToID(MQueue->get_device()) : 0);
1511  // Since we do NOT add queue_id value to metadata, we are stashing it to TLS
1512  // as this data is mutable and the metadata is supposed to be invariant
1513  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1515 #endif
1516 }
1517 
1518 ContextImplPtr MemCpyCommand::getWorkerContext() const {
1519  if (!MWorkerQueue)
1520  return nullptr;
1521  return MWorkerQueue->getContextImplPtr();
1522 }
1523 
1524 bool MemCpyCommand::producesPiEvent() const {
1525  // TODO remove this workaround once the batching issue is addressed in Level
1526  // Zero plugin.
1527  // Consider the following scenario on Level Zero:
1528  // 1. Kernel A, which uses buffer A, is submitted to queue A.
1529  // 2. Kernel B, which uses buffer B, is submitted to queue B.
1530  // 3. queueA.wait().
1531  // 4. queueB.wait().
1532  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
1533  // dependencies (i.e. they were waited for prior to enqueueing any command
1534  // that's dependent on them). This allowed Level Zero plugin to detect that
1535  // each queue is idle on steps 1/2 and submit the command list right away.
1536  // This is no longer the case since we started passing these dependencies in
1537  // an event waitlist and Level Zero plugin attempts to batch these commands,
1538  // so the execution of kernel B starts only on step 4. This workaround
1539  // restores the old behavior in this case until this is resolved.
1540  return !MQueue ||
1541  MQueue->getDeviceImplPtr()->getBackend() !=
1542  backend::ext_oneapi_level_zero ||
1543  MEvent->getHandle() != nullptr;
1544 }
1545 
1546 ur_result_t MemCpyCommand::enqueueImp() {
1547  waitForPreparedHostEvents();
1548  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1549 
1550  ur_event_handle_t UREvent = nullptr;
1551 
1552  auto RawEvents = getUrEvents(EventImpls);
1553  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1554 
1555  MemoryManager::copy(
1556  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1557  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1558  MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
1559  MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
1560  MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), UREvent,
1561  MEvent);
1562  MEvent->setHandle(UREvent);
1563  return UR_RESULT_SUCCESS;
1564 }
1565 
1566 void MemCpyCommand::printDot(std::ostream &Stream) const {
1567  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1568 
1569  Stream << "ID = " << this << " ; ";
1570  Stream << "MEMCPY ON " << queueDeviceToString(MQueue.get()) << "\\n";
1571  Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n";
1572  Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n";
1573 
1574  Stream << "\"];" << std::endl;
1575 
1576  for (const auto &Dep : MDeps) {
1577  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1578  << " [ label = \"Access mode: "
1579  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1580  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1581  << std::endl;
1582  }
1583 }
1584 
1585 AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
1586  for (const DepDesc &Dep : MDeps) {
1587  if (Dep.MDepRequirement == Req)
1588  return Dep.MAllocaCmd;
1589  }
1590  // Default constructed accessors do not add dependencies, but they can be
1591  // passed to commands. Simply return nullptr, since they are empty and don't
1592  // really require any memory.
1593  return nullptr;
1594 }
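// Illustrative user-side sketch (not part of this file): a default constructed
// accessor is empty and adds no requirement, so no DepDesc is created for it
// and getAllocaForReq above legitimately returns nullptr. Q and Range are
// placeholders.
//
//   sycl::accessor<int, 1, sycl::access_mode::read_write> Empty; // no buffer
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.parallel_for(Range, [=](sycl::id<1> I) {
//       if (!Empty.empty())
//         Empty[I] = 0; // never taken for a default constructed accessor
//     });
//   });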
1595 
1596 std::vector<std::shared_ptr<const void>>
1598  if (MCommandGroup->getType() == CGType::Kernel)
1599  return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1600  return {};
1601 }
1602 
1604  if (MCommandGroup->getType() == CGType::Kernel)
1605  ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
1606 }
1607 
1608 ur_result_t UpdateHostRequirementCommand::enqueueImp() {
1610  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1611  ur_event_handle_t UREvent = nullptr;
1612  Command::waitForEvents(MQueue, EventImpls, UREvent);
1613  MEvent->setHandle(UREvent);
1614 
1615  assert(MSrcAllocaCmd && "Expected valid alloca command");
1616  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1617  assert(MDstPtr && "Expected valid target pointer");
1618  *MDstPtr = MSrcAllocaCmd->getMemAllocation();
1619 
1620  return UR_RESULT_SUCCESS;
1621 }
1622 
1623 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1624  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1625 
1626  Stream << "ID = " << this << "\\n";
1627  Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue.get()) << "\\n";
1628  bool IsReqOnBuffer =
1629  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1630  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1631  if (IsReqOnBuffer)
1632  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1633  << "\\n";
1634 
1635  Stream << "\"];" << std::endl;
1636 
1637  for (const auto &Dep : MDeps) {
1638  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1639  << " [ label = \"Access mode: "
1640  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1641  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1642  << std::endl;
1643  }
1644 }
1645 
1647  AllocaCommandBase *SrcAllocaCmd,
1648  Requirement DstReq, void **DstPtr,
1649  QueueImplPtr SrcQueue,
1650  QueueImplPtr DstQueue)
1651  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1652  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1653  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1654  if (MSrcQueue) {
1655  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1656  }
1657 
1658  MWorkerQueue = !MQueue ? MSrcQueue : MQueue;
1659  MEvent->setWorkerQueue(MWorkerQueue);
1660 
1662 }
1663 
1665 #ifdef XPTI_ENABLE_INSTRUMENTATION
1666  if (!xptiCheckTraceEnabled(MStreamID))
1667  return;
1668  // Create a payload with the command name and an event using this payload to
1669  // emit a node_create
1670  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1672 
1673  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1674  addDeviceMetadata(CmdTraceEvent, MQueue);
1675  xpti::addMetadata(CmdTraceEvent, "memory_object",
1676  reinterpret_cast<size_t>(MAddress));
1677  xpti::addMetadata(CmdTraceEvent, "copy_from",
1678  MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1679  xpti::addMetadata(CmdTraceEvent, "copy_to",
1680  MQueue ? deviceToID(MQueue->get_device()) : 0);
1681  // Since we do NOT add the queue_id value to the metadata, we stash it in TLS,
1682  // as this data is mutable and the metadata is supposed to be invariant.
1683  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1685 #endif
1686 }
1687 
1689  if (!MWorkerQueue)
1690  return nullptr;
1691  return MWorkerQueue->getContextImplPtr();
1692 }
1693 
1694 ur_result_t MemCpyCommandHost::enqueueImp() {
1695  const QueueImplPtr &Queue = MWorkerQueue;
1697  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1698  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
1699 
1700  ur_event_handle_t UREvent = nullptr;
1701  // Omit the copy if the destination access mode is a discard mode.
1702  // TODO: Handle this at graph building time by, for example, creating an
1703  // empty node instead of a memcpy.
1704  if (MDstReq.MAccessMode == access::mode::discard_read_write ||
1705  MDstReq.MAccessMode == access::mode::discard_write) {
1706  Command::waitForEvents(Queue, EventImpls, UREvent);
1707 
1708  return UR_RESULT_SUCCESS;
1709  }
1710 
1711  flushCrossQueueDeps(EventImpls, MWorkerQueue);
1712 
1713  try {
1715  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1716  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1717  MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
1718  MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
1719  MDstReq.MElemSize, std::move(RawEvents), UREvent, MEvent);
1720  MEvent->setHandle(UREvent);
1721  } catch (sycl::exception &e) {
1722  return static_cast<ur_result_t>(get_ur_error(e));
1723  }
1724 
1725  return UR_RESULT_SUCCESS;
1726 }
1727 
1730 }
1731 
1732 ur_result_t EmptyCommand::enqueueImp() {
1734  ur_event_handle_t UREvent = nullptr;
1736  MEvent->setHandle(UREvent);
1737  return UR_RESULT_SUCCESS;
1738 }
1739 
1741  const Requirement *Req) {
1742  const Requirement &ReqRef = *Req;
1743  MRequirements.emplace_back(ReqRef);
1744  const Requirement *const StoredReq = &MRequirements.back();
1745 
1746  // EmptyCommand is always a host command, so we expect the result of addDep
1747  // to be null.
1748  std::vector<Command *> ToCleanUp;
1749  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1750  assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
1751  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1752  "there's a connection command");
1753  (void)Cmd;
1754 }
1755 
1757 #ifdef XPTI_ENABLE_INSTRUMENTATION
1758  if (!xptiCheckTraceEnabled(MStreamID))
1759  return;
1760  // Create a payload with the command name and an event using this payload to
1761  // emit a node_create
1762  if (MRequirements.empty())
1763  return;
1764 
1765  Requirement &Req = *MRequirements.begin();
1766 
1767  MAddress = Req.MSYCLMemObj;
1769 
1770  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1771  addDeviceMetadata(CmdTraceEvent, MQueue);
1772  xpti::addMetadata(CmdTraceEvent, "memory_object",
1773  reinterpret_cast<size_t>(MAddress));
1774  // Since we do NOT add the queue_id value to the metadata, we stash it in TLS,
1775  // as this data is mutable and the metadata is supposed to be invariant.
1776  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1778 #endif
1779 }
1780 
1781 void EmptyCommand::printDot(std::ostream &Stream) const {
1782  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1783 
1784  Stream << "ID = " << this << "\\n";
1785  Stream << "EMPTY NODE"
1786  << "\\n";
1787 
1788  Stream << "\"];" << std::endl;
1789 
1790  for (const auto &Dep : MDeps) {
1791  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1792  << " [ label = \"Access mode: "
1793  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1794  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1795  << std::endl;
1796  }
1797 }
1798 
1799 bool EmptyCommand::producesPiEvent() const { return false; }
1800 
1801 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1802  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1803 
1804  Stream << "ID = " << this << "\\n";
1805  Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue.get()) << "\\n";
1806 
1807  Stream << "\"];" << std::endl;
1808 
1809  for (const auto &Dep : MDeps) {
1810  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1811  << " [ label = \"Access mode: "
1812  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1813  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1814  << std::endl;
1815  }
1816 }
1817 
1819  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1820  void **DstPtr)
1821  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1822  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1823 
1825 }
1826 
1828 #ifdef XPTI_ENABLE_INSTRUMENTATION
1829  if (!xptiCheckTraceEnabled(MStreamID))
1830  return;
1831  // Create a payload with the command name and an event using this payload to
1832  // emit a node_create
1833  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1835 
1836  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1837  addDeviceMetadata(CmdTraceEvent, MQueue);
1838  xpti::addMetadata(CmdTraceEvent, "memory_object",
1839  reinterpret_cast<size_t>(MAddress));
1840  // Since we do NOT add the queue_id value to the metadata, we stash it in TLS,
1841  // as this data is mutable and the metadata is supposed to be invariant.
1842  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue));
1844 #endif
1845 }
1846 
1847 static std::string_view cgTypeToString(detail::CGType Type) {
1848  switch (Type) {
1849  case detail::CGType::Kernel:
1850  return "Kernel";
1851  break;
1852  case detail::CGType::UpdateHost:
1853  return "update_host";
1854  break;
1855  case detail::CGType::Fill:
1856  return "fill";
1857  break;
1858  case detail::CGType::CopyAccToAcc:
1859  return "copy acc to acc";
1860  break;
1861  case detail::CGType::CopyAccToPtr:
1862  return "copy acc to ptr";
1863  break;
1864  case detail::CGType::CopyPtrToAcc:
1865  return "copy ptr to acc";
1866  break;
1867  case detail::CGType::Barrier:
1868  return "barrier";
1869  case detail::CGType::BarrierWaitlist:
1870  return "barrier waitlist";
1871  case detail::CGType::CopyUSM:
1872  return "copy usm";
1873  break;
1874  case detail::CGType::FillUSM:
1875  return "fill usm";
1876  break;
1877  case detail::CGType::PrefetchUSM:
1878  return "prefetch usm";
1879  break;
1880  case detail::CGType::CodeplayHostTask:
1881  return "host task";
1882  break;
1883  case detail::CGType::Copy2DUSM:
1884  return "copy 2d usm";
1885  break;
1886  case detail::CGType::Fill2DUSM:
1887  return "fill 2d usm";
1888  break;
1889  case detail::CGType::AdviseUSM:
1890  return "advise usm";
1891  case detail::CGType::Memset2DUSM:
1892  return "memset 2d usm";
1893  break;
1894  case detail::CGType::CopyToDeviceGlobal:
1895  return "copy to device_global";
1896  break;
1897  case detail::CGType::CopyFromDeviceGlobal:
1898  return "copy from device_global";
1899  break;
1900  case detail::CGType::ReadWriteHostPipe:
1901  return "read_write host pipe";
1902  case detail::CGType::ExecCommandBuffer:
1903  return "exec command buffer";
1904  case detail::CGType::CopyImage:
1905  return "copy image";
1906  case detail::CGType::SemaphoreWait:
1907  return "semaphore wait";
1908  case detail::CGType::SemaphoreSignal:
1909  return "semaphore signal";
1910  default:
1911  return "unknown";
1912  break;
1913  }
1914 }
1915 
1917  std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
1918  bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer,
1919  const std::vector<ur_exp_command_buffer_sync_point_t> &Dependencies)
1920  : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer,
1921  Dependencies),
1922  MEventNeeded(EventNeeded), MCommandGroup(std::move(CommandGroup)) {
1923  if (MCommandGroup->getType() == detail::CGType::CodeplayHostTask) {
1924  MEvent->setSubmittedQueue(
1925  static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
1926  }
1927  if (MCommandGroup->getType() == detail::CGType::ProfilingTag)
1928  MEvent->markAsProfilingTagEvent();
1929 
1931 }
1932 
1933 #ifdef XPTI_ENABLE_INSTRUMENTATION
1934 std::string instrumentationGetKernelName(
1935  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1936  const std::string &FunctionName, const std::string &SyclKernelName,
1937  void *&Address, std::optional<bool> &FromSource) {
1938  std::string KernelName;
1939  if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1940  FromSource = true;
1941  ur_kernel_handle_t KernelHandle = SyclKernel->getHandleRef();
1942  Address = KernelHandle;
1943  KernelName = FunctionName;
1944  } else {
1945  FromSource = false;
1946  KernelName = demangleKernelName(SyclKernelName);
1947  }
1948  return KernelName;
1949 }
1950 
1951 void instrumentationAddExtraKernelMetadata(
1952  xpti_td *&CmdTraceEvent, const NDRDescT &NDRDesc,
1953  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
1954  const std::string &KernelName,
1955  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1956  const QueueImplPtr &Queue,
1957  std::vector<ArgDesc> &CGArgs) // CGArgs are not const since they could be
1958  // sorted in this function
1959 {
1960  std::vector<ArgDesc> Args;
1961 
1962  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1963  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1964  };
1965  ur_program_handle_t Program = nullptr;
1966  ur_kernel_handle_t Kernel = nullptr;
1967  std::mutex *KernelMutex = nullptr;
1968  const KernelArgMask *EliminatedArgMask = nullptr;
1969 
1970  std::shared_ptr<kernel_impl> SyclKernelImpl;
1971  std::shared_ptr<device_image_impl> DeviceImageImpl;
1972 
1973  // Use kernel_bundle if available unless it is interop.
1974  // Interop bundles can't be used in the first branch, because the
1975  // kernels in interop kernel bundles (if any) do not have kernel_id and
1976  // can therefore not be looked up, but since they are self-contained
1977  // they can simply be launched directly.
1978  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1979  kernel_id KernelID =
1981  kernel SyclKernel =
1982  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1983  std::shared_ptr<kernel_impl> KernelImpl =
1984  detail::getSyclObjImpl(SyclKernel);
1985 
1986  EliminatedArgMask = KernelImpl->getKernelArgMask();
1987  Program = KernelImpl->getDeviceImage()->get_ur_program_ref();
1988  } else if (nullptr != SyclKernel) {
1989  Program = SyclKernel->getProgramRef();
1990  if (!SyclKernel->isCreatedFromSource())
1991  EliminatedArgMask = SyclKernel->getKernelArgMask();
1992  } else {
1993  assert(Queue && "Kernel submissions should have an associated queue");
1994  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
1996  Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
1997  }
1998 
1999  applyFuncOnFilteredArgs(EliminatedArgMask, CGArgs, FilterArgs);
2000 
2001  xpti::offload_kernel_enqueue_data_t KernelData{
2002  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
2003  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
2004  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
2005  NDRDesc.GlobalOffset[2]},
2006  Args.size()};
2007  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
2008  for (size_t i = 0; i < Args.size(); i++) {
2009  std::string Prefix("arg");
2010  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
2011  Args[i].MSize, Args[i].MIndex};
2012  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
2013  }
2014 }
2015 
2016 void instrumentationFillCommonData(const std::string &KernelName,
2017  const std::string &FileName, uint64_t Line,
2018  uint64_t Column, const void *const Address,
2019  const QueueImplPtr &Queue,
2020  std::optional<bool> &FromSource,
2021  uint64_t &OutInstanceID,
2022  xpti_td *&OutTraceEvent) {
2023  // Get source file, line number information from the CommandGroup object
2024  // and create payload using name, address, and source info
2025  //
2026  // On Windows, since support for the builtin functions is not available in
2027  // MSVC, MFileName and MLine will be set to nullptr and 0, respectively.
2028  // Handle this condition explicitly here.
2029  bool HasSourceInfo = false;
2030  xpti::payload_t Payload;
2031  if (!FileName.empty()) {
2032  // File name has a valid string
2033  Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2034  Column, Address);
2035  HasSourceInfo = true;
2036  } else if (Address) {
2037  // We have a valid function name and an address
2038  Payload = xpti::payload_t(KernelName.c_str(), Address);
2039  } else {
2040  // In any case, we will have a valid function name and we'll use that to
2041  // create the hash
2042  Payload = xpti::payload_t(KernelName.c_str());
2043  }
2044 
2045  uint64_t CGKernelInstanceNo;
2046  // Create event using the payload
2047  xpti_td *CmdTraceEvent =
2048  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
2049  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2050  if (CmdTraceEvent) {
2051  OutInstanceID = CGKernelInstanceNo;
2052  OutTraceEvent = CmdTraceEvent;
2053 
2054  addDeviceMetadata(CmdTraceEvent, Queue);
2055  if (!KernelName.empty()) {
2056  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
2057  }
2058  if (FromSource.has_value()) {
2059  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
2060  }
2061  if (HasSourceInfo) {
2062  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
2063  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName);
2064  xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast<int>(Line));
2065  xpti::addMetadata(CmdTraceEvent, "sym_column_no",
2066  static_cast<int>(Column));
2067  }
2068  // We no longer set the 'queue_id' in the metadata structure, as it is a
2069  // mutable value; multiple threads using the same queue created at the same
2070  // location would overwrite the metadata values, creating inconsistencies.
2071  }
2072 }
2073 #endif
2074 
2075 #ifdef XPTI_ENABLE_INSTRUMENTATION
2076 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2077  int32_t StreamID, const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2078  const detail::code_location &CodeLoc, const std::string &SyclKernelName,
2079  const QueueImplPtr &Queue, const NDRDescT &NDRDesc,
2080  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2081  std::vector<ArgDesc> &CGArgs) {
2082 
2083  auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(nullptr, -1);
2084  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2085  if (!xptiCheckTraceEnabled(StreamID))
2086  return XptiObjects;
2087 
2088  void *Address = nullptr;
2089  std::optional<bool> FromSource;
2090  std::string KernelName = instrumentationGetKernelName(
2091  SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2092  FromSource);
2093 
2094  auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2095 
2096  std::string FileName =
2097  CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2098  instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2099  CodeLoc.columnNumber(), Address, Queue,
2100  FromSource, InstanceID, CmdTraceEvent);
2101 
2102  if (CmdTraceEvent) {
2103  // Stash the queue_id mutable metadata in TLS
2104  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue));
2105  instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2106  KernelBundleImplPtr, SyclKernelName,
2107  SyclKernel, Queue, CGArgs);
2108 
2109  xptiNotifySubscribers(
2110  StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2111  InstanceID,
2112  static_cast<const void *>(
2113  commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2114  }
2115 
2116  return XptiObjects;
2117 }
2118 #endif
2119 
2121 #ifdef XPTI_ENABLE_INSTRUMENTATION
2122  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2123  if (!xptiCheckTraceEnabled(MStreamID))
2124  return;
2125 
2126  std::string KernelName;
2127  std::optional<bool> FromSource;
2128  switch (MCommandGroup->getType()) {
2129  case detail::CGType::Kernel: {
2130  auto KernelCG =
2131  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2132  KernelName = instrumentationGetKernelName(
2133  KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2134  KernelCG->getKernelName(), MAddress, FromSource);
2135  } break;
2136  default:
2137  KernelName = getTypeString();
2138  break;
2139  }
2140 
2141  xpti_td *CmdTraceEvent = nullptr;
2142  instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2143  MCommandGroup->MLine, MCommandGroup->MColumn,
2144  MAddress, MQueue, FromSource, MInstanceID,
2145  CmdTraceEvent);
2146 
2147  if (CmdTraceEvent) {
2148  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2149  getQueueID(MQueue));
2150  MTraceEvent = static_cast<void *>(CmdTraceEvent);
2151  if (MCommandGroup->getType() == detail::CGType::Kernel) {
2152  auto KernelCG =
2153  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2154  instrumentationAddExtraKernelMetadata(
2155  CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2156  KernelCG->MKernelName, KernelCG->MSyclKernel, MQueue,
2157  KernelCG->MArgs);
2158  }
2159 
2160  xptiNotifySubscribers(
2161  MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2162  CmdTraceEvent, MInstanceID,
2163  static_cast<const void *>(commandToNodeType(MType).c_str()));
2164  }
2165 #endif
2166 }
2167 
2168 void ExecCGCommand::printDot(std::ostream &Stream) const {
2169  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2170 
2171  Stream << "ID = " << this << "\\n";
2172  Stream << "EXEC CG ON " << queueDeviceToString(MQueue.get()) << "\\n";
2173 
2174  switch (MCommandGroup->getType()) {
2175  case detail::CGType::Kernel: {
2176  auto KernelCG =
2177  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2178  Stream << "Kernel name: ";
2179  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2180  Stream << "created from source";
2181  else
2182  Stream << demangleKernelName(KernelCG->getKernelName());
2183  Stream << "\\n";
2184  break;
2185  }
2186  default:
2187  Stream << "CG type: " << getTypeString() << "\\n";
2188  break;
2189  }
2190 
2191  Stream << "\"];" << std::endl;
2192 
2193  for (const auto &Dep : MDeps) {
2194  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2195  << " [ label = \"Access mode: "
2196  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2197  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2198  << std::endl;
2199  }
2200 }
2201 
2202 std::string_view ExecCGCommand::getTypeString() const {
2203  return cgTypeToString(MCommandGroup->getType());
2204 }
2205 
2206 // SYCL has a parallel_for_work_group variant where the only NDRange
2207 // characteristic set by the user is the number of work-groups. This does not
2208 // map to the OpenCL clEnqueueNDRangeKernel API, which requires the global work
2209 // size to be set as well. This function determines the local work size based
2210 // on the device characteristics and the number of work-groups requested by the
2211 // user, then calculates the global work size. SYCL specification (from 4.8.5.3):
2212 // The member function handler::parallel_for_work_group is parameterized by
2213 // the number of work-groups, such that the size of each group is chosen by
2214 // the runtime, or by the number of work-groups and number of work-items
2215 // for users who need more control.
2216 static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel,
2217  const device_impl &DeviceImpl) {
2218  if (NDR.GlobalSize[0] != 0)
2219  return; // GlobalSize is set - no need to adjust
2220  // check the prerequisites:
2221  assert(NDR.LocalSize[0] == 0);
2222  // TODO might be good to cache this info together with the kernel info to
2223  // avoid get_kernel_work_group_info on every kernel run
2224  auto WGSize = get_kernel_device_specific_info<
2225  sycl::info::kernel_device_specific::compile_work_group_size>(
2226  Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
2227 
2228  if (WGSize[0] == 0) {
2229  WGSize = {1, 1, 1};
2230  }
2231  NDR = sycl::detail::NDRDescT{nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize),
2232  static_cast<int>(NDR.Dims)};
2233 }
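// Illustrative user-side sketch (not part of this file) of the case handled by
// adjustNDRangePerKernel above: the user only specifies the number of
// work-groups, so the runtime chooses the work-group size (the kernel's
// compile_work_group_size if one is enforced, otherwise {1, 1, 1}) and derives
// the global size from it. Q is a placeholder queue.
//
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.parallel_for_work_group(sycl::range<1>{8}, [=](sycl::group<1> G) {
//       G.parallel_for_work_item([&](sycl::h_item<1> Item) { /* ... */ });
//     });
//   });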
2234 
2235 // We have the following mapping between dimensions with SPIR-V builtins:
2236 // 1D: id[0] -> x
2237 // 2D: id[0] -> y, id[1] -> x
2238 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
2239 // So in order to ensure correctness we update all the kernel
2240 // parameters accordingly.
2241 // Initially we keep the order of NDRDescT as provided by the user; this
2242 // simplifies overall handling, and we do the reverse only when
2243 // the kernel is enqueued.
2244 static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
2245  if (NDR.Dims > 1) {
2246  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
2247  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
2248  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
2249  }
2250 }
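// Worked example of the mapping above for a 3D range kept in user order:
//   before reversal: GlobalSize == {4, 8, 16}   (id[0]=4, id[1]=8, id[2]=16)
//   after reversal:  GlobalSize == {16, 8, 4}
// so the last user dimension lands on the SPIR-V 'x' builtin, as required.
// LocalSize and GlobalOffset are swapped the same way; 1D ranges are left
// untouched.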
2251 
2252 ur_mem_flags_t AccessModeToUr(access::mode AccessorMode) {
2253  switch (AccessorMode) {
2254  case access::mode::read:
2255  return UR_MEM_FLAG_READ_ONLY;
2256  case access::mode::write:
2258  return UR_MEM_FLAG_WRITE_ONLY;
2259  default:
2260  return UR_MEM_FLAG_READ_WRITE;
2261  }
2262 }
2263 
2265  const PluginPtr &Plugin, ur_kernel_handle_t Kernel,
2266  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2267  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2268  const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) {
2269  switch (Arg.MType) {
2270  case kernel_param_kind_t::kind_stream:
2271  break;
2272  case kernel_param_kind_t::kind_accessor: {
2273  Requirement *Req = (Requirement *)(Arg.MPtr);
2274 
2275  // getMemAllocationFunc is nullptr when there are no requirements. However,
2276  // we may pass default constructed accessors to a command, and they don't add
2277  // requirements. In such a case getMemAllocationFunc is nullptr, but it is
2278  // still a valid case, so we need to handle it properly.
2279  ur_mem_handle_t MemArg =
2280  getMemAllocationFunc
2281  ? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
2282  : nullptr;
2283  ur_kernel_arg_mem_obj_properties_t MemObjData{};
2284  MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2285  MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
2286  Plugin->call<UrApiKind::urKernelSetArgMemObj>(Kernel, NextTrueIndex,
2287  &MemObjData, MemArg);
2288  break;
2289  }
2290  case kernel_param_kind_t::kind_std_layout: {
2291  if (Arg.MPtr) {
2292  Plugin->call<UrApiKind::urKernelSetArgValue>(
2293  Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr);
2294  } else {
2295  Plugin->call<UrApiKind::urKernelSetArgLocal>(Kernel, NextTrueIndex,
2296  Arg.MSize, nullptr);
2297  }
2298 
2299  break;
2300  }
2301  case kernel_param_kind_t::kind_sampler: {
2302  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2303  ur_sampler_handle_t Sampler =
2304  (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
2305  ->getOrCreateSampler(Context);
2306  Plugin->call<UrApiKind::urKernelSetArgSampler>(Kernel, NextTrueIndex,
2307  nullptr, Sampler);
2308  break;
2309  }
2310  case kernel_param_kind_t::kind_pointer: {
2311  // We need to dereference this to get the actual USM allocation - that's the
2312  // pointer UR is expecting.
2313  const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
2314  Plugin->call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
2315  nullptr, Ptr);
2316  break;
2317  }
2318  case kernel_param_kind_t::kind_specialization_constants_buffer: {
2319  assert(DeviceImageImpl != nullptr);
2320  ur_mem_handle_t SpecConstsBuffer =
2321  DeviceImageImpl->get_spec_const_buffer_ref();
2322 
2323  ur_kernel_arg_mem_obj_properties_t MemObjProps{};
2324  MemObjProps.pNext = nullptr;
2325  MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2326  MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
2327  Plugin->call<UrApiKind::urKernelSetArgMemObj>(
2328  Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
2329  break;
2330  }
2331  case kernel_param_kind_t::kind_invalid:
2332  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2333  "Invalid kernel param kind " +
2334  codeToString(UR_RESULT_ERROR_INVALID_VALUE));
2335  break;
2336  }
2337 }
2338 
2339 static ur_result_t SetKernelParamsAndLaunch(
2340  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
2341  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2342  ur_kernel_handle_t Kernel, NDRDescT &NDRDesc,
2343  std::vector<ur_event_handle_t> &RawEvents,
2344  const detail::EventImplPtr &OutEventImpl,
2345  const KernelArgMask *EliminatedArgMask,
2346  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2347  bool IsCooperative, bool KernelUsesClusterLaunch,
2348  const RTDeviceBinaryImage *BinImage, const std::string &KernelName) {
2349  assert(Queue && "Kernel submissions should have an associated queue");
2350  const PluginPtr &Plugin = Queue->getPlugin();
2351 
2353  std::vector<unsigned char> Empty;
2355  Queue, BinImage, KernelName,
2356  DeviceImageImpl.get() ? DeviceImageImpl->get_spec_const_blob_ref()
2357  : Empty);
2358  }
2359 
2360  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2361  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
2362  SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
2363  Queue->get_context(), Arg, NextTrueIndex);
2364  };
2365 
2366  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2367 
2368  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2369 
2370  // Remember this information before the range dimensions are reversed
2371  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2372 
2374 
2375  size_t RequiredWGSize[3] = {0, 0, 0};
2376  size_t *LocalSize = nullptr;
2377 
2378  if (HasLocalSize)
2379  LocalSize = &NDRDesc.LocalSize[0];
2380  else {
2381  Plugin->call<UrApiKind::urKernelGetGroupInfo>(
2382  Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2383  UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2384  RequiredWGSize,
2385  /* pPropSizeRet = */ nullptr);
2386 
2387  const bool EnforcedLocalSize =
2388  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2389  RequiredWGSize[2] != 0);
2390  if (EnforcedLocalSize)
2391  LocalSize = RequiredWGSize;
2392  }
2393  if (OutEventImpl != nullptr)
2394  OutEventImpl->setHostEnqueueTime();
2395  if (KernelUsesClusterLaunch) {
2396  std::vector<ur_exp_launch_property_t> property_list;
2397 
2398  ur_exp_launch_property_value_t launch_property_value_cluster_range;
2399  launch_property_value_cluster_range.clusterDim[0] =
2400  NDRDesc.ClusterDimensions[0];
2401  launch_property_value_cluster_range.clusterDim[1] =
2402  NDRDesc.ClusterDimensions[1];
2403  launch_property_value_cluster_range.clusterDim[2] =
2404  NDRDesc.ClusterDimensions[2];
2405 
2406  property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
2407  launch_property_value_cluster_range});
2408 
2409  if (IsCooperative) {
2410  ur_exp_launch_property_value_t launch_property_value_cooperative;
2411  launch_property_value_cooperative.cooperative = 1;
2412  property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE,
2413  launch_property_value_cooperative});
2414  }
2415 
2416  ur_event_handle_t UREvent = nullptr;
2417  ur_result_t Error =
2418  Plugin->call_nocheck<UrApiKind::urEnqueueKernelLaunchCustomExp>(
2419  Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalSize[0],
2420  LocalSize, property_list.size(), property_list.data(),
2421  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2422  OutEventImpl ? &UREvent : nullptr);
2423  if (OutEventImpl) {
2424  OutEventImpl->setHandle(UREvent);
2425  }
2426  return Error;
2427  }
2428  ur_event_handle_t UREvent = nullptr;
2429  ur_result_t Error =
2430  [&](auto... Args) {
2431  if (IsCooperative) {
2432  return Plugin
2433  ->call_nocheck<UrApiKind::urEnqueueCooperativeKernelLaunchExp>(
2434  Args...);
2435  }
2436  return Plugin->call_nocheck<UrApiKind::urEnqueueKernelLaunch>(Args...);
2437  }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2438  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2439  RawEvents.empty() ? nullptr : &RawEvents[0],
2440  OutEventImpl ? &UREvent : nullptr);
2441  if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
2442  OutEventImpl->setHandle(UREvent);
2443  }
2444 
2445  return Error;
2446 }
2447 
2449  context Ctx, DeviceImplPtr DeviceImpl,
2450  ur_exp_command_buffer_handle_t CommandBuffer,
2451  const CGExecKernel &CommandGroup,
2452  std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints,
2453  ur_exp_command_buffer_sync_point_t *OutSyncPoint,
2454  ur_exp_command_buffer_command_handle_t *OutCommand,
2455  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2456  auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
2457  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
2458  ur_kernel_handle_t UrKernel = nullptr;
2459  ur_program_handle_t UrProgram = nullptr;
2460  std::shared_ptr<kernel_impl> SyclKernelImpl = nullptr;
2461  std::shared_ptr<device_image_impl> DeviceImageImpl = nullptr;
2462 
2463  auto Kernel = CommandGroup.MSyclKernel;
2464  auto KernelBundleImplPtr = CommandGroup.MKernelBundle;
2465  const KernelArgMask *EliminatedArgMask = nullptr;
2466 
2467  // Use kernel_bundle if available unless it is interop.
2468  // Interop bundles can't be used in the first branch, because the kernels
2469  // in interop kernel bundles (if any) do not have kernel_id
2470  // and can therefore not be looked up, but since they are self-contained
2471  // they can simply be launched directly.
2472  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2473  auto KernelName = CommandGroup.MKernelName;
2474  kernel_id KernelID =
2476  kernel SyclKernel =
2477  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2478  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2479  UrKernel = SyclKernelImpl->getHandleRef();
2480  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2481  UrProgram = DeviceImageImpl->get_ur_program_ref();
2482  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2483  } else if (Kernel != nullptr) {
2484  UrKernel = Kernel->getHandleRef();
2485  UrProgram = Kernel->getProgramRef();
2486  EliminatedArgMask = Kernel->getKernelArgMask();
2487  } else {
2488  std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =
2489  sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2490  ContextImpl, DeviceImpl, CommandGroup.MKernelName);
2491  }
2492 
2493  auto SetFunc = [&Plugin, &UrKernel, &DeviceImageImpl, &Ctx,
2494  &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2495  size_t NextTrueIndex) {
2496  sycl::detail::SetArgBasedOnType(Plugin, UrKernel, DeviceImageImpl,
2497  getMemAllocationFunc, Ctx, Arg,
2498  NextTrueIndex);
2499  };
2500  // Copy args for modification
2501  auto Args = CommandGroup.MArgs;
2502  sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc);
2503 
2504  // Remember this information before the range dimensions are reversed
2505  const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0);
2506 
2507  // Copy NDRDesc for modification
2508  auto NDRDesc = CommandGroup.MNDRDesc;
2509  // Reverse kernel dims
2511 
2512  size_t RequiredWGSize[3] = {0, 0, 0};
2513  size_t *LocalSize = nullptr;
2514 
2515  if (HasLocalSize)
2516  LocalSize = &NDRDesc.LocalSize[0];
2517  else {
2518  Plugin->call<UrApiKind::urKernelGetGroupInfo>(
2519  UrKernel, DeviceImpl->getHandleRef(),
2520  UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2521  RequiredWGSize,
2522  /* pPropSizeRet = */ nullptr);
2523 
2524  const bool EnforcedLocalSize =
2525  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2526  RequiredWGSize[2] != 0);
2527  if (EnforcedLocalSize)
2528  LocalSize = RequiredWGSize;
2529  }
2530 
2531  ur_result_t Res =
2532  Plugin->call_nocheck<UrApiKind::urCommandBufferAppendKernelLaunchExp>(
2533  CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2534  &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(),
2535  SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint,
2536  OutCommand);
2537 
2538  if (!SyclKernelImpl && !Kernel) {
2539  Plugin->call<UrApiKind::urKernelRelease>(UrKernel);
2540  Plugin->call<UrApiKind::urProgramRelease>(UrProgram);
2541  }
2542 
2543  if (Res != UR_RESULT_SUCCESS) {
2544  const device_impl &DeviceImplem = *(DeviceImpl);
2546  UrKernel, NDRDesc);
2547  }
2548 
2549  return Res;
2550 }
2551 
2553  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2554  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2555  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2556  const std::string &KernelName, std::vector<ur_event_handle_t> &RawEvents,
2557  const detail::EventImplPtr &OutEventImpl,
2558  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2559  ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative,
2560  const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage) {
2561  assert(Queue && "Kernel submissions should have an associated queue");
2562  // Run OpenCL kernel
2563  auto ContextImpl = Queue->getContextImplPtr();
2564  auto DeviceImpl = Queue->getDeviceImplPtr();
2565  ur_kernel_handle_t Kernel = nullptr;
2566  std::mutex *KernelMutex = nullptr;
2567  ur_program_handle_t Program = nullptr;
2568  const KernelArgMask *EliminatedArgMask;
2569 
2570  std::shared_ptr<kernel_impl> SyclKernelImpl;
2571  std::shared_ptr<device_image_impl> DeviceImageImpl;
2572 
2573  // Use kernel_bundle if available unless it is interop.
2574  // Interop bundles can't be used in the first branch, because the kernels
2575  // in interop kernel bundles (if any) do not have kernel_id
2576  // and can therefore not be looked up, but since they are self-contained
2577  // they can simply be launched directly.
2578  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2579  kernel_id KernelID =
2581  kernel SyclKernel =
2582  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2583 
2584  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2585 
2586  Kernel = SyclKernelImpl->getHandleRef();
2587  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2588 
2589  Program = DeviceImageImpl->get_ur_program_ref();
2590 
2591  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2592  KernelMutex = SyclKernelImpl->getCacheMutex();
2593  } else if (nullptr != MSyclKernel) {
2594  assert(MSyclKernel->get_info<info::kernel::context>() ==
2595  Queue->get_context());
2596  Kernel = MSyclKernel->getHandleRef();
2597  Program = MSyclKernel->getProgramRef();
2598 
2599  // Non-cacheable kernels use mutexes from kernel_impls.
2600  // TODO this can still result in a race condition if multiple SYCL
2601  // kernels are created with the same native handle. To address this,
2602  // we need to either store and use a ur_native_handle_t -> mutex map or
2603  // reuse and return existing SYCL kernels from make_native to avoid
2604  // their duplication in such cases.
2605  KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2606  EliminatedArgMask = MSyclKernel->getKernelArgMask();
2607  } else {
2608  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2610  ContextImpl, DeviceImpl, KernelName, NDRDesc);
2611  }
2612 
2613  // We may need more events for the launch, so we make another reference.
2614  std::vector<ur_event_handle_t> &EventsWaitList = RawEvents;
2615 
2616  // Initialize device globals associated with this program.
2617  std::vector<ur_event_handle_t> DeviceGlobalInitEvents =
2618  ContextImpl->initializeDeviceGlobals(Program, Queue);
2619  std::vector<ur_event_handle_t> EventsWithDeviceGlobalInits;
2620  if (!DeviceGlobalInitEvents.empty()) {
2621  EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2622  DeviceGlobalInitEvents.size());
2623  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2624  RawEvents.begin(), RawEvents.end());
2625  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2626  DeviceGlobalInitEvents.begin(),
2627  DeviceGlobalInitEvents.end());
2628  EventsWaitList = EventsWithDeviceGlobalInits;
2629  }
2630 
2631  ur_result_t Error = UR_RESULT_SUCCESS;
2632  {
2633  // When KernelMutex is null, this means that in-memory caching is
2634  // disabled, which means the kernel object is not shared, so no locking
2635  // is necessary.
2636  using LockT = std::unique_lock<std::mutex>;
2637  auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2638 
2639  // Set SLM/Cache configuration for the kernel if non-default value is
2640  // provided.
2641  if (KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM ||
2642  KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) {
2643  const PluginPtr &Plugin = Queue->getPlugin();
2644  Plugin->call<UrApiKind::urKernelSetExecInfo>(
2645  Kernel, UR_KERNEL_EXEC_INFO_CACHE_CONFIG,
2646  sizeof(ur_kernel_cache_config_t), nullptr, &KernelCacheConfig);
2647  }
2648 
2649  Error = SetKernelParamsAndLaunch(
2650  Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
2651  OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
2652  KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName);
2653 
2654  const PluginPtr &Plugin = Queue->getPlugin();
2655  if (!SyclKernelImpl && !MSyclKernel) {
2656  Plugin->call<UrApiKind::urKernelRelease>(Kernel);
2657  Plugin->call<UrApiKind::urProgramRelease>(Program);
2658  }
2659  }
2660  if (UR_RESULT_SUCCESS != Error) {
2661  // If we have got a non-success error code, analyze it to emit a nice
2662  // exception explaining what went wrong.
2663  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2665  Kernel, NDRDesc);
2666  }
2667 }
2668 
2669 ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue,
2670  const std::string &PipeName, bool blocking,
2671  void *ptr, size_t size,
2672  std::vector<ur_event_handle_t> &RawEvents,
2673  const detail::EventImplPtr &OutEventImpl,
2674  bool read) {
2675  assert(Queue &&
2676  "ReadWrite host pipe submissions should have an associated queue");
2677  detail::HostPipeMapEntry *hostPipeEntry =
2679 
2680  ur_program_handle_t Program = nullptr;
2681  device Device = Queue->get_device();
2682  ContextImplPtr ContextImpl = Queue->getContextImplPtr();
2683  std::optional<ur_program_handle_t> CachedProgram =
2684  ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2685  if (CachedProgram)
2686  Program = *CachedProgram;
2687  else {
2688  // If there was no cached program, build one.
2689  device_image_plain devImgPlain =
2691  hostPipeEntry->getDevBinImage(), Queue->get_context(),
2692  Queue->get_device());
2693  device_image_plain BuiltImage =
2694  ProgramManager::getInstance().build(devImgPlain, {Device}, {});
2695  Program = getSyclObjImpl(BuiltImage)->get_ur_program_ref();
2696  }
2697  assert(Program && "Program for this hostpipe is not compiled.");
2698 
2699  const PluginPtr &Plugin = Queue->getPlugin();
2700 
2701  ur_queue_handle_t ur_q = Queue->getHandleRef();
2702  ur_result_t Error;
2703 
2704  ur_event_handle_t UREvent = nullptr;
2705  auto OutEvent = OutEventImpl ? &UREvent : nullptr;
2706  if (OutEventImpl != nullptr)
2707  OutEventImpl->setHostEnqueueTime();
2708  if (read) {
2709  Error = Plugin->call_nocheck<UrApiKind::urEnqueueReadHostPipe>(
2710  ur_q, Program, PipeName.c_str(), blocking, ptr, size, RawEvents.size(),
2711  RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2712  } else {
2713  Error = Plugin->call_nocheck<UrApiKind::urEnqueueWriteHostPipe>(
2714  ur_q, Program, PipeName.c_str(), blocking, ptr, size, RawEvents.size(),
2715  RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2716  }
2717  if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
2718  OutEventImpl->setHandle(UREvent);
2719  }
2720  return Error;
2721 }
2722 
2723 ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
2724  assert(MQueue && "Command buffer enqueue should have an associated queue");
2725  // Wait on host command dependencies
2727 
2728  // Any device dependencies need to be waited on here, since subsequent
2729  // submissions of the command buffer itself will not receive dependencies on
2730  // them, e.g. initial copies from host to device.
2731  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2732  flushCrossQueueDeps(EventImpls, MWorkerQueue);
2733  std::vector<ur_event_handle_t> RawEvents = getUrEvents(EventImpls);
2734  if (!RawEvents.empty()) {
2735  MQueue->getPlugin()->call<UrApiKind::urEventWait>(RawEvents.size(),
2736  &RawEvents[0]);
2737  }
2738 
2739  ur_exp_command_buffer_sync_point_t OutSyncPoint;
2740  ur_exp_command_buffer_command_handle_t OutCommand = nullptr;
2741  switch (MCommandGroup->getType()) {
2742  case CGType::Kernel: {
2743  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2744 
2745  auto getMemAllocationFunc = [this](Requirement *Req) {
2746  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2747  return AllocaCmd->getMemAllocation();
2748  };
2749 
2750  auto result = enqueueImpCommandBufferKernel(
2751  MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
2752  *ExecKernel, MSyncPointDeps, &OutSyncPoint, &OutCommand,
2753  getMemAllocationFunc);
2754  MEvent->setSyncPoint(OutSyncPoint);
2755  MEvent->setCommandBufferCommand(OutCommand);
2756  return result;
2757  }
2758  case CGType::CopyUSM: {
2759  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2761  MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
2762  Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2763  MEvent->setSyncPoint(OutSyncPoint);
2764  return UR_RESULT_SUCCESS;
2765  }
2766  case CGType::CopyAccToAcc: {
2767  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2768  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2769  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2770 
2771  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2772  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2773 
2775  MQueue->getContextImplPtr(), MCommandBuffer,
2776  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2777  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2778  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2779  ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2780  ReqDst->MOffset, ReqDst->MElemSize, std::move(MSyncPointDeps),
2781  &OutSyncPoint);
2782  MEvent->setSyncPoint(OutSyncPoint);
2783  return UR_RESULT_SUCCESS;
2784  }
2785  case CGType::CopyAccToPtr: {
2786  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2787  Requirement *Req = (Requirement *)Copy->getSrc();
2788  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2789 
2791  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2792  AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2793  Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(),
2794  Req->MDims, Req->MAccessRange,
2795  /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(MSyncPointDeps),
2796  &OutSyncPoint);
2797  MEvent->setSyncPoint(OutSyncPoint);
2798  return UR_RESULT_SUCCESS;
2799  }
2800  case CGType::CopyPtrToAcc: {
2801  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2802  Requirement *Req = (Requirement *)(Copy->getDst());
2803  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2804 
2806  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2807  (char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2808  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2809  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2810  Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2811  MEvent->setSyncPoint(OutSyncPoint);
2812  return UR_RESULT_SUCCESS;
2813  }
2814  case CGType::Fill: {
2815  CGFill *Fill = (CGFill *)MCommandGroup.get();
2816  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2817  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2818 
2820  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2821  AllocaCmd->getMemAllocation(), Fill->MPattern.size(),
2822  Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2823  Req->MOffset, Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2824  MEvent->setSyncPoint(OutSyncPoint);
2825  return UR_RESULT_SUCCESS;
2826  }
2827  case CGType::FillUSM: {
2828  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2830  MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(),
2831  Fill->getLength(), Fill->getPattern(), std::move(MSyncPointDeps),
2832  &OutSyncPoint);
2833  MEvent->setSyncPoint(OutSyncPoint);
2834  return UR_RESULT_SUCCESS;
2835  }
2836  case CGType::PrefetchUSM: {
2837  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2839  MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(),
2840  Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint);
2841  MEvent->setSyncPoint(OutSyncPoint);
2842  return UR_RESULT_SUCCESS;
2843  }
2844  case CGType::AdviseUSM: {
2845  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2847  MQueue->getContextImplPtr(), MCommandBuffer, Advise->getDst(),
2848  Advise->getLength(), Advise->getAdvice(), std::move(MSyncPointDeps),
2849  &OutSyncPoint);
2850  MEvent->setSyncPoint(OutSyncPoint);
2851  return UR_RESULT_SUCCESS;
2852  }
2853 
2854  default:
2855  throw exception(make_error_code(errc::runtime),
2856  "CG type not implemented for command buffers.");
2857  }
2858 }
2859 
2860 ur_result_t ExecCGCommand::enqueueImp() {
2861  if (MCommandBuffer) {
2862  return enqueueImpCommandBuffer();
2863  } else {
2864  return enqueueImpQueue();
2865  }
2866 }
2867 
2868 ur_result_t ExecCGCommand::enqueueImpQueue() {
2871  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2872  auto RawEvents = getUrEvents(EventImpls);
2873  flushCrossQueueDeps(EventImpls, MWorkerQueue);
2874 
2875  // We can omit creating a UR event and create a "discarded" event if either
2876  // the queue has the discard property or the command has been explicitly
2877  // marked as not needing an event (e.g. the user did not ask for one), provided
2878  // the queue supports discarding UR events and there are no requirements.
2879  bool DiscardUrEvent = MQueue && (MQueue->MDiscardEvents || !MEventNeeded) &&
2880  MQueue->supportsDiscardingPiEvents() &&
2881  MCommandGroup->getRequirements().size() == 0;
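  // Illustrative user-side sketch (not part of this file), assuming the
  // sycl_ext_oneapi_discard_queue_events extension: submissions without
  // requirements to such a queue can take the DiscardUrEvent path above.
  // Range and the kernel body are placeholders.
  //
  //   sycl::queue Q{sycl::property_list{
  //       sycl::ext::oneapi::property::queue::discard_events{}}};
  //   Q.parallel_for(Range, [=](sycl::id<1> I) { /* ... */ }); // returned
  //                                                            // event is
  //                                                            // "discarded"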
2882 
2883  ur_event_handle_t UREvent = nullptr;
2884  ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &UREvent;
2885  detail::EventImplPtr EventImpl = DiscardUrEvent ? nullptr : MEvent;
2886 
2887  switch (MCommandGroup->getType()) {
2888 
2889  case CGType::UpdateHost: {
2890  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2891  "Update host should be handled by the Scheduler. " +
2892  codeToString(UR_RESULT_ERROR_INVALID_VALUE));
2893  }
2894  case CGType::CopyAccToPtr: {
2895  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2896  Requirement *Req = (Requirement *)Copy->getSrc();
2897  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2898 
2900  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2901  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2902  Req->MElemSize, Copy->getDst(), nullptr, Req->MDims, Req->MAccessRange,
2903  Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize,
2904  std::move(RawEvents), UREvent, MEvent);
2905  MEvent->setHandle(UREvent);
2906 
2907  return UR_RESULT_SUCCESS;
2908  }
2909  case CGType::CopyPtrToAcc: {
2910  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2911  Requirement *Req = (Requirement *)(Copy->getDst());
2912  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2913 
2914  MemoryManager::copy(AllocaCmd->getSYCLMemObj(), Copy->getSrc(), nullptr,
2915  Req->MDims, Req->MAccessRange, Req->MAccessRange,
2916  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize,
2917  AllocaCmd->getMemAllocation(), MQueue, Req->MDims,
2918  Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2919  Req->MElemSize, std::move(RawEvents), UREvent, MEvent);
2920  MEvent->setHandle(UREvent);
2921  return UR_RESULT_SUCCESS;
2922  }
2923  case CGType::CopyAccToAcc: {
2924  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2925  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2926  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2927 
2928  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2929  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2930 
2932  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2933  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2934  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2935  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2936  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents), UREvent,
2937  MEvent);
2938  MEvent->setHandle(UREvent);
2939  return UR_RESULT_SUCCESS;
2940  }
2941  case CGType::Fill: {
2942  CGFill *Fill = (CGFill *)MCommandGroup.get();
2943  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2944  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2945 
2947  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2948  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2949  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2950  std::move(RawEvents), UREvent, MEvent);
2951  MEvent->setHandle(UREvent);
2952  return UR_RESULT_SUCCESS;
2953  }
2954  case CGType::Kernel: {
2955  assert(MQueue && "Kernel submissions should have an associated queue");
2956  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2957 
2958  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2959  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2960 
2961  auto getMemAllocationFunc = [this](Requirement *Req) {
2962  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2963  // getAllocaForReq may return nullptr if Req is a default constructed
2964  // accessor. Simply return nullptr in such a case.
2965  return AllocaCmd ? AllocaCmd->getMemAllocation() : nullptr;
2966  };
2967 
2968  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2969  ExecKernel->MSyclKernel;
2970  const std::string &KernelName = ExecKernel->MKernelName;
2971 
2972  if (!EventImpl) {
2973  // A kernel only uses assert if it is a non-interop one.
2974  bool KernelUsesAssert =
2975  !(SyclKernel && SyclKernel->isInterop()) &&
2977  if (KernelUsesAssert) {
2978  EventImpl = MEvent;
2979  }
2980  }
2981 
2982  const RTDeviceBinaryImage *BinImage = nullptr;
2984  std::tie(BinImage, std::ignore) =
2985  retrieveKernelBinary(MQueue, KernelName.c_str());
2986  assert(BinImage && "Failed to obtain a binary image.");
2987  }
2988  enqueueImpKernel(MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(),
2989  SyclKernel, KernelName, RawEvents, EventImpl,
2990  getMemAllocationFunc, ExecKernel->MKernelCacheConfig,
2991  ExecKernel->MKernelIsCooperative,
2992  ExecKernel->MKernelUsesClusterLaunch, BinImage);
2993 
2994  return UR_RESULT_SUCCESS;
2995  }
2996  case CGType::CopyUSM: {
2997  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2998  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
2999  Copy->getDst(), std::move(RawEvents), Event,
3000  MEvent);
3001  if (Event)
3002  MEvent->setHandle(*Event);
3003  return UR_RESULT_SUCCESS;
3004  }
3005  case CGType::FillUSM: {
3006  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
3007  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
3008  Fill->getPattern(), std::move(RawEvents), Event,
3009  MEvent);
3010  if (Event)
3011  MEvent->setHandle(*Event);
3012  return UR_RESULT_SUCCESS;
3013  }
3014  case CGType::PrefetchUSM: {
3015  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3016  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
3017  Prefetch->getLength(), std::move(RawEvents),
3018  Event, MEvent);
3019  if (Event)
3020  MEvent->setHandle(*Event);
3021  return UR_RESULT_SUCCESS;
3022  }
3023  case CGType::AdviseUSM: {
3024  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3025  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
3026  Advise->getAdvice(), std::move(RawEvents), Event,
3027  MEvent);
3028  if (Event)
3029  MEvent->setHandle(*Event);
3030  return UR_RESULT_SUCCESS;
3031  }
3032  case CGType::Copy2DUSM: {
3033  CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3034  MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
3035  Copy->getDst(), Copy->getDstPitch(),
3036  Copy->getWidth(), Copy->getHeight(),
3037  std::move(RawEvents), Event, MEvent);
3038  if (Event)
3039  MEvent->setHandle(*Event);
3040  return UR_RESULT_SUCCESS;
3041  }
3042  case CGType::Fill2DUSM: {
3043  CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
3044  MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
3045  Fill->getWidth(), Fill->getHeight(),
3046  Fill->getPattern(), std::move(RawEvents), Event,
3047  MEvent);
3048  if (Event)
3049  MEvent->setHandle(*Event);
3050  return UR_RESULT_SUCCESS;
3051  }
3052  case CGType::Memset2DUSM: {
3053  CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3054  MemoryManager::memset_2d_usm(Memset->getDst(), MQueue, Memset->getPitch(),
3055  Memset->getWidth(), Memset->getHeight(),
3056  Memset->getValue(), std::move(RawEvents),
3057  Event, MEvent);
3058  if (Event)
3059  MEvent->setHandle(*Event);
3060  return UR_RESULT_SUCCESS;
3061  }
3062  case CGType::CodeplayHostTask: {
3063  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
3064 
3065  for (ArgDesc &Arg : HostTask->MArgs) {
3066  switch (Arg.MType) {
3067  case kernel_param_kind_t::kind_accessor: {
3068  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
3069  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3070 
3071  if (AllocaCmd)
3072  Req->MData = AllocaCmd->getMemAllocation();
3073  break;
3074  }
3075  default:
3076  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3077  "Unsupported arg type " +
3078  codeToString(UR_RESULT_ERROR_INVALID_VALUE));
3079  }
3080  }
3081 
3082  std::vector<interop_handle::ReqToMem> ReqToMem;
3083  std::vector<ur_mem_handle_t> ReqUrMem;
3084 
3085  if (HostTask->MHostTask->isInteropTask()) {
3086  // Extract the Mem Objects for all Requirements, to ensure they are
3087  // available if a user asks for them inside the interop task scope
3088  const std::vector<Requirement *> &HandlerReq =
3089  HostTask->getRequirements();
3090  auto ReqToMemConv = [&ReqToMem, &ReqUrMem, HostTask](Requirement *Req) {
3091  const std::vector<AllocaCommandBase *> &AllocaCmds =
3092  Req->MSYCLMemObj->MRecord->MAllocaCommands;
3093 
3094  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3095  if (getContext(HostTask->MQueue) ==
3096  getContext(AllocaCmd->getQueue())) {
3097  auto MemArg = reinterpret_cast<ur_mem_handle_t>(
3098  AllocaCmd->getMemAllocation());
3099  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3100  ReqUrMem.emplace_back(MemArg);
3101 
3102  return;
3103  }
3104 
3105  assert(false &&
3106  "Can't get memory object due to no allocation available");
3107 
3108  throw sycl::exception(
3109  sycl::make_error_code(sycl::errc::runtime),
3110  "Can't get memory object due to no allocation available " +
3111  codeToString(UR_RESULT_ERROR_INVALID_MEM_OBJECT));
3112  };
3113  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3114  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3115  }
3116 
3117  // Host task is executed asynchronously so we should record where it was
3118  // submitted to report exception origin properly.
3119  copySubmissionCodeLocation();
3120 
3121  queue_impl::getThreadPool().submit<DispatchHostTask>(
3122  DispatchHostTask(this, std::move(ReqToMem), std::move(ReqUrMem)));
3123 
3124  MShouldCompleteEventIfPossible = false;
3125 
3126  return UR_RESULT_SUCCESS;
3127  }
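A user-side sketch of a host task that produces the CodeplayHostTask case above; the accessor is what becomes a Requirement resolved through the ReqToMem table (illustrative, not part of this file; the backend enumerator in the comment is an assumption):

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue q;
  std::vector<int> data(16, 1);
  {
    sycl::buffer<int, 1> buf{data.data(), sycl::range<1>(data.size())};
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor acc{buf, cgh, sycl::read_write};
      // Runs on a runtime-managed host thread via DispatchHostTask.
      cgh.host_task([acc](sycl::interop_handle ih) {
        // A backend-specific handle for acc could be queried here, e.g.
        // ih.get_native_mem<SomeBackend>(acc).
        (void)acc;
        (void)ih;
      });
    });
    q.wait();
  }
}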
3128  case CGType::EnqueueNativeCommand: {
3129  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
3130 
3131  for (ArgDesc &Arg : HostTask->MArgs) {
3132  switch (Arg.MType) {
3133  case kernel_param_kind_t::kind_accessor: {
3134  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
3135  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3136 
3137  if (AllocaCmd)
3138  Req->MData = AllocaCmd->getMemAllocation();
3139  break;
3140  }
3141  default:
3142  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3143  "Unsupported arg type ");
3144  }
3145  }
3146 
3147  std::vector<interop_handle::ReqToMem> ReqToMem;
3148  std::vector<ur_mem_handle_t> ReqMems;
3149 
3150  if (HostTask->MHostTask->isInteropTask()) {
3151  // Extract the Mem Objects for all Requirements, to ensure they are
3152  // available if a user asks for them inside the interop task scope
3153  const std::vector<Requirement *> &HandlerReq =
3154  HostTask->getRequirements();
3155  auto ReqToMemConv = [&ReqToMem, &ReqMems, HostTask](Requirement *Req) {
3156  const std::vector<AllocaCommandBase *> &AllocaCmds =
3157  Req->MSYCLMemObj->MRecord->MAllocaCommands;
3158 
3159  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3160  if (getContext(HostTask->MQueue) ==
3161  getContext(AllocaCmd->getQueue())) {
3162  auto MemArg = reinterpret_cast<ur_mem_handle_t>(
3163  AllocaCmd->getMemAllocation());
3164  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3165  ReqMems.emplace_back(MemArg);
3166 
3167  return;
3168  }
3169 
3170  assert(false &&
3171  "Can't get memory object due to no allocation available");
3172 
3173  throw sycl::exception(
3174  sycl::make_error_code(sycl::errc::runtime),
3175  "Can't get memory object due to no allocation available " +
3176  codeToString(UR_RESULT_ERROR_INVALID_MEM_OBJECT));
3177  };
3178  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3179  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3180  }
3181 
3182  EnqueueNativeCommandData CustomOpData{
3183  interop_handle{ReqToMem, HostTask->MQueue,
3184  HostTask->MQueue->getDeviceImplPtr(),
3185  HostTask->MQueue->getContextImplPtr()},
3186  HostTask->MHostTask->MInteropTask};
3187 
3188  ur_bool_t NativeCommandSupport = false;
3189  MQueue->getPlugin()->call<UrApiKind::urDeviceGetInfo>(
3190  detail::getSyclObjImpl(MQueue->get_device())->getHandleRef(),
3191  UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
3192  sizeof(NativeCommandSupport), &NativeCommandSupport, nullptr);
3193  assert(NativeCommandSupport && "ext_codeplay_enqueue_native_command is not "
3194  "supported on this device");
3195  MQueue->getPlugin()->call<UrApiKind::urEnqueueNativeCommandExp>(
3196  MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, ReqMems.size(),
3197  ReqMems.data(), nullptr, RawEvents.size(), RawEvents.data(), Event);
3198  if (Event)
3199  MEvent->setHandle(*Event);
3200  return UR_RESULT_SUCCESS;
3201  }
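A sketch of the user-facing entry point for the native-command case above, assuming the handler member ext_codeplay_enqueue_native_command from the extension named in the assert (illustrative only):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.submit([&](sycl::handler &cgh) {
    // The callable is dispatched through urEnqueueNativeCommandExp as in the
    // case above; inside it, native work could be enqueued via the interop_handle.
    cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
      (void)ih;
    });
  });
  q.wait();
}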
3202  case CGType::Barrier: {
3203  assert(MQueue && "Barrier submission should have an associated queue");
3204  const PluginPtr &Plugin = MQueue->getPlugin();
3205  if (MEvent != nullptr)
3206  MEvent->setHostEnqueueTime();
3207  Plugin->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
3208  MQueue->getHandleRef(), 0, nullptr, Event);
3209  if (Event)
3210  MEvent->setHandle(*Event);
3211  return UR_RESULT_SUCCESS;
3212  }
3213  case CGType::BarrierWaitlist: {
3214  assert(MQueue && "Barrier submission should have an associated queue");
3215  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
3216  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3217  std::vector<ur_event_handle_t> UrEvents = getUrEventsBlocking(Events);
3218  if (UrEvents.empty()) {
3219  // If Events is empty, then the barrier has no effect.
3220  return UR_RESULT_SUCCESS;
3221  }
3222  const PluginPtr &Plugin = MQueue->getPlugin();
3223  if (MEvent != nullptr)
3224  MEvent->setHostEnqueueTime();
3225  Plugin->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
3226  MQueue->getHandleRef(), UrEvents.size(), &UrEvents[0], Event);
3227  if (Event)
3228  MEvent->setHandle(*Event);
3229  return UR_RESULT_SUCCESS;
3230  }
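A sketch of the queue barriers that map to the two cases above, assuming the ext_oneapi_submit_barrier members from the sycl_ext_oneapi_enqueue_barrier extension (illustrative only):

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue q;
  sycl::event e1 = q.single_task([] {});
  sycl::event e2 = q.single_task([] {});

  // Barrier over an explicit wait list -> CGType::BarrierWaitlist.
  q.ext_oneapi_submit_barrier(std::vector<sycl::event>{e1, e2});
  // Barrier over everything previously submitted to the queue -> CGType::Barrier.
  q.ext_oneapi_submit_barrier();
  q.wait();
}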
3231  case CGType::ProfilingTag: {
3232  assert(MQueue && "Profiling tag requires a valid queue");
3233  const auto &Plugin = MQueue->getPlugin();
3234  // If the queue is not in-order, we need to insert a barrier. This barrier
3235  // does not need output events as it will implicitly enforce the following
3236  // enqueue is blocked until it finishes.
3237  if (!MQueue->isInOrder()) {
3238  // FIXME: Due to a bug in the L0 UR adapter, we will leak events if we do
3239  // not pass an output event to the UR call. Once that is fixed,
3240  // this immediately-deleted event can be removed.
3241  ur_event_handle_t PreTimestampBarrierEvent{};
3242  Plugin->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
3243  MQueue->getHandleRef(),
3244  /*num_events_in_wait_list=*/0,
3245  /*event_wait_list=*/nullptr, &PreTimestampBarrierEvent);
3246  Plugin->call<UrApiKind::urEventRelease>(PreTimestampBarrierEvent);
3247  }
3248 
3249  Plugin->call<UrApiKind::urEnqueueTimestampRecordingExp>(
3250  MQueue->getHandleRef(),
3251  /*blocking=*/false,
3252  /*num_events_in_wait_list=*/0, /*event_wait_list=*/nullptr, Event);
3253  if (Event)
3254  MEvent->setHandle(*Event);
3255  return UR_RESULT_SUCCESS;
3256  }
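A sketch of how the profiling-tag case above might be reached from user code; the free function submit_profiling_tag and its namespace are assumptions taken from the sycl_ext_oneapi_profiling_tag extension, not confirmed by this file:

#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue q{sycl::property::queue::enable_profiling{}};
  sycl::event start = syclex::submit_profiling_tag(q); // -> CGType::ProfilingTag
  q.single_task([] {});
  sycl::event end = syclex::submit_profiling_tag(q);
  q.wait();

  // Elapsed device time between the two tags, in nanoseconds.
  auto ns = end.get_profiling_info<sycl::info::event_profiling::command_end>() -
            start.get_profiling_info<sycl::info::event_profiling::command_end>();
  (void)ns;
}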
3257  case CGType::CopyToDeviceGlobal: {
3258  CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3259  MemoryManager::copy_to_device_global(
3260  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3261  Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3262  std::move(RawEvents), Event, MEvent);
3263  if (Event)
3264  MEvent->setHandle(*Event);
3265  return UR_RESULT_SUCCESS;
3266  }
3267  case CGType::CopyFromDeviceGlobal: {
3268  CGCopyFromDeviceGlobal *Copy =
3269  (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3270  MemoryManager::copy_from_device_global(
3271  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3272  Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3273  std::move(RawEvents), Event, MEvent);
3274  if (Event)
3275  MEvent->setHandle(*Event);
3276  return UR_RESULT_SUCCESS;
3277  }
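A sketch of the device_global copies served by the two cases above, assuming the queue::copy overloads and device_global::get() described in the sycl_ext_oneapi_device_global extension (illustrative only):

#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;

// Namespace-scope device global; kernels reference it directly rather than capturing it.
static syclex::device_global<int> DGVar;

int main() {
  sycl::queue q;
  int in = 41, out = 0;
  q.copy(&in, DGVar).wait();                      // -> CGType::CopyToDeviceGlobal
  q.single_task([] { DGVar.get() += 1; }).wait();
  q.copy(DGVar, &out).wait();                     // -> CGType::CopyFromDeviceGlobal
  // out is expected to be 42 at this point.
}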
3278  case CGType::ReadWriteHostPipe: {
3279  CGReadWriteHostPipe *ExecReadWriteHostPipe =
3280  (CGReadWriteHostPipe *)MCommandGroup.get();
3281  std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3282  void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3283  size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3284  bool blocking = ExecReadWriteHostPipe->isBlocking();
3285  bool read = ExecReadWriteHostPipe->isReadHostPipe();
3286 
3287  if (!EventImpl) {
3288  EventImpl = MEvent;
3289  }
3290  return enqueueReadWriteHostPipe(MQueue, pipeName, blocking, hostPtr,
3291  typeSize, RawEvents, EventImpl, read);
3292  }
3293  case CGType::ExecCommandBuffer: {
3294  assert(MQueue &&
3295  "Command buffer submissions should have an associated queue");
3296  CGExecCommandBuffer *CmdBufferCG =
3297  static_cast<CGExecCommandBuffer *>(MCommandGroup.get());
3298  if (MEvent != nullptr)
3299  MEvent->setHostEnqueueTime();
3300  ur_result_t Err =
3301  MQueue->getPlugin()->call_nocheck<UrApiKind::urCommandBufferEnqueueExp>(
3302  CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(),
3303  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3304  Event);
3305  if (Event)
3306  MEvent->setHandle(*Event);
3307 
3308  return Err;
3309  }
3310  case CGType::CopyImage: {
3311  CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3312 
3313  MemoryManager::copy_image_bindless(
3314  MQueue, Copy->getSrc(), Copy->getDst(), Copy->getSrcDesc(),
3315  Copy->getDstDesc(), Copy->getSrcFormat(), Copy->getDstFormat(),
3316  Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3317  Copy->getCopyExtent(), std::move(RawEvents), Event);
3318  if (Event)
3319  MEvent->setHandle(*Event);
3320 
3321  return UR_RESULT_SUCCESS;
3322  }
3323  case CGType::SemaphoreWait: {
3324  assert(MQueue &&
3325  "Semaphore wait submissions should have an associated queue");
3326  CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3327  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3328  auto OptWaitValue = SemWait->getWaitValue();
3329  uint64_t WaitValue = OptWaitValue.has_value() ? OptWaitValue.value() : 0;
3330  Plugin->call<UrApiKind::urBindlessImagesWaitExternalSemaphoreExp>(
3331  MQueue->getHandleRef(), SemWait->getExternalSemaphore(),
3332  OptWaitValue.has_value(), WaitValue, 0, nullptr, nullptr);
3333 
3334  return UR_RESULT_SUCCESS;
3335  }
3336  case CGType::SemaphoreSignal: {
3337  assert(MQueue &&
3338  "Semaphore signal submissions should have an associated queue");
3339  CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3340  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3341  auto OptSignalValue = SemSignal->getSignalValue();
3342  uint64_t SignalValue =
3343  OptSignalValue.has_value() ? OptSignalValue.value() : 0;
3344  Plugin->call<UrApiKind::urBindlessImagesSignalExternalSemaphoreExp>(
3345  MQueue->getHandleRef(), SemSignal->getExternalSemaphore(),
3346  OptSignalValue.has_value(), SignalValue, 0, nullptr, nullptr);
3347 
3348  return UR_RESULT_SUCCESS;
3349  }
3350  case CGType::None:
3351  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3352  "CG type not implemented. " +
3353  codeToString(UR_RESULT_ERROR_INVALID_OPERATION));
3354  }
3355  return UR_RESULT_ERROR_INVALID_OPERATION;
3356 }
3357 
3358 bool ExecCGCommand::producesPiEvent() const {
3359  return !MCommandBuffer &&
3360  MCommandGroup->getType() != CGType::CodeplayHostTask;
3361 }
3362 
3363 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
3364  // Host tasks are cleaned up upon completion instead.
3365  return Command::supportsPostEnqueueCleanup() &&
3366  (MCommandGroup->getType() != CGType::CodeplayHostTask);
3367 }
3368 
3369 bool ExecCGCommand::readyForCleanup() const {
3370  if (MCommandGroup->getType() == CGType::CodeplayHostTask)
3371  return MLeafCounter == 0 && MEvent->isCompleted();
3372  return Command::readyForCleanup();
3373 }
3374 
3375 UpdateCommandBufferCommand::UpdateCommandBufferCommand(
3376  QueueImplPtr Queue,
3377  ext::oneapi::experimental::detail::exec_graph_impl *Graph,
3378  std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
3379  Nodes)
3380  : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph),
3381  MNodes(Nodes) {}
3382 
3383 ur_result_t UpdateCommandBufferCommand::enqueueImp() {
3384  waitForPreparedHostEvents();
3385  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
3386  ur_event_handle_t UREvent = nullptr;
3387  Command::waitForEvents(MQueue, EventImpls, UREvent);
3388  MEvent->setHandle(UREvent);
3389 
3390  for (auto &Node : MNodes) {
3391  auto CG = static_cast<CGExecKernel *>(Node->MCommandGroup.get());
3392  for (auto &Arg : CG->MArgs) {
3393  if (Arg.MType != kernel_param_kind_t::kind_accessor) {
3394  continue;
3395  }
3396  // Search through deps to get actual allocation for accessor args.
3397  for (const DepDesc &Dep : MDeps) {
3398  Requirement *Req = static_cast<AccessorImplHost *>(Arg.MPtr);
3399  if (Dep.MDepRequirement == Req) {
3400  if (Dep.MAllocaCmd) {
3401  Req->MData = Dep.MAllocaCmd->getMemAllocation();
3402  } else {
3403  throw sycl::exception(make_error_code(errc::invalid),
3404  "No allocation available for accessor when "
3405  "updating command buffer!");
3406  }
3407  }
3408  }
3409  }
3410  MGraph->updateImpl(Node);
3411  }
3412 
3413  return UR_RESULT_SUCCESS;
3414 }
3415 
3416 void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const {
3417  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
3418 
3419  Stream << "ID = " << this << "\\n";
3420  Stream << "CommandBuffer Command Update"
3421  << "\\n";
3422 
3423  Stream << "\"];" << std::endl;
3424 
3425  for (const auto &Dep : MDeps) {
3426  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3427  << " [ label = \"Access mode: "
3428  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3429  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3430  << std::endl;
3431  }
3432 }
3433 
3434 void UpdateCommandBufferCommand::emitInstrumentationData() {}
3435 bool UpdateCommandBufferCommand::producesPiEvent() const { return false; }
3436 
3437 } // namespace detail
3438 } // namespace _V1
3439 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:50
Base class for memory allocation commands.
Definition: commands.hpp:443
const Requirement * getRequirement() const final
Definition: commands.hpp:454
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:471
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1064
virtual void * getMemAllocation() const =0
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1060
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:473
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
Definition: commands.hpp:477
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
Definition: commands.cpp:1023
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1062
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1034
SYCLMemObjI * getSYCLMemObj() const
Definition: commands.hpp:450
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1120
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1084
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
Definition: commands.cpp:1066
void * getMemAllocation() const final
Definition: commands.cpp:1173
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1158
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1201
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:1140
sycl::detail::kernel_param_kind_t MType
Definition: cg.hpp:55
"Execute kernel" command group class.
Definition: cg.hpp:246
std::shared_ptr< detail::kernel_bundle_impl > MKernelBundle
Definition: cg.hpp:252
std::vector< ArgDesc > MArgs
Definition: cg.hpp:253
NDRDescT MNDRDesc
Stores ND-range description.
Definition: cg.hpp:249
std::shared_ptr< detail::kernel_impl > MSyclKernel
Definition: cg.hpp:251
std::shared_ptr< detail::queue_impl > MQueue
Definition: cg.hpp:677
Base class for all types of command groups.
Definition: cg.hpp:160
CGType getType() const
Definition: cg.hpp:207
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:109
virtual ur_result_t enqueueImp()=0
Private interface. Derived classes should implement this method.
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
Definition: commands.cpp:752
void * MTraceEvent
The event for node_create and task_begin.
Definition: commands.hpp:338
CommandType MType
The type of the command.
Definition: commands.hpp:287
virtual bool producesPiEvent() const
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:791
int32_t MStreamID
The stream under which the traces are emitted.
Definition: commands.hpp:342
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
Definition: commands.cpp:849
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
Definition: commands.cpp:952
std::string MCommandName
Buffer to build the command end-user understandable name.
Definition: commands.hpp:351
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
Definition: commands.hpp:376
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:313
std::vector< ur_exp_command_buffer_sync_point_t > MSyncPointDeps
List of sync points for submissions to a command buffer.
Definition: commands.hpp:396
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
Definition: commands.cpp:860
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, ur_event_handle_t &Event)
Definition: commands.cpp:498
void waitForPreparedHostEvents() const
Definition: commands.cpp:493
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
Definition: commands.hpp:362
std::vector< ur_event_handle_t > getUrEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:232
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
Definition: commands.hpp:289
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
Definition: commands.cpp:581
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
Definition: commands.hpp:359
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:739
std::vector< EventImplPtr > & MPreparedHostDepsEvents
Definition: commands.hpp:261
void emitEnqueuedEventSignal(const ur_event_handle_t UrEventAddr)
Creates a signal event with the enqueued kernel event handle.
Definition: commands.cpp:840
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Definition: commands.hpp:260
std::string MSubmissionFunctionName
Definition: commands.hpp:363
uint64_t MInstanceID
Instance ID tracked for the command.
Definition: commands.hpp:355
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:307
const char * getBlockReason() const
Definition: commands.cpp:994
std::vector< ur_event_handle_t > getUrEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
Definition: commands.cpp:257
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:795
void addUser(Command *NewUser)
Definition: commands.hpp:143
virtual ContextImplPtr getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:785
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
Definition: commands.hpp:330
const EventImplPtr & getEvent() const
Definition: commands.hpp:182
ur_exp_command_buffer_handle_t MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
Definition: commands.hpp:394
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:706
friend class DispatchHostTask
Definition: commands.hpp:291
CommandType getType() const
Definition: commands.hpp:146
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
Definition: commands.hpp:345
std::string MAddressString
Buffer to build the address string.
Definition: commands.hpp:347
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
Definition: commands.hpp:311
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:793
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
Definition: commands.hpp:353
std::string MCommandNodeType
Buffer to build the command node type.
Definition: commands.hpp:349
void emitEdgeEventForEventDependence(Command *Cmd, ur_event_handle_t &EventAddr)
Creates an edge event when the dependency is an event.
Definition: commands.cpp:648
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:800
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
Definition: commands.cpp:597
const QueueImplPtr & getQueue() const
Definition: commands.hpp:180
Command(CommandType Type, QueueImplPtr Queue, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
Definition: commands.cpp:555
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem, std::vector< ur_mem_handle_t > ReqUrMem)
Definition: commands.cpp:379
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1799
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1781
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
Definition: commands.cpp:1740
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1756
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:631
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:3358
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &Dependencies={})
Definition: commands.cpp:1916
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:3363
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:3369
std::string_view getTypeString() const
Definition: commands.cpp:2202
detail::CG & getCG() const
Definition: commands.hpp:646
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:2168
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
Definition: commands.cpp:1597
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:2120
void TraceEventXPTI(const char *Message)
static GlobalHandler & instance()
void call(HostProfilingInfo *HPI)
Definition: host_task.hpp:35
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1345
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
Definition: commands.cpp:1336
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1379
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1688
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1646
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1664
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1801
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1518
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1475
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1494
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1524
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1566
static void ext_oneapi_fill_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *DstMem, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_image_bindless(QueueImplPtr Queue, const void *Src, void *Dst, const ur_image_desc_t &SrcDesc, const ur_image_desc_t &DstDesc, const ur_image_format_t &SrcFormat, const ur_image_format_t &DstFormat, const ur_exp_image_copy_flags_t Flags, ur_rect_offset_t SrcOffset, ur_rect_offset_t DstOffset, ur_rect_region_t CopyExtent, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent)
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_advise_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Length, ur_usm_advice_flags_t Advice, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *Mem, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, ur_usm_advice_flags_t Advice, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, ur_exp_command_buffer_handle_t CommandBuffer, size_t Len, void *DstMem, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_prefetch_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
sycl::range< 3 > GlobalSize
Definition: cg.hpp:148
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
Definition: cg.hpp:154
sycl::id< 3 > GlobalOffset
Definition: cg.hpp:150
sycl::range< 3 > LocalSize
Definition: cg.hpp:149
sycl::range< 3 > ClusterDimensions
Definition: cg.hpp:155
std::tuple< ur_kernel_handle_t, std::mutex *, const KernelArgMask *, ur_program_handle_t > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc={})
static ProgramManager & getInstance()
kernel_id getSYCLKernelID(const std::string &KernelName)
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
bool kernelUsesAssert(const std::string &KernelName) const
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1330
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1332
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
Definition: commands.cpp:1222
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1312
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1227
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:1334
static const char * get()
Definition: config.hpp:115
virtual MemObjType getType() const =0
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
Definition: scheduler.cpp:139
ur_kernel_handle_t completeSpecConstMaterialization(QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, const std::string &KernelName, std::vector< unsigned char > &SpecConstBlob)
Definition: scheduler.cpp:581
static Scheduler & getInstance()
Definition: scheduler.cpp:249
void NotifyHostTaskCompletion(Command *Cmd)
Definition: scheduler.cpp:441
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1458
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
Definition: commands.cpp:1396
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1403
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:1422
UpdateCommandBufferCommand(QueueImplPtr Queue, ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector< std::shared_ptr< ext::oneapi::experimental::detail::node_impl >> Nodes)
Definition: commands.cpp:3375
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:3416
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:3434
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
Definition: commands.cpp:3435
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
Definition: commands.cpp:1818
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1827
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1623
static void bufferAssociateNotification(const void *, const void *)
const PluginPtr & getPlugin() const
ur_device_handle_t & getHandleRef()
Get reference to UR device.
Definition: device_impl.hpp:65
static ThreadPool & getThreadPool()
Definition: queue_impl.hpp:629
Data type that manages the code_location information in TLS.
Definition: common.hpp:128
const detail::code_location & query()
Query the information in the TLS slot.
Definition: common.cpp:66
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
Class representing the implementation of command_graph<executable>.
void updateImpl(std::shared_ptr< node_impl > NodeImpl)
Objects of the class identify kernel is some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:71
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
Objects of the property_list class are containers for the SYCL properties.
std::function< void(interop_handle)> func
Definition: commands.cpp:322
sycl::interop_handle ih
Definition: commands.cpp:321
void handleErrorOrWarning(ur_result_t Error, const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of urEnqueueKernelLaunch to emit user-friendly exception describing...
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:259
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:31
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
Definition: commands.cpp:311
static std::string demangleKernelName(std::string Name)
Definition: commands.cpp:120
static ur_result_t SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, ur_kernel_handle_t Kernel, NDRDescT &NDRDesc, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage, const std::string &KernelName)
Definition: commands.cpp:2339
std::vector< bool > KernelArgMask
std::string codeToString(int32_t code)
Definition: exception.hpp:57
constexpr const char * SYCL_STREAM_NAME
ur_mem_flags_t AccessModeToUr(access::mode AccessorMode)
Definition: commands.cpp:2252
static std::string_view cgTypeToString(detail::CGType Type)
Definition: commands.cpp:1847
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
Definition: commands.cpp:2244
void enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage)
Definition: commands.cpp:2552
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
void SetArgBasedOnType(const PluginPtr &Plugin, ur_kernel_handle_t Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex)
Definition: commands.cpp:2264
std::shared_ptr< event_impl > EventImplPtr
Definition: handler.hpp:183
std::shared_ptr< plugin > PluginPtr
Definition: ur.hpp:107
AccessorImplHost Requirement
ur_result_t enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, ur_exp_command_buffer_handle_t CommandBuffer, const CGExecKernel &CommandGroup, std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints, ur_exp_command_buffer_sync_point_t *OutSyncPoint, ur_exp_command_buffer_command_handle_t *OutCommand, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2448
std::shared_ptr< device_impl > DeviceImplPtr
CGType
Type of the command group.
Definition: cg_types.hpp:42
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel, const device_impl &DeviceImpl)
Definition: commands.cpp:2216
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
Definition: commands.cpp:123
ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
Definition: commands.cpp:2669
auto tie(Ts &...Args)
Definition: tuple.hpp:39
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: helpers.hpp:45
int32_t get_ur_error(const exception &e)
Definition: exception.hpp:156
static ContextImplPtr getContext(const QueueImplPtr &Queue)
Definition: commands.cpp:98
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(ur_kernel_handle_t Kernel, ur_device_handle_t Device, const PluginPtr &Plugin)
Definition: kernel_info.hpp:96
static std::string accessModeToString(access::mode Mode)
Definition: commands.cpp:157
std::string queueDeviceToString(const queue_impl *const &Queue)
std::tuple< const RTDeviceBinaryImage *, ur_program_handle_t > retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, CGExecKernel *CGKernel=nullptr)
Definition: helpers.cpp:38
Function for_each(Group g, Ptr first, Ptr last, Function f)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, _Tp > arg(const complex< _Tp > &__c)
PropertyListT int access::address_space multi_ptr & operator=(multi_ptr &&)=default
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:65
Definition: access.hpp:18
Dependency between two commands.
Definition: commands.hpp:83
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:96
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:94
Result of command enqueueing.
Definition: commands.hpp:64