DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.cpp
//==-------- handler.cpp --- SYCL command group handler --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ur_api.h"
#include "sycl/detail/helpers.hpp"
#include <algorithm>

#include <detail/config.hpp>
#include <detail/graph_impl.hpp>
#include <detail/handler_impl.hpp>
#include <detail/host_task.hpp>
#include <detail/image_impl.hpp>
#include <detail/kernel_bundle_impl.hpp>
#include <detail/kernel_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/usm/usm_impl.hpp>
#include <sycl/detail/common.hpp>
#include <sycl/detail/helpers.hpp>
#include <sycl/detail/kernel_desc.hpp>
#include <sycl/detail/ur.hpp>
#include <sycl/event.hpp>
#include <sycl/handler.hpp>
#include <sycl/info/info_desc.hpp>
#include <sycl/stream.hpp>

namespace sycl {
inline namespace _V1 {

namespace detail {

bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr) {
  DeviceGlobalMapEntry *DGEntry =
      detail::ProgramManager::getInstance().getDeviceGlobalEntry(
          DeviceGlobalPtr);
  return DGEntry && !DGEntry->MImageIdentifiers.empty();
}

ur_exp_image_copy_flags_t getUrImageCopyFlags(sycl::usm::alloc SrcPtrType,
                                              sycl::usm::alloc DstPtrType) {
  if (DstPtrType == sycl::usm::alloc::device) {
    // Dest is on device
    if (SrcPtrType == sycl::usm::alloc::device)
      return UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE;
    if (SrcPtrType == sycl::usm::alloc::host ||
        SrcPtrType == sycl::usm::alloc::unknown)
      return UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE;
    throw sycl::exception(make_error_code(errc::invalid),
                          "Unknown copy source location");
  }
  if (DstPtrType == sycl::usm::alloc::host ||
      DstPtrType == sycl::usm::alloc::unknown) {
    // Dest is on host
    if (SrcPtrType == sycl::usm::alloc::device)
      return UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST;
    if (SrcPtrType == sycl::usm::alloc::host ||
        SrcPtrType == sycl::usm::alloc::unknown)
      throw sycl::exception(make_error_code(errc::invalid),
                            "Cannot copy image from host to host");
    throw sycl::exception(make_error_code(errc::invalid),
                          "Unknown copy source location");
  }
  throw sycl::exception(make_error_code(errc::invalid),
                        "Unknown copy destination location");
}
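
// Illustrative sketch (not part of the runtime sources): how USM allocation
// kinds map to UR image copy directions through getUrImageCopyFlags.
//
//   // Host-resident source, device-resident destination:
//   ur_exp_image_copy_flags_t Flags = detail::getUrImageCopyFlags(
//       sycl::usm::alloc::host, sycl::usm::alloc::device);
//   // Flags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE
//
// A host-to-host combination throws, as does an unrecognized source or
// destination kind.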

void *getValueFromDynamicParameter(
    ext::oneapi::experimental::detail::dynamic_parameter_base
        &DynamicParamBase) {
  return sycl::detail::getSyclObjImpl(DynamicParamBase)->getValue();
}

} // namespace detail

handler::handler(std::shared_ptr<detail::queue_impl> Queue,
                 bool CallerNeedsEvent)
    : handler(Queue, Queue, nullptr, CallerNeedsEvent) {}

handler::handler(std::shared_ptr<detail::queue_impl> Queue,
                 std::shared_ptr<detail::queue_impl> PrimaryQueue,
                 std::shared_ptr<detail::queue_impl> SecondaryQueue,
                 bool CallerNeedsEvent)
    : impl(std::make_shared<detail::handler_impl>(std::move(PrimaryQueue),
                                                  std::move(SecondaryQueue),
                                                  CallerNeedsEvent)),
      MQueue(std::move(Queue)) {}

handler::handler(
    std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph)
    : impl(std::make_shared<detail::handler_impl>(Graph)) {}

// Sets the submission state to indicate that an explicit kernel bundle has been
// set. Throws a sycl::exception with errc::invalid if the current state
// indicates that a specialization constant has been set.
void handler::setStateExplicitKernelBundle() {
  impl->setStateExplicitKernelBundle();
}

// Sets the submission state to indicate that a specialization constant has been
// set. Throws a sycl::exception with errc::invalid if the current state
// indicates that an explicit kernel bundle has been set.
void handler::setStateSpecConstSet() { impl->setStateSpecConstSet(); }

// Returns true if the submission state is EXPLICIT_KERNEL_BUNDLE_STATE and
// false otherwise.
bool handler::isStateExplicitKernelBundle() const {
  return impl->isStateExplicitKernelBundle();
}

// Returns a shared_ptr to the kernel_bundle.
// If there is no kernel_bundle created:
// returns a newly created kernel_bundle if Insert is true,
// returns shared_ptr(nullptr) if Insert is false.
std::shared_ptr<detail::kernel_bundle_impl>
handler::getOrInsertHandlerKernelBundle(bool Insert) const {
  if (!impl->MKernelBundle && Insert) {
    auto Ctx =
        impl->MGraph ? impl->MGraph->getContext() : MQueue->get_context();
    auto Dev = impl->MGraph ? impl->MGraph->getDevice() : MQueue->get_device();
    impl->MKernelBundle = detail::getSyclObjImpl(
        get_kernel_bundle<bundle_state::input>(Ctx, {Dev}, {}));
  }
  return impl->MKernelBundle;
}
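
// Illustrative sketch (not part of the runtime sources): with Insert=true a
// missing bundle is lazily created in input state for the handler's context
// and device, e.g. before applying specialization constants:
//
//   auto BundleImpl = getOrInsertHandlerKernelBundle(/*Insert=*/true);
//   // BundleImpl wraps get_kernel_bundle<bundle_state::input>(Ctx, {Dev}, {})
//
// With Insert=false the call merely queries whether a bundle is already set.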

// Sets kernel bundle to the provided one.
void handler::setHandlerKernelBundle(
    const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr) {
  impl->MKernelBundle = NewKernelBundleImpPtr;
}

void handler::setHandlerKernelBundle(kernel Kernel) {
  // Kernel may not have an associated kernel bundle if it is created from a
  // program. As such, apply getSyclObjImpl directly on the kernel, i.e. not
  // the other way around: getSyclObjImpl(Kernel->get_kernel_bundle()).
  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImpl =
      detail::getSyclObjImpl(Kernel)->get_kernel_bundle();
  setHandlerKernelBundle(KernelBundleImpl);
}

event handler::finalize() {
  // This block of code is needed only for the reduction implementation.
  // It is harmless (does nothing) for everything else.
  if (MIsFinalized)
    return MLastEvent;
  MIsFinalized = true;

  // According to 4.7.6.9 of the SYCL 2020 spec, if a placeholder accessor is
  // passed to a command without being bound to a command group, an exception
  // should be thrown.
  {
    for (const auto &arg : impl->MArgs) {
      if (arg.MType != detail::kernel_param_kind_t::kind_accessor)
        continue;

      detail::Requirement *AccImpl =
          static_cast<detail::Requirement *>(arg.MPtr);
      if (AccImpl->MIsPlaceH) {
        auto It = std::find(impl->CGData.MRequirements.begin(),
                            impl->CGData.MRequirements.end(), AccImpl);
        if (It == impl->CGData.MRequirements.end())
          throw sycl::exception(make_error_code(errc::kernel_argument),
                                "placeholder accessor must be bound by calling "
                                "handler::require() before it can be used.");

        // Check associated accessors
        bool AccFound = false;
        for (detail::ArgDesc &Acc : impl->MAssociatedAccesors) {
          if (Acc.MType == detail::kernel_param_kind_t::kind_accessor &&
              static_cast<detail::Requirement *>(Acc.MPtr) == AccImpl) {
            AccFound = true;
            break;
          }
        }

        if (!AccFound) {
          throw sycl::exception(make_error_code(errc::kernel_argument),
                                "placeholder accessor must be bound by calling "
                                "handler::require() before it can be used.");
        }
      }
    }
  }

  const auto &type = getType();
  if (type == detail::CGType::Kernel) {
    // If there were uses of set_specialization_constant, build the kernel_bundle
    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImpPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/false);
    if (KernelBundleImpPtr) {
      // Make sure implicit non-interop kernel bundles have the kernel
      if (!KernelBundleImpPtr->isInterop() &&
          !impl->isStateExplicitKernelBundle()) {
        auto Dev =
            impl->MGraph ? impl->MGraph->getDevice() : MQueue->get_device();
        kernel_id KernelID =
            detail::ProgramManager::getInstance().getSYCLKernelID(
                MKernelName.c_str());
        bool KernelInserted = KernelBundleImpPtr->add_kernel(KernelID, Dev);
        // If the kernel was not inserted and the bundle is in input mode, try
        // building it and looking the kernel up in executable mode.
        if (!KernelInserted &&
            KernelBundleImpPtr->get_bundle_state() == bundle_state::input) {
          auto KernelBundle =
              detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
                  KernelBundleImpPtr);
          kernel_bundle<bundle_state::executable> ExecKernelBundle =
              build(KernelBundle);
          KernelBundleImpPtr = detail::getSyclObjImpl(ExecKernelBundle);
          setHandlerKernelBundle(KernelBundleImpPtr);
          KernelInserted = KernelBundleImpPtr->add_kernel(KernelID, Dev);
        }
        // If the kernel was not found in executable mode we throw an exception
        if (!KernelInserted)
          throw sycl::exception(make_error_code(errc::runtime),
                                "Failed to add kernel to kernel bundle.");
      }

      switch (KernelBundleImpPtr->get_bundle_state()) {
      case bundle_state::input: {
        // Underlying level expects kernel_bundle to be in executable state
        kernel_bundle<bundle_state::executable> ExecBundle = build(
            detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
                KernelBundleImpPtr));
        KernelBundleImpPtr = detail::getSyclObjImpl(ExecBundle);
        setHandlerKernelBundle(KernelBundleImpPtr);
        break;
      }
      case bundle_state::executable:
        // Nothing to do
        break;
      case bundle_state::object:
      case bundle_state::ext_oneapi_source:
        assert(0 && "Expected that the bundle is either in input or executable "
                    "states.");
        break;
      }
    }

    if (MQueue && !impl->MGraph && !impl->MSubgraphNode &&
        !MQueue->getCommandGraph() && !MQueue->is_in_fusion_mode() &&
        !impl->CGData.MRequirements.size() && !MStreamStorage.size() &&
        (!impl->CGData.MEvents.size() ||
         (MQueue->isInOrder() &&
          detail::Scheduler::areEventsSafeForSchedulerBypass(
              impl->CGData.MEvents, MQueue->getContextImplPtr())))) {
      // If the user has not added a new dependency to the dependency graph,
      // i.e. the graph is unchanged, and the queue is not in fusion mode,
      // this faster path is used to submit the kernel, bypassing the scheduler
      // and avoiding creation of CommandGroup and Command objects.

      std::vector<ur_event_handle_t> RawEvents;
      detail::EventImplPtr NewEvent;

#ifdef XPTI_ENABLE_INSTRUMENTATION
      // uint32_t StreamID, uint64_t InstanceID, xpti_td* TraceEvent,
      int32_t StreamID = xptiRegisterStream(detail::SYCL_STREAM_NAME);
      auto [CmdTraceEvent, InstanceID] = emitKernelInstrumentationData(
          StreamID, MKernel, MCodeLoc, MKernelName.c_str(), MQueue,
          impl->MNDRDesc, KernelBundleImpPtr, impl->MArgs);
      auto EnqueueKernel = [&, CmdTraceEvent = CmdTraceEvent,
                            InstanceID = InstanceID]() {
#else
      auto EnqueueKernel = [&]() {
#endif
#ifdef XPTI_ENABLE_INSTRUMENTATION
        detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
                                           xpti::trace_task_begin, nullptr);
#endif
        const detail::RTDeviceBinaryImage *BinImage = nullptr;
        if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
          std::tie(BinImage, std::ignore) =
              detail::retrieveKernelBinary(MQueue, MKernelName.c_str());
          assert(BinImage && "Failed to obtain a binary image.");
        }
        enqueueImpKernel(MQueue, impl->MNDRDesc, impl->MArgs,
                         KernelBundleImpPtr, MKernel, MKernelName.c_str(),
                         RawEvents, NewEvent, nullptr, impl->MKernelCacheConfig,
                         impl->MKernelIsCooperative,
                         impl->MKernelUsesClusterLaunch, BinImage);
#ifdef XPTI_ENABLE_INSTRUMENTATION
        // Emit signal only when event is created
        if (NewEvent != nullptr) {
          detail::emitInstrumentationGeneral(
              StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
              static_cast<const void *>(NewEvent->getHandleRef()));
        }
        detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
                                           xpti::trace_task_end, nullptr);
#endif
      };

      bool DiscardEvent = (MQueue->MDiscardEvents || !impl->MEventNeeded) &&
                          MQueue->supportsDiscardingPiEvents();
      if (DiscardEvent) {
        // A kernel only uses assert if it is a non-interop one
        bool KernelUsesAssert =
            !(MKernel && MKernel->isInterop()) &&
            detail::ProgramManager::getInstance().kernelUsesAssert(
                MKernelName.c_str());
        DiscardEvent = !KernelUsesAssert;
      }

      if (DiscardEvent) {
        EnqueueKernel();
        auto EventImpl = std::make_shared<detail::event_impl>(
            detail::event_impl::HES_Discarded);
        MLastEvent = detail::createSyclObjFromImpl<event>(EventImpl);
      } else {
        NewEvent = std::make_shared<detail::event_impl>(MQueue);
        NewEvent->setWorkerQueue(MQueue);
        NewEvent->setContextImpl(MQueue->getContextImplPtr());
        NewEvent->setStateIncomplete();
        NewEvent->setSubmissionTime();

        EnqueueKernel();
        if (NewEvent->isHost() || NewEvent->getHandleRef() == nullptr)
          NewEvent->setComplete();
        NewEvent->setEnqueued();

        MLastEvent = detail::createSyclObjFromImpl<event>(NewEvent);
      }
      return MLastEvent;
    }
  }

  std::unique_ptr<detail::CG> CommandGroup;
  switch (type) {
  case detail::CGType::Kernel: {
    // Copy the kernel name here instead of moving it so that it is still
    // available after this method runs, which the reduction implementation
    // relies on. This allows the assert feature to check whether the kernel
    // uses assertions.
    CommandGroup.reset(new detail::CGExecKernel(
        std::move(impl->MNDRDesc), std::move(MHostKernel), std::move(MKernel),
        std::move(impl->MKernelBundle), std::move(impl->CGData),
        std::move(impl->MArgs), MKernelName.c_str(), std::move(MStreamStorage),
        std::move(impl->MAuxiliaryResources), getType(),
        impl->MKernelCacheConfig, impl->MKernelIsCooperative,
        impl->MKernelUsesClusterLaunch, MCodeLoc));
    break;
  }
  case detail::CGType::CopyAccToPtr:
  case detail::CGType::CopyPtrToAcc:
  case detail::CGType::CopyAccToAcc:
    CommandGroup.reset(
        new detail::CGCopy(getType(), MSrcPtr, MDstPtr, std::move(impl->CGData),
                           std::move(impl->MAuxiliaryResources), MCodeLoc));
    break;
  case detail::CGType::Fill:
    CommandGroup.reset(new detail::CGFill(std::move(MPattern), MDstPtr,
                                          std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::UpdateHost:
    CommandGroup.reset(
        new detail::CGUpdateHost(MDstPtr, std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::CopyUSM:
    CommandGroup.reset(new detail::CGCopyUSM(
        MSrcPtr, MDstPtr, MLength, std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::FillUSM:
    CommandGroup.reset(new detail::CGFillUSM(std::move(MPattern), MDstPtr,
                                             MLength, std::move(impl->CGData),
                                             MCodeLoc));
    break;
  case detail::CGType::PrefetchUSM:
    CommandGroup.reset(new detail::CGPrefetchUSM(
        MDstPtr, MLength, std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::AdviseUSM:
    CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice,
                                               std::move(impl->CGData),
                                               getType(), MCodeLoc));
    break;
  case detail::CGType::Copy2DUSM:
    CommandGroup.reset(new detail::CGCopy2DUSM(
        MSrcPtr, MDstPtr, impl->MSrcPitch, impl->MDstPitch, impl->MWidth,
        impl->MHeight, std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::Fill2DUSM:
    CommandGroup.reset(new detail::CGFill2DUSM(
        std::move(MPattern), MDstPtr, impl->MDstPitch, impl->MWidth,
        impl->MHeight, std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::Memset2DUSM:
    CommandGroup.reset(new detail::CGMemset2DUSM(
        MPattern[0], MDstPtr, impl->MDstPitch, impl->MWidth, impl->MHeight,
        std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::CodeplayHostTask: {
    auto context = impl->MGraph
                       ? detail::getSyclObjImpl(impl->MGraph->getContext())
                       : MQueue->getContextImplPtr();
    CommandGroup.reset(new detail::CGHostTask(
        std::move(impl->MHostTask), MQueue, context, std::move(impl->MArgs),
        std::move(impl->CGData), getType(), MCodeLoc));
    break;
  }
  case detail::CGType::Barrier:
  case detail::CGType::BarrierWaitlist: {
    if (auto GraphImpl = getCommandGraph(); GraphImpl != nullptr) {
      impl->CGData.MEvents.insert(std::end(impl->CGData.MEvents),
                                  std::begin(impl->MEventsWaitWithBarrier),
                                  std::end(impl->MEventsWaitWithBarrier));
      // A barrier node is implemented as an empty node in a graph,
      // but the barrier type is kept to help manage dependencies.
      setType(detail::CGType::Barrier);
      CommandGroup.reset(new detail::CG(detail::CGType::Barrier,
                                        std::move(impl->CGData), MCodeLoc));
    } else {
      CommandGroup.reset(
          new detail::CGBarrier(std::move(impl->MEventsWaitWithBarrier),
                                std::move(impl->CGData), getType(), MCodeLoc));
    }
    break;
  }
  case detail::CGType::ProfilingTag: {
    CommandGroup.reset(
        new detail::CGProfilingTag(std::move(impl->CGData), MCodeLoc));
    break;
  }
  case detail::CGType::CopyToDeviceGlobal: {
    CommandGroup.reset(new detail::CGCopyToDeviceGlobal(
        MSrcPtr, MDstPtr, impl->MIsDeviceImageScoped, MLength, impl->MOffset,
        std::move(impl->CGData), MCodeLoc));
    break;
  }
  case detail::CGType::CopyFromDeviceGlobal: {
    CommandGroup.reset(new detail::CGCopyFromDeviceGlobal(
        MSrcPtr, MDstPtr, impl->MIsDeviceImageScoped, MLength, impl->MOffset,
        std::move(impl->CGData), MCodeLoc));
    break;
  }
  case detail::CGType::ReadWriteHostPipe: {
    CommandGroup.reset(new detail::CGReadWriteHostPipe(
        impl->HostPipeName, impl->HostPipeBlocking, impl->HostPipePtr,
        impl->HostPipeTypeSize, impl->HostPipeRead, std::move(impl->CGData),
        MCodeLoc));
    break;
  }
  case detail::CGType::ExecCommandBuffer: {
    std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> ParentGraph =
        MQueue ? MQueue->getCommandGraph() : impl->MGraph;

    // If a parent graph is set, we are adding to or recording a subgraph,
    // and we do not want to actually execute this command-graph submission.
    if (ParentGraph) {
      ext::oneapi::experimental::detail::graph_impl::WriteLock ParentLock;
      if (MQueue) {
        ParentLock = ext::oneapi::experimental::detail::graph_impl::WriteLock(
            ParentGraph->MMutex);
      }
      impl->CGData.MRequirements = impl->MExecGraph->getRequirements();
      // Here we are using the CommandGroup without passing a CommandBuffer to
      // pass the exec_graph_impl and event dependencies. Since this subgraph CG
      // will not be executed this is fine.
      CommandGroup.reset(new sycl::detail::CGExecCommandBuffer(
          nullptr, impl->MExecGraph, std::move(impl->CGData)));

    } else {
      event GraphCompletionEvent =
          impl->MExecGraph->enqueue(MQueue, std::move(impl->CGData));
      MLastEvent = GraphCompletionEvent;
      return MLastEvent;
    }
  } break;
  case detail::CGType::CopyImage:
    CommandGroup.reset(new detail::CGCopyImage(
        MSrcPtr, MDstPtr, impl->MSrcImageDesc, impl->MDstImageDesc,
        impl->MSrcImageFormat, impl->MDstImageFormat, impl->MImageCopyFlags,
        impl->MSrcOffset, impl->MDestOffset, impl->MCopyExtent,
        std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::SemaphoreWait:
    CommandGroup.reset(new detail::CGSemaphoreWait(
        impl->MInteropSemaphoreHandle, impl->MWaitValue,
        std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::SemaphoreSignal:
    CommandGroup.reset(new detail::CGSemaphoreSignal(
        impl->MInteropSemaphoreHandle, impl->MSignalValue,
        std::move(impl->CGData), MCodeLoc));
    break;
  case detail::CGType::None:
    if (detail::ur::trace()) {
      std::cout << "WARNING: An empty command group is submitted." << std::endl;
    }

    // Empty nodes are handled by a graph like standard nodes.
    // In standard (non-graph) mode, empty nodes are not sent to the
    // scheduler, to save time.
    if (impl->MGraph || (MQueue && MQueue->getCommandGraph())) {
      CommandGroup.reset(new detail::CG(detail::CGType::None,
                                        std::move(impl->CGData), MCodeLoc));
    } else {
      detail::EventImplPtr Event = std::make_shared<sycl::detail::event_impl>();
      MLastEvent = detail::createSyclObjFromImpl<event>(Event);
      return MLastEvent;
    }
    break;
  }

  if (!CommandGroup)
    throw exception(make_error_code(errc::runtime),
                    "Internal Error. Command group cannot be constructed.");

  // If there is a graph associated with the handler we are in the explicit
  // graph mode, so we store the CG instead of submitting it to the scheduler,
  // so it can be retrieved by the graph later.
  if (impl->MGraph) {
    impl->MGraphNodeCG = std::move(CommandGroup);
    return detail::createSyclObjFromImpl<event>(
        std::make_shared<detail::event_impl>());
  }

  // If the queue has an associated graph then we need to take the CG and pass
  // it to the graph to create a node, rather than submit it to the scheduler.
  if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) {
    auto EventImpl = std::make_shared<detail::event_impl>();
    EventImpl->setSubmittedQueue(MQueue);
    std::shared_ptr<ext::oneapi::experimental::detail::node_impl> NodeImpl =
        nullptr;

    // GraphImpl is read and written in this scope so we lock this graph
    // with full privileges.
    ext::oneapi::experimental::detail::graph_impl::WriteLock Lock(
        GraphImpl->MMutex);

    ext::oneapi::experimental::node_type NodeType =
        impl->MUserFacingNodeType != ext::oneapi::experimental::node_type::empty
            ? impl->MUserFacingNodeType
            : ext::oneapi::experimental::detail::getNodeTypeFromCG(getType());

    // Create a new node in the graph representing this command-group
    if (MQueue->isInOrder()) {
      // In-order queues create implicit linear dependencies between nodes.
      // Find the last node added to the graph from this queue, so our new
      // node can set it as a predecessor.
      auto DependentNode = GraphImpl->getLastInorderNode(MQueue);

      NodeImpl = DependentNode
                     ? GraphImpl->add(NodeType, std::move(CommandGroup),
                                      {DependentNode})
                     : GraphImpl->add(NodeType, std::move(CommandGroup));

      // If we are recording an in-order queue, remember the new node so it
      // can be used as a dependency for any further nodes recorded from this
      // queue.
      GraphImpl->setLastInorderNode(MQueue, NodeImpl);
    } else {
      auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue);
      if (LastBarrierRecordedFromQueue) {
        NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup),
                                  {LastBarrierRecordedFromQueue});
      } else {
        NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup));
      }

      if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) {
        GraphImpl->setBarrierDep(MQueue, NodeImpl);
      }
    }

    // Associate an event with this new node and return the event.
    GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl);

    NodeImpl->MNDRangeUsed = impl->MNDRangeUsed;

    return detail::createSyclObjFromImpl<event>(EventImpl);
  }

  detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
      std::move(CommandGroup), std::move(MQueue), impl->MEventNeeded);

  MLastEvent = detail::createSyclObjFromImpl<event>(Event);
  return MLastEvent;
}

void handler::addReduction(const std::shared_ptr<const void> &ReduObj) {
  impl->MAuxiliaryResources.push_back(ReduObj);
}

void handler::associateWithHandlerCommon(detail::AccessorImplPtr AccImpl,
                                         int AccTarget) {
  if (getCommandGraph() &&
      static_cast<detail::SYCLMemObjT *>(AccImpl->MSYCLMemObj)
          ->needsWriteBack()) {
    throw sycl::exception(make_error_code(errc::invalid),
                          "Accessors to buffers which have write_back enabled "
                          "are not allowed to be used in command graphs.");
  }
  detail::Requirement *Req = AccImpl.get();
  if (Req->MAccessMode != sycl::access_mode::read) {
    auto SYCLMemObj = static_cast<detail::SYCLMemObjT *>(Req->MSYCLMemObj);
    SYCLMemObj->handleWriteAccessorCreation();
  }
  // Add the accessor to the list of requirements.
  if (Req->MAccessRange.size() != 0)
    impl->CGData.MRequirements.push_back(Req);
  // Store a copy of the accessor.
  impl->CGData.MAccStorage.push_back(std::move(AccImpl));
  // Add the accessor to the handler's list of associated accessors.
  // For associated accessors the index is meaningless.
  impl->MAssociatedAccesors.emplace_back(
      detail::kernel_param_kind_t::kind_accessor, Req, AccTarget, /*index*/ 0);
}

void handler::associateWithHandler(detail::AccessorBaseHost *AccBase,
                                   access::target AccTarget) {
  associateWithHandlerCommon(detail::getSyclObjImpl(*AccBase),
                             static_cast<int>(AccTarget));
}

void handler::associateWithHandler(
    detail::UnsampledImageAccessorBaseHost *AccBase, image_target AccTarget) {
  associateWithHandlerCommon(detail::getSyclObjImpl(*AccBase),
                             static_cast<int>(AccTarget));
}

void handler::associateWithHandler(
    detail::SampledImageAccessorBaseHost *AccBase, image_target AccTarget) {
  associateWithHandlerCommon(detail::getSyclObjImpl(*AccBase),
                             static_cast<int>(AccTarget));
}

static void addArgsForGlobalAccessor(detail::Requirement *AccImpl, size_t Index,
                                     size_t &IndexShift, int Size,
                                     bool IsKernelCreatedFromSource,
                                     size_t GlobalSize,
                                     std::vector<detail::ArgDesc> &Args,
                                     bool isESIMD) {
  using detail::kernel_param_kind_t;
  if (AccImpl->PerWI)
    AccImpl->resize(GlobalSize);

  Args.emplace_back(kernel_param_kind_t::kind_accessor, AccImpl, Size,
                    Index + IndexShift);

  // TODO ESIMD currently does not support offset, memory and access ranges -
  // accessor::init for an ESIMD-mode accessor has a single field, translated
  // to the single kernel argument set above.
  if (!isESIMD && !IsKernelCreatedFromSource) {
    // Dimensionality of the buffer is 1 when dimensionality of the
    // accessor is 0.
    const size_t SizeAccField =
        sizeof(size_t) * (AccImpl->MDims == 0 ? 1 : AccImpl->MDims);
    ++IndexShift;
    Args.emplace_back(kernel_param_kind_t::kind_std_layout,
                      &AccImpl->MAccessRange[0], SizeAccField,
                      Index + IndexShift);
    ++IndexShift;
    Args.emplace_back(kernel_param_kind_t::kind_std_layout,
                      &AccImpl->MMemoryRange[0], SizeAccField,
                      Index + IndexShift);
    ++IndexShift;
    Args.emplace_back(kernel_param_kind_t::kind_std_layout,
                      &AccImpl->MOffset[0], SizeAccField, Index + IndexShift);
  }
}
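
// Illustrative layout (not part of the runtime sources): for a non-ESIMD 1D
// buffer accessor whose kernel argument index is Index, the expansion above
// produces four arguments:
//
//   Args[Index + 0] : accessor pointer       (kind_accessor)
//   Args[Index + 1] : access range, size_t   (kind_std_layout)
//   Args[Index + 2] : memory range, size_t   (kind_std_layout)
//   Args[Index + 3] : offset, size_t         (kind_std_layout)
//
// IndexShift advances by 3 so subsequent arguments land after the implicit
// range/offset arguments.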

void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                         const int Size, const size_t Index, size_t &IndexShift,
                         bool IsKernelCreatedFromSource, bool IsESIMD) {
  using detail::kernel_param_kind_t;

  switch (Kind) {
  case kernel_param_kind_t::kind_std_layout:
  case kernel_param_kind_t::kind_pointer: {
    addArg(Kind, Ptr, Size, Index + IndexShift);
    break;
  }
  case kernel_param_kind_t::kind_stream: {
    // A stream contains several accessors inside.
    stream *S = static_cast<stream *>(Ptr);

    detail::AccessorBaseHost *GBufBase =
        static_cast<detail::AccessorBaseHost *>(&S->GlobalBuf);
    detail::AccessorImplPtr GBufImpl = detail::getSyclObjImpl(*GBufBase);
    detail::Requirement *GBufReq = GBufImpl.get();
    addArgsForGlobalAccessor(
        GBufReq, Index, IndexShift, Size, IsKernelCreatedFromSource,
        impl->MNDRDesc.GlobalSize.size(), impl->MArgs, IsESIMD);
    ++IndexShift;
    detail::AccessorBaseHost *GOffsetBase =
        static_cast<detail::AccessorBaseHost *>(&S->GlobalOffset);
    detail::AccessorImplPtr GOffsetImpl = detail::getSyclObjImpl(*GOffsetBase);
    detail::Requirement *GOffsetReq = GOffsetImpl.get();
    addArgsForGlobalAccessor(
        GOffsetReq, Index, IndexShift, Size, IsKernelCreatedFromSource,
        impl->MNDRDesc.GlobalSize.size(), impl->MArgs, IsESIMD);
    ++IndexShift;
    detail::AccessorBaseHost *GFlushBase =
        static_cast<detail::AccessorBaseHost *>(&S->GlobalFlushBuf);
    detail::AccessorImplPtr GFlushImpl = detail::getSyclObjImpl(*GFlushBase);
    detail::Requirement *GFlushReq = GFlushImpl.get();

    size_t GlobalSize = impl->MNDRDesc.GlobalSize.size();
    // If the work-group size was not set explicitly, it must be received
    // from a kernel attribute or set to default values.
    // For now we cannot get this attribute here,
    // so we just assume the WG size is always the default for the stream.
    // TODO adjust MNDRDesc when the device image contains the kernel attribute
    if (GlobalSize == 0) {
      // Assume that the work-group size is 1 for every dimension
      GlobalSize = impl->MNDRDesc.NumWorkGroups.size();
    }
    addArgsForGlobalAccessor(GFlushReq, Index, IndexShift, Size,
                             IsKernelCreatedFromSource, GlobalSize, impl->MArgs,
                             IsESIMD);
    ++IndexShift;
    addArg(kernel_param_kind_t::kind_std_layout, &S->FlushBufferSize,
           sizeof(S->FlushBufferSize), Index + IndexShift);

    break;
  }
  case kernel_param_kind_t::kind_accessor: {
    // For accessor arguments, Size carries information about the accessor.
    // The first 11 bits of Size encode the accessor target.
    const access::target AccTarget =
        static_cast<access::target>(Size & AccessTargetMask);
    switch (AccTarget) {
    case access::target::device:
    case access::target::constant_buffer: {
      detail::Requirement *AccImpl = static_cast<detail::Requirement *>(Ptr);
      addArgsForGlobalAccessor(
          AccImpl, Index, IndexShift, Size, IsKernelCreatedFromSource,
          impl->MNDRDesc.GlobalSize.size(), impl->MArgs, IsESIMD);
      break;
    }
    case access::target::local: {
      detail::LocalAccessorImplHost *LAcc =
          static_cast<detail::LocalAccessorImplHost *>(Ptr);

      range<3> &Size = LAcc->MSize;
      const int Dims = LAcc->MDims;
      int SizeInBytes = LAcc->MElemSize;
      for (int I = 0; I < Dims; ++I)
        SizeInBytes *= Size[I];
      // Some backends do not accept zero-sized local memory arguments, so we
      // make it a minimum allocation of 1 byte.
      SizeInBytes = std::max(SizeInBytes, 1);
      impl->MArgs.emplace_back(kernel_param_kind_t::kind_std_layout, nullptr,
                               SizeInBytes, Index + IndexShift);
      // TODO ESIMD currently does not support MSize field passing yet -
      // accessor::init for an ESIMD-mode accessor has a single field,
      // translated to the single kernel argument set above.
      if (!IsESIMD && !IsKernelCreatedFromSource) {
        ++IndexShift;
        const size_t SizeAccField = (Dims == 0 ? 1 : Dims) * sizeof(Size[0]);
        addArg(kernel_param_kind_t::kind_std_layout, &Size, SizeAccField,
               Index + IndexShift);
        ++IndexShift;
        addArg(kernel_param_kind_t::kind_std_layout, &Size, SizeAccField,
               Index + IndexShift);
        ++IndexShift;
        addArg(kernel_param_kind_t::kind_std_layout, &Size, SizeAccField,
               Index + IndexShift);
      }
      break;
    }
    case access::target::image:
    case access::target::image_array: {
      detail::Requirement *AccImpl = static_cast<detail::Requirement *>(Ptr);
      addArg(Kind, AccImpl, Size, Index + IndexShift);
      if (!IsKernelCreatedFromSource) {
        // TODO Handle additional kernel arguments for the image class
        // if the compiler front-end adds them.
      }
      break;
    }
    case access::target::host_image:
    case access::target::host_buffer: {
      throw sycl::exception(make_error_code(errc::invalid),
                            "Unsupported accessor target case.");
      break;
    }
    }
    break;
  }
  case kernel_param_kind_t::kind_sampler: {
    addArg(kernel_param_kind_t::kind_sampler, Ptr, sizeof(sampler),
           Index + IndexShift);
    break;
  }
  case kernel_param_kind_t::kind_specialization_constants_buffer: {
    addArg(kernel_param_kind_t::kind_specialization_constants_buffer, Ptr, Size,
           Index + IndexShift);
    break;
  }
  case kernel_param_kind_t::kind_invalid:
    throw exception(make_error_code(errc::invalid),
                    "Invalid kernel param kind");
    break;
  }
}

// An argument can take up more space to store additional information about
// MAccessRange, MMemoryRange, and MOffset added with addArgsForGlobalAccessor.
// We use the worst-case estimate because the lifetime of the vector is short.
// In processArg the kind_stream case introduces the maximum number of
// additional arguments: it adds 12 additional arguments to the currently
// processed argument, hence the worst-case estimate is 12+1=13.
// TODO: the constant can be removed if the size of MArgs is calculated at
// compile time.
inline constexpr size_t MaxNumAdditionalArgs = 13;
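
// Illustrative arithmetic (not part of the runtime sources): in the
// kind_stream case a stream expands into three global accessors (GlobalBuf,
// GlobalOffset, GlobalFlushBuf), each contributing up to 4 arguments via
// addArgsForGlobalAccessor (pointer plus access range, memory range, and
// offset), plus one argument for FlushBufferSize: 3 * 4 + 1 = 13.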

void handler::extractArgsAndReqs() {
  assert(MKernel && "MKernel is not initialized");
  std::vector<detail::ArgDesc> UnPreparedArgs = std::move(impl->MArgs);
  clearArgs();

  std::sort(
      UnPreparedArgs.begin(), UnPreparedArgs.end(),
      [](const detail::ArgDesc &first, const detail::ArgDesc &second) -> bool {
        return (first.MIndex < second.MIndex);
      });

  const bool IsKernelCreatedFromSource = MKernel->isCreatedFromSource();
  impl->MArgs.reserve(MaxNumAdditionalArgs * UnPreparedArgs.size());

  size_t IndexShift = 0;
  for (size_t I = 0; I < UnPreparedArgs.size(); ++I) {
    void *Ptr = UnPreparedArgs[I].MPtr;
    const detail::kernel_param_kind_t &Kind = UnPreparedArgs[I].MType;
    const int &Size = UnPreparedArgs[I].MSize;
    const int Index = UnPreparedArgs[I].MIndex;
    processArg(Ptr, Kind, Size, Index, IndexShift, IsKernelCreatedFromSource,
               false);
  }
}

void handler::extractArgsAndReqsFromLambda(
    char *LambdaPtr, size_t KernelArgsNum,
    const detail::kernel_param_desc_t *KernelArgs, bool IsESIMD) {
  const bool IsKernelCreatedFromSource = false;
  size_t IndexShift = 0;
  impl->MArgs.reserve(MaxNumAdditionalArgs * KernelArgsNum);

  for (size_t I = 0; I < KernelArgsNum; ++I) {
    void *Ptr = LambdaPtr + KernelArgs[I].offset;
    const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
    const int &Size = KernelArgs[I].info;
    if (Kind == detail::kernel_param_kind_t::kind_accessor) {
      // For accessor arguments, Size carries information about the accessor.
      // The first 11 bits of Size encode the accessor target.
      const access::target AccTarget =
          static_cast<access::target>(Size & AccessTargetMask);
      if ((AccTarget == access::target::device ||
           AccTarget == access::target::constant_buffer) ||
          (AccTarget == access::target::image ||
           AccTarget == access::target::image_array)) {
        detail::AccessorBaseHost *AccBase =
            static_cast<detail::AccessorBaseHost *>(Ptr);
        Ptr = detail::getSyclObjImpl(*AccBase).get();
      } else if (AccTarget == access::target::local) {
        detail::LocalAccessorBaseHost *LocalAccBase =
            static_cast<detail::LocalAccessorBaseHost *>(Ptr);
        Ptr = detail::getSyclObjImpl(*LocalAccBase).get();
      }
    }
    processArg(Ptr, Kind, Size, I, IndexShift, IsKernelCreatedFromSource,
               IsESIMD);
  }
}

// Calling methods of kernel_impl requires knowledge of the class layout.
// As this is impossible in a header, this function calls the necessary
// method inside the library and returns the result.
detail::string handler::getKernelName() {
  return detail::string{MKernel->get_info<info::kernel::function_name>()};
}

void handler::verifyUsedKernelBundleInternal(detail::string_view KernelName) {
  auto UsedKernelBundleImplPtr =
      getOrInsertHandlerKernelBundle(/*Insert=*/false);
  if (!UsedKernelBundleImplPtr)
    return;

  // Implicit kernel bundles are populated late so we ignore them
  if (!impl->isStateExplicitKernelBundle())
    return;

  kernel_id KernelID = detail::get_kernel_id_impl(KernelName);
  device Dev =
      impl->MGraph ? impl->MGraph->getDevice() : MQueue->get_device();
  if (!UsedKernelBundleImplPtr->has_kernel(KernelID, Dev))
    throw sycl::exception(
        make_error_code(errc::kernel_not_supported),
        "The kernel bundle in use does not contain the kernel");
}
903 
904 void handler::ext_oneapi_barrier(const std::vector<event> &WaitList) {
905  throwIfActionIsCreated();
907  impl->MEventsWaitWithBarrier.reserve(WaitList.size());
908  for (auto &Event : WaitList) {
909  auto EventImpl = detail::getSyclObjImpl(Event);
910  // We could not wait for host task events in backend.
911  // Adding them as dependency to enable proper scheduling.
912  if (EventImpl->isHost()) {
913  depends_on(EventImpl);
914  }
915  impl->MEventsWaitWithBarrier.push_back(EventImpl);
916  }
917 }

using namespace sycl::detail;
bool handler::DisableRangeRounding() {
  return SYCLConfig<SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING>::get();
}

bool handler::RangeRoundingTrace() {
  return SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE>::get();
}

void handler::GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
                                       size_t &MinRange) {
  SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS>::GetSettings(
      MinFactor, GoodFactor, MinRange);
}

void handler::memcpy(void *Dest, const void *Src, size_t Count) {
  throwIfActionIsCreated();
  MSrcPtr = const_cast<void *>(Src);
  MDstPtr = Dest;
  MLength = Count;
  setType(detail::CGType::CopyUSM);
}
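
// Illustrative usage sketch (not part of the runtime sources): the handler
// only records the operation; the copy is enqueued when the command group is
// finalized. The queue q and USM pointers Dst/Src are assumed to exist.
//
//   q.submit([&](sycl::handler &cgh) {
//     cgh.memcpy(Dst, Src, NumBytes); // records a CGType::CopyUSM command
//   });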

void handler::memset(void *Dest, int Value, size_t Count) {
  throwIfActionIsCreated();
  MDstPtr = Dest;
  MPattern.push_back(static_cast<char>(Value));
  MLength = Count;
  setUserFacingNodeType(ext::oneapi::experimental::node_type::memset);
  setType(detail::CGType::FillUSM);
}

void handler::prefetch(const void *Ptr, size_t Count) {
  throwIfActionIsCreated();
  MDstPtr = const_cast<void *>(Ptr);
  MLength = Count;
  setType(detail::CGType::PrefetchUSM);
}

void handler::mem_advise(const void *Ptr, size_t Count, int Advice) {
  throwIfActionIsCreated();
  MDstPtr = const_cast<void *>(Ptr);
  MLength = Count;
  impl->MAdvice = static_cast<ur_usm_advice_flags_t>(Advice);
  setType(detail::CGType::AdviseUSM);
}

void handler::fill_impl(void *Dest, const void *Value, size_t ValueSize,
                        size_t Count) {
  MDstPtr = Dest;
  MPattern.resize(ValueSize);
  std::memcpy(MPattern.data(), Value, ValueSize);
  MLength = Count * ValueSize;
  setType(detail::CGType::FillUSM);
}

void handler::ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch,
                                       const void *Src, size_t SrcPitch,
                                       size_t Width, size_t Height) {
  // Checks done in callers.
  MSrcPtr = const_cast<void *>(Src);
  MDstPtr = Dest;
  impl->MSrcPitch = SrcPitch;
  impl->MDstPitch = DestPitch;
  impl->MWidth = Width;
  impl->MHeight = Height;
  setType(detail::CGType::Copy2DUSM);
}

void handler::ext_oneapi_fill2d_impl(void *Dest, size_t DestPitch,
                                     const void *Value, size_t ValueSize,
                                     size_t Width, size_t Height) {
  // Checks done in callers.
  MDstPtr = Dest;
  MPattern.resize(ValueSize);
  std::memcpy(MPattern.data(), Value, ValueSize);
  impl->MDstPitch = DestPitch;
  impl->MWidth = Width;
  impl->MHeight = Height;
  setType(detail::CGType::Fill2DUSM);
}

void handler::ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value,
                                       size_t Width, size_t Height) {
  // Checks done in callers.
  MDstPtr = Dest;
  MPattern.push_back(static_cast<unsigned char>(Value));
  impl->MDstPitch = DestPitch;
  impl->MWidth = Width;
  impl->MHeight = Height;
  setType(detail::CGType::Memset2DUSM);
}

void handler::ext_oneapi_copy(
    const void *Src, ext::oneapi::experimental::image_mem_handle Dest,
    const ext::oneapi::experimental::image_descriptor &Desc) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  Desc.verify();

  MSrcPtr = const_cast<void *>(Src);
  MDstPtr = reinterpret_cast<void *>(Dest.raw_handle);

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = Desc.width;
  UrDesc.height = Desc.height;
  UrDesc.depth = Desc.depth;
  UrDesc.arraySize = Desc.array_size;

  if (Desc.array_size > 1) {
    // Image Array.
    UrDesc.type =
        Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type =
        Desc.type == sycl::ext::oneapi::experimental::image_type::cubemap
            ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
            : UrDesc.type;
  } else {
    UrDesc.type = Desc.depth > 0 ? UR_MEM_TYPE_IMAGE3D
                                 : (Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                                    : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(Desc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          Desc.num_channels));

  impl->MSrcOffset = {0, 0, 0};
  impl->MDestOffset = {0, 0, 0};
  impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth};
  impl->MSrcImageDesc = UrDesc;
  impl->MDstImageDesc = UrDesc;
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MImageCopyFlags = UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE;
  setType(detail::CGType::CopyImage);
}

void handler::ext_oneapi_copy(
    const void *Src, sycl::range<3> SrcOffset, sycl::range<3> SrcExtent,
    ext::oneapi::experimental::image_mem_handle Dest, sycl::range<3> DestOffset,
    const ext::oneapi::experimental::image_descriptor &DestImgDesc,
    sycl::range<3> CopyExtent) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  DestImgDesc.verify();

  MSrcPtr = const_cast<void *>(Src);
  MDstPtr = reinterpret_cast<void *>(Dest.raw_handle);

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = DestImgDesc.width;
  UrDesc.height = DestImgDesc.height;
  UrDesc.depth = DestImgDesc.depth;
  UrDesc.arraySize = DestImgDesc.array_size;

  if (DestImgDesc.array_size > 1) {
    // Image Array.
    UrDesc.type = DestImgDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY
                                         : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type =
        DestImgDesc.type == sycl::ext::oneapi::experimental::image_type::cubemap
            ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
            : UrDesc.type;
  } else {
    UrDesc.type = DestImgDesc.depth > 0
                      ? UR_MEM_TYPE_IMAGE3D
                      : (DestImgDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                                : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(DestImgDesc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          DestImgDesc.num_channels));

  impl->MSrcOffset = {SrcOffset[0], SrcOffset[1], SrcOffset[2]};
  impl->MDestOffset = {DestOffset[0], DestOffset[1], DestOffset[2]};
  impl->MCopyExtent = {CopyExtent[0], CopyExtent[1], CopyExtent[2]};
  impl->MSrcImageDesc = UrDesc;
  impl->MSrcImageDesc.width = SrcExtent[0];
  impl->MSrcImageDesc.height = SrcExtent[1];
  impl->MSrcImageDesc.depth = SrcExtent[2];
  impl->MDstImageDesc = UrDesc;
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MImageCopyFlags = UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE;
  setType(detail::CGType::CopyImage);
}

void handler::ext_oneapi_copy(
    const ext::oneapi::experimental::image_mem_handle Src, void *Dest,
    const ext::oneapi::experimental::image_descriptor &Desc) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  Desc.verify();

  MSrcPtr = reinterpret_cast<void *>(Src.raw_handle);
  MDstPtr = Dest;

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = Desc.width;
  UrDesc.height = Desc.height;
  UrDesc.depth = Desc.depth;
  UrDesc.arraySize = Desc.array_size;

  if (Desc.array_size > 1) {
    // Image Array.
    UrDesc.type =
        Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type =
        Desc.type == sycl::ext::oneapi::experimental::image_type::cubemap
            ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
            : UrDesc.type;
  } else {
    UrDesc.type = Desc.depth > 0 ? UR_MEM_TYPE_IMAGE3D
                                 : (Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                                    : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(Desc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          Desc.num_channels));

  impl->MSrcOffset = {0, 0, 0};
  impl->MDestOffset = {0, 0, 0};
  impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth};
  impl->MSrcImageDesc = UrDesc;
  impl->MDstImageDesc = UrDesc;
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MImageCopyFlags = UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST;
  setType(detail::CGType::CopyImage);
}

void handler::ext_oneapi_copy(
    const ext::oneapi::experimental::image_mem_handle Src,
    ext::oneapi::experimental::image_mem_handle Dest,
    const ext::oneapi::experimental::image_descriptor &ImageDesc) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  ImageDesc.verify();

  MSrcPtr = reinterpret_cast<void *>(Src.raw_handle);
  MDstPtr = reinterpret_cast<void *>(Dest.raw_handle);

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = ImageDesc.width;
  UrDesc.height = ImageDesc.height;
  UrDesc.depth = ImageDesc.depth;
  UrDesc.arraySize = ImageDesc.array_size;
  if (ImageDesc.array_size > 1) {
    // Image Array.
    UrDesc.type = ImageDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY
                                       : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type =
        ImageDesc.type == sycl::ext::oneapi::experimental::image_type::cubemap
            ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
            : UrDesc.type;
  } else {
    UrDesc.type = ImageDesc.depth > 0
                      ? UR_MEM_TYPE_IMAGE3D
                      : (ImageDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                              : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(ImageDesc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          ImageDesc.num_channels));

  impl->MSrcOffset = {0, 0, 0};
  impl->MDestOffset = {0, 0, 0};
  impl->MCopyExtent = {ImageDesc.width, ImageDesc.height, ImageDesc.depth};
  impl->MSrcImageDesc = UrDesc;
  impl->MDstImageDesc = UrDesc;
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MImageCopyFlags = UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE;
  setType(detail::CGType::CopyImage);
}

void handler::ext_oneapi_copy(
    const ext::oneapi::experimental::image_mem_handle Src,
    sycl::range<3> SrcOffset,
    const ext::oneapi::experimental::image_descriptor &SrcImgDesc, void *Dest,
    sycl::range<3> DestOffset, sycl::range<3> DestExtent,
    sycl::range<3> CopyExtent) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  SrcImgDesc.verify();

  MSrcPtr = reinterpret_cast<void *>(Src.raw_handle);
  MDstPtr = Dest;

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = SrcImgDesc.width;
  UrDesc.height = SrcImgDesc.height;
  UrDesc.depth = SrcImgDesc.depth;
  UrDesc.arraySize = SrcImgDesc.array_size;

  if (SrcImgDesc.array_size > 1) {
    // Image Array.
    UrDesc.type = SrcImgDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY
                                        : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type =
        SrcImgDesc.type == sycl::ext::oneapi::experimental::image_type::cubemap
            ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
            : UrDesc.type;
  } else {
    UrDesc.type = SrcImgDesc.depth > 0
                      ? UR_MEM_TYPE_IMAGE3D
                      : (SrcImgDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                               : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(SrcImgDesc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          SrcImgDesc.num_channels));

  impl->MSrcOffset = {SrcOffset[0], SrcOffset[1], SrcOffset[2]};
  impl->MDestOffset = {DestOffset[0], DestOffset[1], DestOffset[2]};
  impl->MCopyExtent = {CopyExtent[0], CopyExtent[1], CopyExtent[2]};
  impl->MSrcImageDesc = UrDesc;
  impl->MDstImageDesc = UrDesc;
  impl->MDstImageDesc.width = DestExtent[0];
  impl->MDstImageDesc.height = DestExtent[1];
  impl->MDstImageDesc.depth = DestExtent[2];
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MImageCopyFlags = UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST;
  setType(detail::CGType::CopyImage);
}

void handler::ext_oneapi_copy(
    const void *Src, void *Dest,
    const ext::oneapi::experimental::image_descriptor &Desc, size_t Pitch) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  Desc.verify();

  MSrcPtr = const_cast<void *>(Src);
  MDstPtr = Dest;

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = Desc.width;
  UrDesc.height = Desc.height;
  UrDesc.depth = Desc.depth;
  UrDesc.arraySize = Desc.array_size;

  if (Desc.array_size > 1) {
    // Image Array.
    UrDesc.type =
        Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type =
        Desc.type == sycl::ext::oneapi::experimental::image_type::cubemap
            ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
            : UrDesc.type;
  } else {
    UrDesc.type = Desc.depth > 0 ? UR_MEM_TYPE_IMAGE3D
                                 : (Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                                    : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(Desc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          Desc.num_channels));

  impl->MSrcOffset = {0, 0, 0};
  impl->MDestOffset = {0, 0, 0};
  impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth};
  impl->MSrcImageDesc = UrDesc;
  impl->MDstImageDesc = UrDesc;
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MSrcImageDesc.rowPitch = Pitch;
  impl->MDstImageDesc.rowPitch = Pitch;
  impl->MImageCopyFlags = detail::getUrImageCopyFlags(
      get_pointer_type(Src, MQueue->get_context()),
      get_pointer_type(Dest, MQueue->get_context()));
  setType(detail::CGType::CopyImage);
}

void handler::ext_oneapi_copy(
    const void *Src, sycl::range<3> SrcOffset, void *Dest,
    sycl::range<3> DestOffset,
    const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
    size_t DeviceRowPitch, sycl::range<3> HostExtent,
    sycl::range<3> CopyExtent) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  DeviceImgDesc.verify();

  MSrcPtr = const_cast<void *>(Src);
  MDstPtr = Dest;

  ur_image_desc_t UrDesc = {};
  UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC;
  UrDesc.width = DeviceImgDesc.width;
  UrDesc.height = DeviceImgDesc.height;
  UrDesc.depth = DeviceImgDesc.depth;
  UrDesc.arraySize = DeviceImgDesc.array_size;

  if (DeviceImgDesc.array_size > 1) {
    // Image Array.
    UrDesc.type = DeviceImgDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D_ARRAY
                                           : UR_MEM_TYPE_IMAGE1D_ARRAY;

    // Cubemap.
    UrDesc.type = DeviceImgDesc.type ==
                          sycl::ext::oneapi::experimental::image_type::cubemap
                      ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP
                      : UrDesc.type;
  } else {
    UrDesc.type = DeviceImgDesc.depth > 0
                      ? UR_MEM_TYPE_IMAGE3D
                      : (DeviceImgDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D
                                                  : UR_MEM_TYPE_IMAGE1D);
  }

  ur_image_format_t UrFormat;
  UrFormat.channelType =
      sycl::detail::convertChannelType(DeviceImgDesc.channel_type);
  UrFormat.channelOrder = sycl::detail::convertChannelOrder(
      ext::oneapi::experimental::detail::get_image_default_channel_order(
          DeviceImgDesc.num_channels));

  impl->MSrcOffset = {SrcOffset[0], SrcOffset[1], SrcOffset[2]};
  impl->MDestOffset = {DestOffset[0], DestOffset[1], DestOffset[2]};
  impl->MCopyExtent = {CopyExtent[0], CopyExtent[1], CopyExtent[2]};
  impl->MSrcImageFormat = UrFormat;
  impl->MDstImageFormat = UrFormat;
  impl->MImageCopyFlags = detail::getUrImageCopyFlags(
      get_pointer_type(Src, MQueue->get_context()),
      get_pointer_type(Dest, MQueue->get_context()));
  impl->MSrcImageDesc = UrDesc;
  impl->MDstImageDesc = UrDesc;

  // Fill the descriptor row pitch and host extent based on the type of copy.
  if (impl->MImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) {
    impl->MDstImageDesc.rowPitch = DeviceRowPitch;
    impl->MSrcImageDesc.rowPitch = 0;
    impl->MSrcImageDesc.width = HostExtent[0];
    impl->MSrcImageDesc.height = HostExtent[1];
    impl->MSrcImageDesc.depth = HostExtent[2];
  } else if (impl->MImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) {
    impl->MSrcImageDesc.rowPitch = DeviceRowPitch;
    impl->MDstImageDesc.rowPitch = 0;
    impl->MDstImageDesc.width = HostExtent[0];
    impl->MDstImageDesc.height = HostExtent[1];
    impl->MDstImageDesc.depth = HostExtent[2];
  } else {
    impl->MDstImageDesc.rowPitch = DeviceRowPitch;
    impl->MSrcImageDesc.rowPitch = DeviceRowPitch;
  }

  setType(detail::CGType::CopyImage);
}
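
// Illustrative sketch (not part of the runtime sources): for a host-to-device
// pitched copy, the branch above leaves the row pitch only on the device-side
// (destination) descriptor and resizes the source descriptor to HostExtent:
//
//   MImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE
//   MDstImageDesc.rowPitch == DeviceRowPitch
//   MSrcImageDesc.rowPitch == 0
//   MSrcImageDesc.{width, height, depth} == HostExtent[0..2]
//
// The device-to-host case mirrors this, and the device-to-device case applies
// DeviceRowPitch to both descriptors.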

void handler::ext_oneapi_wait_external_semaphore(
    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  if (SemaphoreHandle.handle_type !=
          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
              opaque_fd &&
      SemaphoreHandle.handle_type !=
          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
              win32_nt_handle) {
    throw sycl::exception(
        make_error_code(errc::invalid),
        "Invalid type of semaphore for this operation. The "
        "type of semaphore used needs a user passed wait value.");
  }
  impl->MInteropSemaphoreHandle =
      (ur_exp_interop_semaphore_handle_t)SemaphoreHandle.raw_handle;
  impl->MWaitValue = {};
  setType(detail::CGType::SemaphoreWait);
}

void handler::ext_oneapi_wait_external_semaphore(
    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
    uint64_t WaitValue) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  if (SemaphoreHandle.handle_type !=
      sycl::ext::oneapi::experimental::external_semaphore_handle_type::
          win32_nt_dx12_fence) {
    throw sycl::exception(
        make_error_code(errc::invalid),
        "Invalid type of semaphore for this operation. The "
        "type of semaphore does not support user passed wait values.");
  }
  impl->MInteropSemaphoreHandle =
      (ur_exp_interop_semaphore_handle_t)SemaphoreHandle.raw_handle;
  impl->MWaitValue = WaitValue;
  setType(detail::CGType::SemaphoreWait);
}

void handler::ext_oneapi_signal_external_semaphore(
    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  if (SemaphoreHandle.handle_type !=
          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
              opaque_fd &&
      SemaphoreHandle.handle_type !=
          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
              win32_nt_handle) {
    throw sycl::exception(
        make_error_code(errc::invalid),
        "Invalid type of semaphore for this operation. The "
        "type of semaphore used needs a user passed signal value.");
  }
  impl->MInteropSemaphoreHandle =
      (ur_exp_interop_semaphore_handle_t)SemaphoreHandle.raw_handle;
  impl->MSignalValue = {};
  setType(detail::CGType::SemaphoreSignal);
}

void handler::ext_oneapi_signal_external_semaphore(
    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
    uint64_t SignalValue) {
  throwIfGraphAssociated<
      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
          sycl_ext_oneapi_bindless_images>();
  if (SemaphoreHandle.handle_type !=
      sycl::ext::oneapi::experimental::external_semaphore_handle_type::
          win32_nt_dx12_fence) {
    throw sycl::exception(
        make_error_code(errc::invalid),
        "Invalid type of semaphore for this operation. The "
        "type of semaphore does not support user passed signal values.");
  }
  impl->MInteropSemaphoreHandle =
      (ur_exp_interop_semaphore_handle_t)SemaphoreHandle.raw_handle;
  impl->MSignalValue = SignalValue;
  setType(detail::CGType::SemaphoreSignal);
}

void handler::use_kernel_bundle(
    const kernel_bundle<bundle_state::executable> &ExecBundle) {
  std::shared_ptr<detail::queue_impl> PrimaryQueue =
      impl->MSubmissionPrimaryQueue;
  if ((!impl->MGraph &&
       (PrimaryQueue->get_context() != ExecBundle.get_context())) ||
      (impl->MGraph &&
       (impl->MGraph->getContext() != ExecBundle.get_context())))
    throw sycl::exception(
        make_error_code(errc::invalid),
        "Context associated with the primary queue is different from the "
        "context associated with the kernel bundle");

  std::shared_ptr<detail::queue_impl> SecondaryQueue =
      impl->MSubmissionSecondaryQueue;
  if (SecondaryQueue &&
      SecondaryQueue->get_context() != ExecBundle.get_context())
    throw sycl::exception(
        make_error_code(errc::invalid),
        "Context associated with the secondary queue is different from the "
        "context associated with the kernel bundle");

  setStateExplicitKernelBundle();
  setHandlerKernelBundle(detail::getSyclObjImpl(ExecBundle));
}

void handler::depends_on(event Event) {
  auto EventImpl = detail::getSyclObjImpl(Event);
  depends_on(EventImpl);
}

void handler::depends_on(const std::vector<event> &Events) {
  for (const event &Event : Events) {
    depends_on(Event);
  }
}

void handler::depends_on(const detail::EventImplPtr &EventImpl) {
  if (!EventImpl)
    return;
  if (EventImpl->isDiscarded()) {
    throw sycl::exception(make_error_code(errc::invalid),
                          "Queue operation cannot depend on discarded event.");
  }
  if (auto Graph = getCommandGraph(); Graph) {
    auto EventGraph = EventImpl->getCommandGraph();
    if (EventGraph == nullptr) {
      throw sycl::exception(
          make_error_code(errc::invalid),
          "Graph nodes cannot depend on events from outside the graph.");
    }
    if (EventGraph != Graph) {
      throw sycl::exception(
          make_error_code(errc::invalid),
          "Graph nodes cannot depend on events from another graph.");
    }
  }
  impl->CGData.MEvents.push_back(EventImpl);
}
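
// Illustrative sketch (not part of the runtime sources): while a queue is
// being recorded into a graph, a node may only depend on events produced by
// nodes of the same graph. Names g, q, and e below are assumptions.
//
//   sycl::event e = q.submit(...); // event created outside the graph
//   g.begin_recording(q);
//   q.submit([&](sycl::handler &cgh) {
//     cgh.depends_on(e); // throws: event comes from outside the graph
//     ...
//   });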

static bool
checkContextSupports(const std::shared_ptr<detail::context_impl> &ContextImpl,
                     ur_context_info_t InfoQuery) {
  auto &Plugin = ContextImpl->getPlugin();
  ur_bool_t SupportsOp = false;
  Plugin->call(urContextGetInfo, ContextImpl->getHandleRef(), InfoQuery,
               sizeof(ur_bool_t), &SupportsOp, nullptr);
  return SupportsOp;
}
1578 
1579 void handler::verifyDeviceHasProgressGuarantee(
1584  using forward_progress =
1586  auto deviceImplPtr = MQueue->getDeviceImplPtr();
1587  const bool supported = deviceImplPtr->supportsForwardProgress(
1588  guarantee, threadScope, coordinationScope);
1589  if (threadScope == execution_scope::work_group) {
1590  if (!supported) {
1591  throw sycl::exception(
1592  sycl::errc::feature_not_supported,
1593  "Required progress guarantee for work groups is not "
1594  "supported by this device.");
1595  }
1596  // If we are here, the device supports the guarantee required but there is a
1597  // caveat in that if the guarantee required is a concurrent guarantee, then
1598  // we most likely also need to enable cooperative launch of the kernel. That
1599  // is, although the device supports the required guarantee, some setup work
1600  // is needed to truly make the device provide that guarantee at runtime.
1601  // Otherwise, we will get the default guarantee which is weaker than
1602  // concurrent. Same reasoning applies for sub_group but not for work_item.
1603  // TODO: Further design work is probably needed to reflect this behavior in
1604  // Unified Runtime.
1605  if (guarantee == forward_progress::concurrent)
1606  setKernelIsCooperative(true);
1607  } else if (threadScope == execution_scope::sub_group) {
1608  if (!supported) {
1609  throw sycl::exception(sycl::errc::feature_not_supported,
1610  "Required progress guarantee for sub groups is not "
1611  "supported by this device.");
1612  }
1613  // Same reasoning as above.
1614  if (guarantee == forward_progress::concurrent)
1615  setKernelIsCooperative(true);
1616  } else { // threadScope is execution_scope::work_item; any other value is
1617  // undefined behavior
1618  if (!supported) {
1619  throw sycl::exception(sycl::errc::feature_not_supported,
1620  "Required progress guarantee for work items is not "
1621  "supported by this device.");
1622  }
1623  }
1624 }
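// Illustrative sketch (property names taken from the proposed
// sycl_ext_oneapi_forward_progress extension; treat all names as
// assumptions): a kernel that requests concurrent work-group progress
// reaches this check and, per the comment above, is also switched to a
// cooperative launch:
//
//   namespace syclex = sycl::ext::oneapi::experimental;
//   syclex::properties Props{
//       syclex::work_group_progress<
//           syclex::forward_progress_guarantee::concurrent,
//           syclex::execution_scope::root_group>};
//   Q.parallel_for(sycl::nd_range<1>{Global, Local}, Props,
//                  [=](sycl::nd_item<1>) { /* cooperative algorithm */ });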
1625 
1626 bool handler::supportsUSMMemcpy2D() {
1627  for (const std::shared_ptr<detail::queue_impl> &QueueImpl :
1628  {impl->MSubmissionPrimaryQueue, impl->MSubmissionSecondaryQueue}) {
1629  if (QueueImpl &&
1630  !checkContextSupports(QueueImpl->getContextImplPtr(),
1631  UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT))
1632  return false;
1633  }
1634  return true;
1635 }
1636 
1637 bool handler::supportsUSMFill2D() {
1638  for (const std::shared_ptr<detail::queue_impl> &QueueImpl :
1639  {impl->MSubmissionPrimaryQueue, impl->MSubmissionSecondaryQueue}) {
1640  if (QueueImpl && !checkContextSupports(QueueImpl->getContextImplPtr(),
1641  UR_CONTEXT_INFO_USM_FILL2D_SUPPORT))
1642  return false;
1643  }
1644  return true;
1645 }
1646 
1647 bool handler::supportsUSMMemset2D() {
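  // Note: UR exposes no separate memset2D capability query; 2D memset is
  // carried out in terms of 2D fill, so the fill2D query below stands in
  // for memset2D support.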
1648  for (const std::shared_ptr<detail::queue_impl> &QueueImpl :
1649  {impl->MSubmissionPrimaryQueue, impl->MSubmissionSecondaryQueue}) {
1650  if (QueueImpl && !checkContextSupports(QueueImpl->getContextImplPtr(),
1651  UR_CONTEXT_INFO_USM_FILL2D_SUPPORT))
1652  return false;
1653  }
1654  return true;
1655 }
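// Illustrative sketch of the queue-level 2D USM operations these capability
// checks back (method names from the sycl_ext_oneapi_memcpy2d extension;
// pointers, pitches, and extents are placeholders):
//
//   Q.ext_oneapi_memcpy2d(DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height);
//   Q.ext_oneapi_memset2d(DstPtr, DstPitch, /*Value=*/0, Width, Height);
//   Q.ext_oneapi_fill2d(DstPtr, DstPitch, Pattern, Width, Height);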
1656 
1657 id<2> handler::computeFallbackKernelBounds(size_t Width, size_t Height) {
1658  device Dev = MQueue->get_device();
1659  range<2> ItemLimit = Dev.get_info<info::device::max_work_item_sizes<2>>() *
1660  Dev.get_info<info::device::max_compute_units>();
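  // Dimension 0 of the resulting id is the row bound (Height) and dimension
  // 1 the column bound (Width), matching SYCL's row-major indexing.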
1661  return id<2>{std::min(ItemLimit[0], Height), std::min(ItemLimit[1], Width)};
1662 }
1663 
1664 backend handler::getDeviceBackend() const {
1665  if (impl->MGraph)
1666  return impl->MGraph->getDevice().get_backend();
1667  else
1668  return MQueue->getDeviceImplPtr()->getBackend();
1669 }
1670 
1671 void handler::ext_intel_read_host_pipe(detail::string_view Name, void *Ptr,
1672  size_t Size, bool Block) {
1673  impl->HostPipeName = Name.data();
1674  impl->HostPipePtr = Ptr;
1675  impl->HostPipeTypeSize = Size;
1676  impl->HostPipeBlocking = Block;
1677  impl->HostPipeRead = 1;
1678  setType(detail::CGType::ReadWriteHostPipe);
1679 }
1680 
1681 void handler::ext_intel_write_host_pipe(detail::string_view Name, void *Ptr,
1682  size_t Size, bool Block) {
1683  impl->HostPipeName = Name.data();
1684  impl->HostPipePtr = Ptr;
1685  impl->HostPipeTypeSize = Size;
1686  impl->HostPipeBlocking = Block;
1687  impl->HostPipeRead = 0;
1688  setType(detail::CGType::ReadWriteHostPipe);
1689 }
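// Illustrative sketch of the host-pipe entry points above (pipe API from the
// experimental Intel FPGA host-pipes extension; names are assumptions):
//
//   using DataPipe = sycl::ext::intel::experimental::pipe<class PipeId, int>;
//   DataPipe::write(Q, 42);      // funnels into ext_intel_write_host_pipe
//   int V = DataPipe::read(Q);   // funnels into ext_intel_read_host_pipe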
1690 
1691 void handler::memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src,
1692  bool IsDeviceImageScoped, size_t NumBytes,
1693  size_t Offset) {
1694  throwIfActionIsCreated();
1695  MSrcPtr = const_cast<void *>(Src);
1696  MDstPtr = const_cast<void *>(DeviceGlobalPtr);
1697  impl->MIsDeviceImageScoped = IsDeviceImageScoped;
1698  MLength = NumBytes;
1699  impl->MOffset = Offset;
1700  setType(detail::CGType::CopyToDeviceGlobal);
1701 }
1702 
1703 void handler::memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
1704  bool IsDeviceImageScoped, size_t NumBytes,
1705  size_t Offset) {
1706  throwIfActionIsCreated();
1707  MSrcPtr = const_cast<void *>(DeviceGlobalPtr);
1708  MDstPtr = Dest;
1709  impl->MIsDeviceImageScoped = IsDeviceImageScoped;
1710  MLength = NumBytes;
1711  impl->MOffset = Offset;
1712  setType(detail::CGType::CopyFromDeviceGlobal);
1713 }
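// Illustrative sketch: these two entry points back the device_global copy
// API from sycl_ext_oneapi_device_global (DG and Q are placeholders):
//
//   namespace syclex = sycl::ext::oneapi::experimental;
//   static syclex::device_global<int> DG; // must be namespace/static scope
//   int In = 42, Out = 0;
//   Q.memcpy(DG, &In).wait();  // -> memcpyToDeviceGlobal
//   Q.memcpy(&Out, DG).wait(); // -> memcpyFromDeviceGlobal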
1714 
1715 void handler::memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
1716  const void *Src,
1717  size_t DeviceGlobalTSize,
1718  bool IsDeviceImageScoped,
1719  size_t NumBytes, size_t Offset) {
1720  std::weak_ptr<detail::context_impl> WeakContextImpl =
1721  MQueue->getContextImplPtr();
1722  std::weak_ptr<detail::device_impl> WeakDeviceImpl =
1723  MQueue->getDeviceImplPtr();
1724  host_task([=] {
1725  // Capture context and device as weak to avoid keeping them alive for too
1726  // long. If they are dead by the time this executes, the operation would not
1727  // have been visible anyway.
1728  std::shared_ptr<detail::context_impl> ContextImpl = WeakContextImpl.lock();
1729  std::shared_ptr<detail::device_impl> DeviceImpl = WeakDeviceImpl.lock();
1730  if (ContextImpl && DeviceImpl)
1731  ContextImpl->memcpyToHostOnlyDeviceGlobal(
1732  DeviceImpl, DeviceGlobalPtr, Src, DeviceGlobalTSize,
1733  IsDeviceImageScoped, NumBytes, Offset);
1734  });
1735 }
1736 
1737 void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest,
1738  const void *DeviceGlobalPtr,
1739  bool IsDeviceImageScoped,
1740  size_t NumBytes, size_t Offset) {
1741  const std::shared_ptr<detail::context_impl> &ContextImpl =
1742  MQueue->getContextImplPtr();
1743  const std::shared_ptr<detail::device_impl> &DeviceImpl =
1744  MQueue->getDeviceImplPtr();
1745  host_task([=] {
1746  // Unlike memcpy to device_global, we need to keep the context and device
1747  // alive in the capture of this operation as we must be able to correctly
1748  // copy the value to the user-specified pointer.
1749  ContextImpl->memcpyFromHostOnlyDeviceGlobal(
1750  DeviceImpl, Dest, DeviceGlobalPtr, IsDeviceImageScoped, NumBytes,
1751  Offset);
1752  });
1753 }
1754 
1755 const std::shared_ptr<detail::context_impl> &
1756 handler::getContextImplPtr() const {
1757  return MQueue->getContextImplPtr();
1758 }
1759 
1760 void handler::setKernelCacheConfig(handler::StableKernelCacheConfig Config) {
1761  switch (Config) {
1762  case handler::StableKernelCacheConfig::Default:
1763  impl->MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT;
1764  break;
1765  case handler::StableKernelCacheConfig::LargeSLM:
1766  impl->MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_SLM;
1767  break;
1768  case handler::StableKernelCacheConfig::LargeData:
1769  impl->MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_DATA;
1770  break;
1771  }
1772 }
1773 
1774 void handler::setKernelIsCooperative(bool KernelIsCooperative) {
1775  impl->MKernelIsCooperative = KernelIsCooperative;
1776 }
1777 
1778 void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) {
1779  throwIfGraphAssociated<
1780  syclex::detail::UnsupportedGraphFeatures::
1781  sycl_ext_oneapi_experimental_cuda_cluster_launch>();
1782  impl->MKernelUsesClusterLaunch = true;
1783  impl->MNDRDesc.setClusterDimensions(ClusterSize, Dims);
1784 }
1785 
1786 void handler::ext_oneapi_graph(
1787  ext::oneapi::experimental::command_graph<
1788  ext::oneapi::experimental::graph_state::executable>
1789  Graph) {
1790  setType(detail::CGType::ExecCommandBuffer);
1791  impl->MExecGraph = detail::getSyclObjImpl(Graph);
1792 }
1793 
1794 std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
1795 handler::getCommandGraph() const {
1796  if (impl->MGraph) {
1797  return impl->MGraph;
1798  }
1799  return MQueue->getCommandGraph();
1800 }
1801 
1802 void handler::setUserFacingNodeType(ext::oneapi::experimental::node_type Type) {
1803  impl->MUserFacingNodeType = Type;
1804 }
1805 
1806 std::optional<std::array<size_t, 3>> handler::getMaxWorkGroups() {
1807  auto Dev = MQueue->getDeviceImplPtr();
1808  std::array<size_t, 3> UrResult = {};
1809  auto Ret = Dev->getPlugin()->call_nocheck(
1810  urDeviceGetInfo, Dev->getHandleRef(),
1811  UrInfoCode<
1812  ext::oneapi::experimental::info::device::max_work_groups<3>>::value,
1813  sizeof(UrResult), &UrResult, nullptr);
1814  if (Ret == UR_RESULT_SUCCESS) {
1815  return UrResult;
1816  }
1817  return {};
1818 }
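// Illustrative sketch of the corresponding user-level query; the descriptor
// is the same one used via UrInfoCode above, so (assumption) it can also be
// queried directly from a device:
//
//   namespace syclex = sycl::ext::oneapi::experimental;
//   auto MaxWG = Q.get_device()
//                    .get_info<syclex::info::device::max_work_groups<3>>();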
1819 
1820 std::tuple<std::array<size_t, 3>, bool> handler::getMaxWorkGroups_v2() {
1821  auto ImmRess = getMaxWorkGroups();
1822  if (ImmRess)
1823  return {*ImmRess, true};
1824  return {std::array<size_t, 3>{0, 0, 0}, false};
1825 }
1826 
1827 void handler::setNDRangeUsed(bool Value) { impl->MNDRangeUsed = Value; }
1828 
1829 void handler::registerDynamicParameter(
1830  ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase,
1831  int ArgIndex) {
1832  if (MQueue && MQueue->getCommandGraph()) {
1833  throw sycl::exception(
1834  make_error_code(errc::invalid),
1835  "Dynamic Parameters cannot be used with Graph Queue recording.");
1836  }
1837  if (!impl->MGraph) {
1838  throw sycl::exception(
1839  make_error_code(errc::invalid),
1840  "Dynamic Parameters cannot be used with normal SYCL submissions");
1841  }
1842 
1843  auto Paraimpl = detail::getSyclObjImpl(DynamicParamBase);
1844  if (Paraimpl->MGraph != this->impl->MGraph) {
1845  throw sycl::exception(
1846  make_error_code(errc::invalid),
1847  "Cannot use a Dynamic Parameter with a node associated with a graph "
1848  "other than the one it was created with.");
1849  }
1850  impl->MDynamicParameters.emplace_back(Paraimpl.get(), ArgIndex);
1851 }
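// Illustrative sketch of how a dynamic parameter reaches this registration
// path (API from sycl_ext_oneapi_graph; KernelObj is a kernel object obtained
// from a kernel bundle and is an assumption of the sketch):
//
//   namespace syclex = sycl::ext::oneapi::experimental;
//   syclex::command_graph Graph{Q.get_context(), Q.get_device()};
//   int *Ptr = sycl::malloc_device<int>(16, Q);
//   syclex::dynamic_parameter DynPtr(Graph, Ptr);
//   Graph.add([&](sycl::handler &CGH) {
//     CGH.set_arg(0, DynPtr); // ends up in registerDynamicParameter
//     CGH.parallel_for(sycl::range<1>{16}, KernelObj);
//   });
//   // Later: DynPtr.update(OtherPtr); then update the executable graph.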
1852 
1853 bool handler::eventNeeded() const { return impl->MEventNeeded; }
1854 
1855 void *handler::storeRawArg(const void *Ptr, size_t Size) {
1856  impl->CGData.MArgsStorage.emplace_back(Size);
1857  void *Storage = static_cast<void *>(impl->CGData.MArgsStorage.back().data());
1858  std::memcpy(Storage, Ptr, Size);
1859  return Storage;
1860 }
1861 
1862 void handler::SetHostTask(std::function<void()> &&Func) {
1863  setNDRangeDescriptor(range<1>(1));
1864  impl->MHostTask.reset(new detail::HostTask(std::move(Func)));
1865  setType(detail::CGType::CodeplayHostTask);
1866 }
1867 
1868 void handler::SetHostTask(std::function<void(interop_handle)> &&Func) {
1869  setNDRangeDescriptor(range<1>(1));
1870  impl->MHostTask.reset(new detail::HostTask(std::move(Func)));
1871  setType(detail::CGType::CodeplayHostTask);
1872 }
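// Illustrative sketch: both overloads back handler::host_task, e.g.:
//
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.host_task([=](sycl::interop_handle IH) {
//       // Runs on the host once dependencies resolve; IH gives access to
//       // backend-native objects.
//     });
//   });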
1873 
1874 void handler::addAccessorReq(detail::AccessorImplPtr Accessor) {
1875  // Add accessor to the list of requirements.
1876  impl->CGData.MRequirements.push_back(Accessor.get());
1877  // Store copy of the accessor.
1878  impl->CGData.MAccStorage.push_back(std::move(Accessor));
1879 }
1880 
1881 void handler::addLifetimeSharedPtrStorage(std::shared_ptr<const void> SPtr) {
1882  impl->CGData.MSharedPtrStorage.push_back(std::move(SPtr));
1883 }
1884 
1885 void handler::addArg(detail::kernel_param_kind_t ArgKind, void *Req,
1886  int AccessTarget, int ArgIndex) {
1887  impl->MArgs.emplace_back(ArgKind, Req, AccessTarget, ArgIndex);
1888 }
1889 
1890 void handler::clearArgs() { impl->MArgs.clear(); }
1891 
1892 void handler::setArgsToAssociatedAccessors() {
1893  impl->MArgs = impl->MAssociatedAccesors;
1894 }
1895 
1896 bool handler::HasAssociatedAccessor(detail::AccessorImplHost *Req,
1897  access::target AccessTarget) const {
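  // Note: returns true when no matching accessor requirement has been
  // associated yet (the find_if below reaches the end of
  // MAssociatedAccesors).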
1898  return std::find_if(
1899  impl->MAssociatedAccesors.cbegin(),
1900  impl->MAssociatedAccesors.cend(), [&](const detail::ArgDesc &AD) {
1901  return AD.MType == detail::kernel_param_kind_t::kind_accessor &&
1902  AD.MPtr == Req &&
1903  AD.MSize == static_cast<int>(AccessTarget);
1904  }) == impl->MAssociatedAccesors.end();
1905 }
1906 
1907 void handler::setType(sycl::detail::CGType Type) { impl->MCGType = Type; }
1908 sycl::detail::CGType handler::getType() const { return impl->MCGType; }
1909 
1910 void handler::setNDRangeDescriptorPadded(sycl::range<3> N,
1911  bool SetNumWorkGroups, int Dims) {
1912  impl->MNDRDesc = NDRDescT{N, SetNumWorkGroups, Dims};
1913 }
1914 void handler::setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
1915  sycl::id<3> Offset, int Dims) {
1916  impl->MNDRDesc = NDRDescT{NumWorkItems, Offset, Dims};
1917 }
1918 void handler::setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
1919  sycl::range<3> LocalSize,
1920  sycl::id<3> Offset, int Dims) {
1921  impl->MNDRDesc = NDRDescT{NumWorkItems, LocalSize, Offset, Dims};
1922 }
1923 
1924 } // namespace _V1
1925 } // namespace sycl