DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory_manager.cpp
1 //==-------------- memory_manager.cpp --------------------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ur_api.h"
10 #include <detail/context_impl.hpp>
12 #include <detail/event_impl.hpp>
15 #include <detail/queue_impl.hpp>
16 #include <detail/ur_utils.hpp>
17 #include <detail/xpti_registry.hpp>
18 
19 #include <sycl/detail/ur.hpp>
21 #include <sycl/usm/usm_enums.hpp>
23 
24 #include <algorithm>
25 #include <cassert>
26 #include <cstring>
27 #include <vector>
28 
29 #ifdef XPTI_ENABLE_INSTRUMENTATION
30 #include <xpti/xpti_data_types.h>
31 #include <xpti/xpti_trace_framework.hpp>
32 #endif
33 
34 namespace sycl {
35 inline namespace _V1 {
36 namespace detail {
37 
38 #ifdef XPTI_ENABLE_INSTRUMENTATION
39 uint8_t GMemAllocStreamID;
40 xpti::trace_event_data_t *GMemAllocEvent;
41 #endif
42 
43 uint64_t emitMemAllocBeginTrace(uintptr_t ObjHandle, size_t AllocSize,
44  size_t GuardZone) {
45  (void)ObjHandle;
46  (void)AllocSize;
47  (void)GuardZone;
48  uint64_t CorrelationID = 0;
49 #ifdef XPTI_ENABLE_INSTRUMENTATION
50  constexpr uint16_t NotificationTraceType =
51  static_cast<uint16_t>(xpti::trace_point_type_t::mem_alloc_begin);
52  if (xptiCheckTraceEnabled(GMemAllocStreamID, NotificationTraceType)) {
53  xpti::mem_alloc_data_t MemAlloc{ObjHandle, 0 /* alloc ptr */, AllocSize,
54  GuardZone};
55 
56  CorrelationID = xptiGetUniqueId();
57  xptiNotifySubscribers(GMemAllocStreamID, NotificationTraceType,
58  GMemAllocEvent, nullptr, CorrelationID, &MemAlloc);
59  }
60 #endif
61  return CorrelationID;
62 }
63 
64 void emitMemAllocEndTrace(uintptr_t ObjHandle, uintptr_t AllocPtr,
65  size_t AllocSize, size_t GuardZone,
66  uint64_t CorrelationID) {
67  (void)ObjHandle;
68  (void)AllocPtr;
69  (void)AllocSize;
70  (void)GuardZone;
71  (void)CorrelationID;
72 #ifdef XPTI_ENABLE_INSTRUMENTATION
73  constexpr uint16_t NotificationTraceType =
74  static_cast<uint16_t>(xpti::trace_point_type_t::mem_alloc_end);
75  if (xptiCheckTraceEnabled(GMemAllocStreamID, NotificationTraceType)) {
76  xpti::mem_alloc_data_t MemAlloc{ObjHandle, AllocPtr, AllocSize, GuardZone};
77 
78  xptiNotifySubscribers(GMemAllocStreamID, NotificationTraceType,
79  GMemAllocEvent, nullptr, CorrelationID, &MemAlloc);
80  }
81 #endif
82 }
83 
84 uint64_t emitMemReleaseBeginTrace(uintptr_t ObjHandle, uintptr_t AllocPtr) {
85  (void)ObjHandle;
86  (void)AllocPtr;
87  uint64_t CorrelationID = 0;
88 #ifdef XPTI_ENABLE_INSTRUMENTATION
89  constexpr uint16_t NotificationTraceType =
90  static_cast<uint16_t>(xpti::trace_point_type_t::mem_release_begin);
91  if (xptiCheckTraceEnabled(GMemAllocStreamID, NotificationTraceType)) {
92  xpti::mem_alloc_data_t MemAlloc{ObjHandle, AllocPtr, 0 /* alloc size */,
93  0 /* guard zone */};
94 
95  CorrelationID = xptiGetUniqueId();
96  xptiNotifySubscribers(GMemAllocStreamID, NotificationTraceType,
97  GMemAllocEvent, nullptr, CorrelationID, &MemAlloc);
98  }
99 #endif
100  return CorrelationID;
101 }
102 
103 void emitMemReleaseEndTrace(uintptr_t ObjHandle, uintptr_t AllocPtr,
104  uint64_t CorrelationID) {
105  (void)ObjHandle;
106  (void)AllocPtr;
107  (void)CorrelationID;
108 #ifdef XPTI_ENABLE_INSTRUMENTATION
109  constexpr uint16_t NotificationTraceType =
110  static_cast<uint16_t>(xpti::trace_point_type_t::mem_release_end);
111  if (xptiCheckTraceEnabled(GMemAllocStreamID, NotificationTraceType)) {
112  xpti::mem_alloc_data_t MemAlloc{ObjHandle, AllocPtr, 0 /* alloc size */,
113  0 /* guard zone */};
114 
115  xptiNotifySubscribers(GMemAllocStreamID, NotificationTraceType,
116  GMemAllocEvent, nullptr, CorrelationID, &MemAlloc);
117  }
118 #endif
119 }
120 
121 static void waitForEvents(const std::vector<EventImplPtr> &Events) {
122  // Assuming all events will be on the same device or
123  // devices associated with the same Backend.
124  if (!Events.empty()) {
125  const PluginPtr &Plugin = Events[0]->getPlugin();
126  std::vector<ur_event_handle_t> UrEvents(Events.size());
127  std::transform(Events.begin(), Events.end(), UrEvents.begin(),
128  [](const EventImplPtr &EventImpl) {
129  return EventImpl->getHandleRef();
130  });
131  if (!UrEvents.empty() && UrEvents[0]) {
132  Plugin->call(urEventWait, UrEvents.size(), &UrEvents[0]);
133  }
134  }
135 }
136 
137 void memBufferCreateHelper(const PluginPtr &Plugin, ur_context_handle_t Ctx,
138  ur_mem_flags_t Flags, size_t Size,
139  ur_mem_handle_t *RetMem,
140  const ur_buffer_properties_t *Props) {
141 #ifdef XPTI_ENABLE_INSTRUMENTATION
142  uint64_t CorrID = 0;
143 #endif
144  // We only want to instrument urMemBufferCreate
145  {
146 #ifdef XPTI_ENABLE_INSTRUMENTATION
147  CorrID =
148  emitMemAllocBeginTrace(0 /* mem object */, Size, 0 /* guard zone */);
149  xpti::utils::finally _{[&] {
150  // C-style cast is required for MSVC
151  uintptr_t MemObjID = (uintptr_t)(*RetMem);
152  ur_native_handle_t Ptr = 0;
153  // Always use call_nocheck here, because call may throw an exception,
154  // and this lambda will be called from a destructor, which in combination
155  // rewards us with UB.
156  // When doing buffer interop we don't know what device the memory should
157  // be resident on, so pass nullptr for Device param. Buffer interop may
158  // not be supported by all backends.
159  Plugin->call_nocheck(urMemGetNativeHandle, *RetMem, /*Dev*/ nullptr,
160  &Ptr);
161  emitMemAllocEndTrace(MemObjID, (uintptr_t)(Ptr), Size, 0 /* guard zone */,
162  CorrID);
163  }};
164 #endif
165  if (Size)
166  Plugin->call(urMemBufferCreate, Ctx, Flags, Size, Props, RetMem);
167  }
168 }
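
The begin/end trace pairing above relies on xpti::utils::finally, a scope guard that runs a callback when the enclosing scope exits, even on an early return or exception. A minimal sketch of the same idea in standard C++ follows; ScopeExit is an illustrative name, not the actual xpti utility, and the puts calls stand in for the trace notifications.

    #include <cstdio>
    #include <utility>

    // Illustrative scope guard: runs the stored callable in its destructor.
    // The callable must not throw: destructors are noexcept by default, so a
    // throwing cleanup terminates the program. This is why the cleanup lambdas
    // in this file use call_nocheck rather than the throwing call wrapper.
    template <typename Callable> class ScopeExit {
    public:
      explicit ScopeExit(Callable F) : MFunc(std::move(F)) {}
      ~ScopeExit() { MFunc(); }
      ScopeExit(const ScopeExit &) = delete;
      ScopeExit &operator=(const ScopeExit &) = delete;

    private:
      Callable MFunc;
    };

    int main() {
      std::puts("mem_alloc_begin");
      {
        ScopeExit _{[] { std::puts("mem_alloc_end"); }};
        std::puts("... create the buffer ...");
      } // "mem_alloc_end" is emitted here, whatever path left the scope
      return 0;
    }
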
169 
170 void memReleaseHelper(const PluginPtr &Plugin, ur_mem_handle_t Mem) {
171  // FIXME urMemRelease does not guarantee memory release. It is only true if
172  // the reference counter is 1. However, the SYCL runtime currently calls
173  // urMemRetain only for OpenCL interop.
174 #ifdef XPTI_ENABLE_INSTRUMENTATION
175  uint64_t CorrID = 0;
176  // C-style cast is required for MSVC
177  uintptr_t MemObjID = (uintptr_t)(Mem);
178  uintptr_t Ptr = 0;
179  // Do not make unnecessary UR calls without instrumentation enabled
180  if (xptiTraceEnabled()) {
181  ur_native_handle_t PtrHandle = 0;
182  // When doing buffer interop we don't know what device the memory should be
183  // resident on, so pass nullptr for Device param. Buffer interop may not be
184  // supported by all backends.
185  Plugin->call_nocheck(urMemGetNativeHandle, Mem, /*Dev*/ nullptr,
186  &PtrHandle);
187  Ptr = (uintptr_t)(PtrHandle);
188  }
189 #endif
190  // We only want to instrument urMemRelease
191  {
192 #ifdef XPTI_ENABLE_INSTRUMENTATION
193  CorrID = emitMemReleaseBeginTrace(MemObjID, Ptr);
194  xpti::utils::finally _{
195  [&] { emitMemReleaseEndTrace(MemObjID, Ptr, CorrID); }};
196 #endif
197  Plugin->call(urMemRelease, Mem);
198  }
199 }
200 
201 void memBufferMapHelper(const PluginPtr &Plugin, ur_queue_handle_t Queue,
202  ur_mem_handle_t Buffer, bool Blocking,
203  ur_map_flags_t Flags, size_t Offset, size_t Size,
204  uint32_t NumEvents, const ur_event_handle_t *WaitList,
205  ur_event_handle_t *Event, void **RetMap) {
206 #ifdef XPTI_ENABLE_INSTRUMENTATION
207  uint64_t CorrID = 0;
208  uintptr_t MemObjID = (uintptr_t)(Buffer);
209 #endif
210  // We only want to instrument urEnqueueMemBufferMap
211 
212 #ifdef XPTI_ENABLE_INSTRUMENTATION
213  CorrID = emitMemAllocBeginTrace(MemObjID, Size, 0 /* guard zone */);
214  xpti::utils::finally _{[&] {
215  emitMemAllocEndTrace(MemObjID, (uintptr_t)(*RetMap), Size,
216  0 /* guard zone */, CorrID);
217  }};
218 #endif
219  Plugin->call(urEnqueueMemBufferMap, Queue, Buffer, Blocking, Flags, Offset,
220  Size, NumEvents, WaitList, Event, RetMap);
221 }
222 
223 void memUnmapHelper(const PluginPtr &Plugin, ur_queue_handle_t Queue,
224  ur_mem_handle_t Mem, void *MappedPtr, uint32_t NumEvents,
225  const ur_event_handle_t *WaitList,
226  ur_event_handle_t *Event) {
227 #ifdef XPTI_ENABLE_INSTRUMENTATION
228  uint64_t CorrID = 0;
229  uintptr_t MemObjID = (uintptr_t)(Mem);
230  uintptr_t Ptr = (uintptr_t)(MappedPtr);
231 #endif
232  // We only want to instrument urEnqueueMemUnmap
233  {
234 #ifdef XPTI_ENABLE_INSTRUMENTATION
235  CorrID = emitMemReleaseBeginTrace(MemObjID, Ptr);
236  xpti::utils::finally _{[&] {
237  // There's no way for SYCL to know when the pointer is freed, so we have
238  // to explicitly wait for the end of data transfers here in order to
239  // report correct events.
240  // Always use call_nocheck here, because call may throw an exception,
241  // and this lambda will be called from a destructor, which in combination
242  // rewards us with UB.
243  Plugin->call_nocheck(urEventWait, 1, Event);
244  emitMemReleaseEndTrace(MemObjID, Ptr, CorrID);
245  }};
246 #endif
247  Plugin->call(urEnqueueMemUnmap, Queue, Mem, MappedPtr, NumEvents, WaitList,
248  Event);
249  }
250 }
251 
252 void MemoryManager::release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj,
253  void *MemAllocation,
254  std::vector<EventImplPtr> DepEvents,
255  ur_event_handle_t &OutEvent) {
256  // There is no async API for memory releasing. Explicitly wait for all
257  // dependency events and return empty event.
258  waitForEvents(DepEvents);
259  OutEvent = nullptr;
260  XPTIRegistry::bufferReleaseNotification(MemObj, MemAllocation);
261  MemObj->releaseMem(TargetContext, MemAllocation);
262 }
263 
264 void MemoryManager::releaseMemObj(ContextImplPtr TargetContext,
265  SYCLMemObjI *MemObj, void *MemAllocation,
266  void *UserPtr) {
267  if (UserPtr == MemAllocation) {
268  // Do nothing as it's user provided memory.
269  return;
270  }
271 
272  if (!TargetContext) {
273  MemObj->releaseHostMem(MemAllocation);
274  return;
275  }
276 
277  const PluginPtr &Plugin = TargetContext->getPlugin();
278  memReleaseHelper(Plugin, ur::cast<ur_mem_handle_t>(MemAllocation));
279 }
280 
281 void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj,
282  bool InitFromUserData, void *HostPtr,
283  std::vector<EventImplPtr> DepEvents,
284  ur_event_handle_t &OutEvent) {
285  // There is no async API for memory allocation. Explicitly wait for all
286  // dependency events and return empty event.
287  waitForEvents(DepEvents);
288  OutEvent = nullptr;
289 
290  return MemObj->allocateMem(TargetContext, InitFromUserData, HostPtr,
291  OutEvent);
292 }
293 
294 void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr,
295  bool HostPtrReadOnly, size_t Size,
296  const sycl::property_list &) {
297  std::ignore = HostPtrReadOnly;
298  std::ignore = Size;
299 
300  // Can return user pointer directly if it is not a nullptr.
301  if (UserPtr)
302  return UserPtr;
303 
304  return MemObj->allocateHostMem();
305 }
306 
307 void *MemoryManager::allocateInteropMemObject(
308  ContextImplPtr TargetContext, void *UserPtr,
309  const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext,
310  const sycl::property_list &, ur_event_handle_t &OutEventToWait) {
311  (void)TargetContext;
312  (void)InteropContext;
313  // If the memory object was created with an interop c'tor, return the cl_mem as is.
314  assert(TargetContext == InteropContext && "Expected matching contexts");
315 
316  OutEventToWait = InteropEvent->getHandleRef();
317  // Retain the event since it will be released during alloca command
318  // destruction
319  if (nullptr != OutEventToWait) {
320  const PluginPtr &Plugin = InteropEvent->getPlugin();
321  Plugin->call(urEventRetain, OutEventToWait);
322  }
323  return UserPtr;
324 }
325 
326 static ur_mem_flags_t getMemObjCreationFlags(void *UserPtr,
327  bool HostPtrReadOnly) {
328  // Create read_write mem object to handle arbitrary uses.
329  ur_mem_flags_t Result =
330  HostPtrReadOnly ? UR_MEM_FLAG_READ_ONLY : UR_MEM_FLAG_READ_WRITE;
331  if (UserPtr)
332  Result |= UR_MEM_FLAG_USE_HOST_POINTER;
333  return Result;
334 }
335 
336 void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext,
337  void *UserPtr, bool HostPtrReadOnly,
338  const ur_image_desc_t &Desc,
339  const ur_image_format_t &Format,
340  const sycl::property_list &) {
341  ur_mem_flags_t CreationFlags =
342  getMemObjCreationFlags(UserPtr, HostPtrReadOnly);
343 
344  ur_mem_handle_t NewMem = nullptr;
345  const PluginPtr &Plugin = TargetContext->getPlugin();
346  Plugin->call(urMemImageCreate, TargetContext->getHandleRef(), CreationFlags,
347  &Format, &Desc, UserPtr, &NewMem);
348  return NewMem;
349 }
350 
351 void *
352 MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr,
353  bool HostPtrReadOnly, const size_t Size,
354  const sycl::property_list &PropsList) {
355  ur_mem_flags_t CreationFlags =
356  getMemObjCreationFlags(UserPtr, HostPtrReadOnly);
357  if (PropsList.has_property<
358  sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
359  CreationFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER;
360 
361  ur_mem_handle_t NewMem = nullptr;
362  const PluginPtr &Plugin = TargetContext->getPlugin();
363 
364  ur_buffer_properties_t AllocProps = {UR_STRUCTURE_TYPE_BUFFER_PROPERTIES,
365  nullptr, UserPtr};
366 
367  void **Next = &AllocProps.pNext;
368  ur_buffer_alloc_location_properties_t LocationProperties = {
369  UR_STRUCTURE_TYPE_BUFFER_ALLOC_LOCATION_PROPERTIES, nullptr, 0};
370  if (PropsList.has_property<property::buffer::detail::buffer_location>() &&
371  TargetContext->isBufferLocationSupported()) {
372  LocationProperties.location =
373  PropsList.get_property<property::buffer::detail::buffer_location>()
374  .get_buffer_location();
375  *Next = &LocationProperties;
376  Next = &LocationProperties.pNext;
377  }
378 
379  ur_buffer_channel_properties_t ChannelProperties = {
380  UR_STRUCTURE_TYPE_BUFFER_CHANNEL_PROPERTIES, nullptr, 0};
381  if (PropsList.has_property<property::buffer::mem_channel>()) {
382  ChannelProperties.channel =
383  PropsList.get_property<property::buffer::mem_channel>().get_channel();
384  *Next = &ChannelProperties;
385  }
386 
387  memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), CreationFlags,
388  Size, &NewMem, &AllocProps);
389  return NewMem;
390 }
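
allocateBufferObject threads optional extension structs onto ur_buffer_properties_t through a pNext chain: Next always points at the pNext field of the last appended struct. Below is a generic sketch of that chaining pattern with made-up struct names (not the real UR types), showing only the pointer bookkeeping.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical extension structs, each starting with a pNext pointer,
    // mirroring the chaining convention used above (C++17 for the defaults).
    struct BaseProps { void *pNext = nullptr; };
    struct LocationProps { void *pNext = nullptr; std::uint32_t location = 0; };
    struct ChannelProps { void *pNext = nullptr; std::uint32_t channel = 0; };

    int main() {
      BaseProps Props;
      void **Next = &Props.pNext; // always points at the tail's pNext slot

      LocationProps Location{nullptr, 2};
      bool HasLocation = true;
      if (HasLocation) {
        *Next = &Location;      // append to the chain
        Next = &Location.pNext; // advance the tail pointer
      }

      ChannelProps Channel{nullptr, 1};
      bool HasChannel = true;
      if (HasChannel) {
        *Next = &Channel;
        Next = &Channel.pNext;
      }

      // A consumer walks Props.pNext -> Location -> Channel -> nullptr.
      std::printf("chain head: %p\n", Props.pNext);
      return 0;
    }
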
391 
392 void *MemoryManager::allocateMemBuffer(ContextImplPtr TargetContext,
393  SYCLMemObjI *MemObj, void *UserPtr,
394  bool HostPtrReadOnly, size_t Size,
395  const EventImplPtr &InteropEvent,
396  const ContextImplPtr &InteropContext,
397  const sycl::property_list &PropsList,
398  ur_event_handle_t &OutEventToWait) {
399  void *MemPtr;
400  if (!TargetContext)
401  MemPtr =
402  allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList);
403  else if (UserPtr && InteropContext)
404  MemPtr =
405  allocateInteropMemObject(TargetContext, UserPtr, InteropEvent,
406  InteropContext, PropsList, OutEventToWait);
407  else
408  MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size,
409  PropsList);
410  XPTIRegistry::bufferAssociateNotification(MemObj, MemPtr);
411  return MemPtr;
412 }
413 
414 void *MemoryManager::allocateMemImage(
415  ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr,
416  bool HostPtrReadOnly, size_t Size, const ur_image_desc_t &Desc,
417  const ur_image_format_t &Format, const EventImplPtr &InteropEvent,
418  const ContextImplPtr &InteropContext, const sycl::property_list &PropsList,
419  ur_event_handle_t &OutEventToWait) {
420  if (!TargetContext)
421  return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size,
422  PropsList);
423  if (UserPtr && InteropContext)
424  return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent,
425  InteropContext, PropsList, OutEventToWait);
426  return allocateImageObject(TargetContext, UserPtr, HostPtrReadOnly, Desc,
427  Format, PropsList);
428 }
429 
430 void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext,
431  void *ParentMemObj, size_t ElemSize,
432  size_t Offset, range<3> Range,
433  std::vector<EventImplPtr> DepEvents,
434  ur_event_handle_t &OutEvent) {
435  waitForEvents(DepEvents);
436  OutEvent = nullptr;
437 
438  if (!TargetContext)
439  return static_cast<void *>(static_cast<char *>(ParentMemObj) + Offset);
440 
441  size_t SizeInBytes = ElemSize;
442  for (size_t I = 0; I < 3; ++I)
443  SizeInBytes *= Range[I];
444 
445  ur_result_t Error = UR_RESULT_SUCCESS;
446  ur_buffer_region_t Region = {UR_STRUCTURE_TYPE_BUFFER_REGION, nullptr, Offset,
447  SizeInBytes};
448  ur_mem_handle_t NewMem;
449  const PluginPtr &Plugin = TargetContext->getPlugin();
450  Error = Plugin->call_nocheck(
451  urMemBufferPartition, ur::cast<ur_mem_handle_t>(ParentMemObj),
452  UR_MEM_FLAG_READ_WRITE, UR_BUFFER_CREATE_TYPE_REGION, &Region, &NewMem);
453  if (Error == UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET)
454  throw detail::set_ur_error(
455  exception(make_error_code(errc::invalid),
456  "Specified offset of the sub-buffer being constructed is not "
457  "a multiple of the memory base address alignment"),
458  Error);
459 
460  Plugin->checkUrResult(Error);
461 
462  return NewMem;
463 }
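
allocateMemSubBuffer converts the element counts in Range into a byte region starting at Offset before asking the backend to partition the parent buffer. A quick standalone check of that arithmetic (plain C++, no UR calls; the concrete numbers are just an example):

    #include <cassert>
    #include <cstddef>

    int main() {
      // A sub-buffer over 4 x 8 x 16 elements of 4 bytes each, 64 bytes in.
      const std::size_t ElemSize = 4;
      const std::size_t Range[3] = {4, 8, 16};
      const std::size_t Offset = 64;

      std::size_t SizeInBytes = ElemSize;
      for (std::size_t I = 0; I < 3; ++I)
        SizeInBytes *= Range[I];

      assert(SizeInBytes == 2048); // 4 * 4 * 8 * 16
      // The region passed to the backend is then {Offset, SizeInBytes}; if
      // Offset is not a multiple of the device's base address alignment, the
      // runtime surfaces UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET as the
      // invalid-offset exception shown above.
      (void)Offset;
      return 0;
    }
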
464 
465 struct TermPositions {
466  int XTerm;
467  int YTerm;
468  int ZTerm;
469 };
470 void prepTermPositions(TermPositions &pos, int Dimensions,
471  detail::SYCLMemObjI::MemObjType Type) {
472  // For buffers, the offsets/ranges coming from the accessor are always
473  // id<3>/range<3>, but their organization varies by dimension:
474  // 1 ==> {width, 1, 1}
475  // 2 ==> {height, width, 1}
476  // 3 ==> {depth, height, width}
477  // Some callers schedule 0 as DimDst/DimSrc.
478 
479  if (Type == detail::SYCLMemObjI::MemObjType::Buffer) {
480  if (Dimensions == 3) {
481  pos.XTerm = 2, pos.YTerm = 1, pos.ZTerm = 0;
482  } else if (Dimensions == 2) {
483  pos.XTerm = 1, pos.YTerm = 0, pos.ZTerm = 2;
484  } else { // Dimension is 1 or 0
485  pos.XTerm = 0, pos.YTerm = 1, pos.ZTerm = 2;
486  }
487  } else { // While the range<>/id<> use by images differs from that of buffers,
488  // it's consistent with their accessors.
489  pos.XTerm = 0;
490  pos.YTerm = 1;
491  pos.ZTerm = 2;
492  }
493 }
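
As the comment explains, buffer offsets and ranges always arrive as id<3>/range<3>, but the slot holding the fastest-varying (width) component depends on the dimensionality; TermPositions records which index holds width (XTerm), height (YTerm) and depth (ZTerm). A standalone illustration of the 2D buffer case, using example sizes:

    #include <cassert>
    #include <cstddef>

    struct TermPositions { int XTerm, YTerm, ZTerm; };

    int main() {
      // 2D buffer: accessor ranges are organized as {height, width, 1},
      // so width lives at index 1 and height at index 0.
      TermPositions Pos{/*XTerm=*/1, /*YTerm=*/0, /*ZTerm=*/2};

      const std::size_t AccessRange[3] = {480, 640, 1}; // {height, width, 1}
      const std::size_t Offset[3] = {2, 16, 0};         // {row, column, 0}
      const std::size_t ElemSize = 4;

      std::size_t WidthBytes = AccessRange[Pos.XTerm] * ElemSize; // 640 * 4
      std::size_t XOffBytes = Offset[Pos.XTerm] * ElemSize;       // 16 * 4
      assert(WidthBytes == 2560 && XOffBytes == 64);
      assert(AccessRange[Pos.YTerm] == 480); // rows transferred per slice
      return 0;
    }
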
494 
495 void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
496  unsigned int DimSrc, sycl::range<3> SrcSize,
497  sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
498  unsigned int SrcElemSize, ur_mem_handle_t DstMem,
499  QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range<3> DstSize,
500  sycl::range<3> DstAccessRange, sycl::id<3> DstOffset,
501  unsigned int DstElemSize, std::vector<ur_event_handle_t> DepEvents,
502  ur_event_handle_t &OutEvent,
503  const detail::EventImplPtr &OutEventImpl) {
504  (void)SrcAccessRange;
505  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
506  assert(TgtQueue && "Destination mem object queue must be not nullptr");
507 
508  const ur_queue_handle_t Queue = TgtQueue->getHandleRef();
509  const PluginPtr &Plugin = TgtQueue->getPlugin();
510 
511  detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
512  TermPositions SrcPos, DstPos;
513  prepTermPositions(SrcPos, DimSrc, MemType);
514  prepTermPositions(DstPos, DimDst, MemType);
515 
516  size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
517  size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
518  size_t DstAccessRangeWidthBytes = DstAccessRange[DstPos.XTerm] * DstElemSize;
519  size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
520  size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;
521 
522  if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) {
523  if (1 == DimDst && 1 == DimSrc) {
524  if (OutEventImpl != nullptr)
525  OutEventImpl->setHostEnqueueTime();
526  Plugin->call(urEnqueueMemBufferWrite, Queue, DstMem,
527  /*blocking_write=*/false, DstXOffBytes,
528  DstAccessRangeWidthBytes, SrcMem + SrcXOffBytes,
529  DepEvents.size(), DepEvents.data(), &OutEvent);
530  } else {
531  size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
532  size_t BufferSlicePitch =
533  (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;
534  size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
535  size_t HostSlicePitch =
536  (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;
537 
538  ur_rect_offset_t BufferOffset{DstXOffBytes, DstOffset[DstPos.YTerm],
539  DstOffset[DstPos.ZTerm]};
540  ur_rect_offset_t HostOffset{SrcXOffBytes, SrcOffset[SrcPos.YTerm],
541  SrcOffset[SrcPos.ZTerm]};
542  ur_rect_region_t RectRegion{DstAccessRangeWidthBytes,
543  DstAccessRange[DstPos.YTerm],
544  DstAccessRange[DstPos.ZTerm]};
545  if (OutEventImpl != nullptr)
546  OutEventImpl->setHostEnqueueTime();
547  Plugin->call(urEnqueueMemBufferWriteRect, Queue, DstMem,
548  /*blocking_write=*/false, BufferOffset, HostOffset,
549  RectRegion, BufferRowPitch, BufferSlicePitch, HostRowPitch,
550  HostSlicePitch, SrcMem, DepEvents.size(), DepEvents.data(),
551  &OutEvent);
552  }
553  } else {
554  size_t InputRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
555  size_t InputSlicePitch =
556  (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;
557 
558  ur_rect_offset_t Origin{DstOffset[DstPos.XTerm], DstOffset[DstPos.YTerm],
559  DstOffset[DstPos.ZTerm]};
560  ur_rect_region_t Region{DstAccessRange[DstPos.XTerm],
561  DstAccessRange[DstPos.YTerm],
562  DstAccessRange[DstPos.ZTerm]};
563  if (OutEventImpl != nullptr)
564  OutEventImpl->setHostEnqueueTime();
565  Plugin->call(urEnqueueMemImageWrite, Queue, DstMem,
566  /*blocking_write=*/false, Origin, Region, InputRowPitch,
567  InputSlicePitch, SrcMem, DepEvents.size(), DepEvents.data(),
568  &OutEvent);
569  }
570 }
571 
572 void copyD2H(SYCLMemObjI *SYCLMemObj, ur_mem_handle_t SrcMem,
573  QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range<3> SrcSize,
574  sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
575  unsigned int SrcElemSize, char *DstMem, QueueImplPtr,
576  unsigned int DimDst, sycl::range<3> DstSize,
577  sycl::range<3> DstAccessRange, sycl::id<3> DstOffset,
578  unsigned int DstElemSize, std::vector<ur_event_handle_t> DepEvents,
579  ur_event_handle_t &OutEvent,
580  const detail::EventImplPtr &OutEventImpl) {
581  (void)DstAccessRange;
582  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
583  assert(SrcQueue && "Source mem object queue is expected to be not nullptr");
584 
585  const ur_queue_handle_t Queue = SrcQueue->getHandleRef();
586  const PluginPtr &Plugin = SrcQueue->getPlugin();
587 
588  detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
589  TermPositions SrcPos, DstPos;
590  prepTermPositions(SrcPos, DimSrc, MemType);
591  prepTermPositions(DstPos, DimDst, MemType);
592 
593  // For a given buffer, the various mem copy routines (copyD2H, copyH2D,
594  // copyD2D) will usually have the same values for AccessRange, Size,
595  // Dimension, Offset, etc. EXCEPT when the dtor for ~SYCLMemObjT is called.
596  // Essentially, it schedules a copyBack of chars; thus, in copyD2H the
597  // Dimension will be 1, and DstAccessRange[0] and DstSize[0] will be
598  // sized in bytes with a DstElemSize of 1.
599  size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
600  size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
601  size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize;
602  size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
603  size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;
604 
605  if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) {
606  if (1 == DimDst && 1 == DimSrc) {
607  if (OutEventImpl != nullptr)
608  OutEventImpl->setHostEnqueueTime();
609  Plugin->call(urEnqueueMemBufferRead, Queue, SrcMem,
610  /*blocking_read=*/false, SrcXOffBytes,
611  SrcAccessRangeWidthBytes, DstMem + DstXOffBytes,
612  DepEvents.size(), DepEvents.data(), &OutEvent);
613  } else {
614  size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
615  size_t BufferSlicePitch =
616  (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;
617  size_t HostRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
618  size_t HostSlicePitch =
619  (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;
620 
621  ur_rect_offset_t BufferOffset{SrcXOffBytes, SrcOffset[SrcPos.YTerm],
622  SrcOffset[SrcPos.ZTerm]};
623  ur_rect_offset_t HostOffset{DstXOffBytes, DstOffset[DstPos.YTerm],
624  DstOffset[DstPos.ZTerm]};
625  ur_rect_region_t RectRegion{SrcAccessRangeWidthBytes,
626  SrcAccessRange[SrcPos.YTerm],
627  SrcAccessRange[SrcPos.ZTerm]};
628  if (OutEventImpl != nullptr)
629  OutEventImpl->setHostEnqueueTime();
630  Plugin->call(urEnqueueMemBufferReadRect, Queue, SrcMem,
631  /*blocking_read=*/false, BufferOffset, HostOffset,
632  RectRegion, BufferRowPitch, BufferSlicePitch, HostRowPitch,
633  HostSlicePitch, DstMem, DepEvents.size(), DepEvents.data(),
634  &OutEvent);
635  }
636  } else {
637  size_t RowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
638  size_t SlicePitch =
639  (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;
640 
641  ur_rect_offset_t Offset{SrcOffset[SrcPos.XTerm], SrcOffset[SrcPos.YTerm],
642  SrcOffset[SrcPos.ZTerm]};
643  ur_rect_region_t Region{SrcAccessRange[SrcPos.XTerm],
644  SrcAccessRange[SrcPos.YTerm],
645  SrcAccessRange[SrcPos.ZTerm]};
646  if (OutEventImpl != nullptr)
647  OutEventImpl->setHostEnqueueTime();
648  Plugin->call(urEnqueueMemImageRead, Queue, SrcMem, false, Offset, Region,
649  RowPitch, SlicePitch, DstMem, DepEvents.size(),
650  DepEvents.data(), &OutEvent);
651  }
652 }
653 
654 void copyD2D(SYCLMemObjI *SYCLMemObj, ur_mem_handle_t SrcMem,
655  QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range<3> SrcSize,
656  sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
657  unsigned int SrcElemSize, ur_mem_handle_t DstMem, QueueImplPtr,
658  unsigned int DimDst, sycl::range<3> DstSize, sycl::range<3>,
659  sycl::id<3> DstOffset, unsigned int DstElemSize,
660  std::vector<ur_event_handle_t> DepEvents,
661  ur_event_handle_t &OutEvent,
662  const detail::EventImplPtr &OutEventImpl) {
663  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
664  assert(SrcQueue && "Source mem object and target mem object queues are "
665  "expected to be not nullptr");
666 
667  const ur_queue_handle_t Queue = SrcQueue->getHandleRef();
668  const PluginPtr &Plugin = SrcQueue->getPlugin();
669 
670  detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
671  TermPositions SrcPos, DstPos;
672  prepTermPositions(SrcPos, DimSrc, MemType);
673  prepTermPositions(DstPos, DimDst, MemType);
674 
675  size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
676  size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
677  size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize;
678  size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
679  size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;
680 
681  if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) {
682  if (1 == DimDst && 1 == DimSrc) {
683  if (OutEventImpl != nullptr)
684  OutEventImpl->setHostEnqueueTime();
685  Plugin->call(urEnqueueMemBufferCopy, Queue, SrcMem, DstMem, SrcXOffBytes,
686  DstXOffBytes, SrcAccessRangeWidthBytes, DepEvents.size(),
687  DepEvents.data(), &OutEvent);
688  } else {
689  // Passing 0 for the pitches is not allowed, because clEnqueueCopyBufferRect
690  // will calculate both the src and dest pitch using region[0], which is not
691  // correct if src and dest are not the same size.
692  size_t SrcRowPitch = SrcSzWidthBytes;
693  size_t SrcSlicePitch = (DimSrc <= 1)
694  ? SrcSzWidthBytes
695  : SrcSzWidthBytes * SrcSize[SrcPos.YTerm];
696  size_t DstRowPitch = DstSzWidthBytes;
697  size_t DstSlicePitch = (DimDst <= 1)
698  ? DstSzWidthBytes
699  : DstSzWidthBytes * DstSize[DstPos.YTerm];
700 
701  ur_rect_offset_t SrcOrigin{SrcXOffBytes, SrcOffset[SrcPos.YTerm],
702  SrcOffset[SrcPos.ZTerm]};
703  ur_rect_offset_t DstOrigin{DstXOffBytes, DstOffset[DstPos.YTerm],
704  DstOffset[DstPos.ZTerm]};
705  ur_rect_region_t Region{SrcAccessRangeWidthBytes,
706  SrcAccessRange[SrcPos.YTerm],
707  SrcAccessRange[SrcPos.ZTerm]};
708  if (OutEventImpl != nullptr)
709  OutEventImpl->setHostEnqueueTime();
710  Plugin->call(urEnqueueMemBufferCopyRect, Queue, SrcMem, DstMem, SrcOrigin,
711  DstOrigin, Region, SrcRowPitch, SrcSlicePitch, DstRowPitch,
712  DstSlicePitch, DepEvents.size(), DepEvents.data(),
713  &OutEvent);
714  }
715  } else {
716  ur_rect_offset_t SrcOrigin{SrcOffset[SrcPos.XTerm], SrcOffset[SrcPos.YTerm],
717  SrcOffset[SrcPos.ZTerm]};
718  ur_rect_offset_t DstOrigin{DstOffset[DstPos.XTerm], DstOffset[DstPos.YTerm],
719  DstOffset[DstPos.ZTerm]};
720  ur_rect_region_t Region{SrcAccessRange[SrcPos.XTerm],
721  SrcAccessRange[SrcPos.YTerm],
722  SrcAccessRange[SrcPos.ZTerm]};
723  if (OutEventImpl != nullptr)
724  OutEventImpl->setHostEnqueueTime();
725  Plugin->call(urEnqueueMemImageCopy, Queue, SrcMem, DstMem, SrcOrigin,
726  DstOrigin, Region, DepEvents.size(), DepEvents.data(),
727  &OutEvent);
728  }
729 }
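
For the rectangular device-to-device path, the pitches are always passed explicitly because, as the comment notes, a zero pitch would be recomputed from region[0], which is wrong when source and destination have different row widths. A small sketch of the pitch arithmetic, assuming an example 3D source of 1024 x 512 x 4 two-byte elements:

    #include <cassert>
    #include <cstddef>

    int main() {
      const std::size_t SrcSize[3] = {1024, 512, 4}; // {width, height, depth}
      const std::size_t SrcElemSize = 2;
      const unsigned DimSrc = 3;

      std::size_t SrcSzWidthBytes = SrcSize[0] * SrcElemSize; // bytes per row
      std::size_t SrcRowPitch = SrcSzWidthBytes;              // never 0 here
      std::size_t SrcSlicePitch =
          (DimSrc <= 1) ? SrcSzWidthBytes
                        : SrcSzWidthBytes * SrcSize[1];       // bytes per slice

      assert(SrcRowPitch == 2048);
      assert(SrcSlicePitch == 2048 * 512);
      return 0;
    }
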
730 
731 static void copyH2H(SYCLMemObjI *, char *SrcMem, QueueImplPtr,
732  unsigned int DimSrc, sycl::range<3> SrcSize,
733  sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
734  unsigned int SrcElemSize, char *DstMem, QueueImplPtr,
735  unsigned int DimDst, sycl::range<3> DstSize,
736  sycl::range<3> DstAccessRange, sycl::id<3> DstOffset,
737  unsigned int DstElemSize, std::vector<ur_event_handle_t>,
738  ur_event_handle_t &, const detail::EventImplPtr &) {
739  if ((DimSrc != 1 || DimDst != 1) &&
740  (SrcOffset != id<3>{0, 0, 0} || DstOffset != id<3>{0, 0, 0} ||
741  SrcSize != SrcAccessRange || DstSize != DstAccessRange)) {
742  throw exception(make_error_code(errc::feature_not_supported),
743  "Not supported configuration of memcpy requested");
744  }
745 
746  SrcMem += SrcOffset[0] * SrcElemSize;
747  DstMem += DstOffset[0] * DstElemSize;
748 
749  if (SrcMem == DstMem)
750  return;
751 
752  size_t BytesToCopy =
753  SrcAccessRange[0] * SrcElemSize * SrcAccessRange[1] * SrcAccessRange[2];
754  std::memcpy(DstMem, SrcMem, BytesToCopy);
755 }
756 
757 // Copies memory between: host and device, host and host, and
758 // device and device if the memory objects are bound to the same context.
759 void MemoryManager::copy(
760  SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue,
761  unsigned int DimSrc, sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange,
762  sycl::id<3> SrcOffset, unsigned int SrcElemSize, void *DstMem,
763  QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range<3> DstSize,
764  sycl::range<3> DstAccessRange, sycl::id<3> DstOffset,
765  unsigned int DstElemSize, std::vector<ur_event_handle_t> DepEvents,
766  ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl) {
767 
768  if (!SrcQueue) {
769  if (!TgtQueue)
770  copyH2H(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize,
771  SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, nullptr,
772  DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize,
773  std::move(DepEvents), OutEvent, OutEventImpl);
774  else
775  copyH2D(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize,
776  SrcAccessRange, SrcOffset, SrcElemSize,
777  ur::cast<ur_mem_handle_t>(DstMem), std::move(TgtQueue), DimDst,
778  DstSize, DstAccessRange, DstOffset, DstElemSize,
779  std::move(DepEvents), OutEvent, OutEventImpl);
780  } else {
781  if (!TgtQueue)
782  copyD2H(SYCLMemObj, ur::cast<ur_mem_handle_t>(SrcMem),
783  std::move(SrcQueue), DimSrc, SrcSize, SrcAccessRange, SrcOffset,
784  SrcElemSize, (char *)DstMem, nullptr, DimDst, DstSize,
785  DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents),
786  OutEvent, OutEventImpl);
787  else
788  copyD2D(SYCLMemObj, ur::cast<ur_mem_handle_t>(SrcMem),
789  std::move(SrcQueue), DimSrc, SrcSize, SrcAccessRange, SrcOffset,
790  SrcElemSize, ur::cast<ur_mem_handle_t>(DstMem),
791  std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset,
792  DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl);
793  }
794 }
795 
796 void MemoryManager::fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue,
797  size_t PatternSize, const unsigned char *Pattern,
798  unsigned int Dim, sycl::range<3> MemRange,
799  sycl::range<3> AccRange, sycl::id<3> Offset,
800  unsigned int ElementSize,
801  std::vector<ur_event_handle_t> DepEvents,
802  ur_event_handle_t &OutEvent,
803  const detail::EventImplPtr &OutEventImpl) {
804  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
805  assert(Queue && "Fill should be called only with a valid device queue");
806 
807  const PluginPtr &Plugin = Queue->getPlugin();
808 
809  if (SYCLMemObj->getType() == detail::SYCLMemObjI::MemObjType::Buffer) {
810  if (OutEventImpl != nullptr)
811  OutEventImpl->setHostEnqueueTime();
812 
813  // 2D and 3D buffer accessors can't have a custom range, or the data will
814  // likely be discontiguous.
815  bool RangesUsable = (Dim <= 1) || (MemRange == AccRange);
816  // For 2D and 3D buffers, the offset must be 0, or the data will be
817  // discontiguous.
818  bool OffsetUsable = (Dim <= 1) || (Offset == sycl::id<3>{0, 0, 0});
819  size_t RangeMultiplier = AccRange[0] * AccRange[1] * AccRange[2];
820 
821  if (RangesUsable && OffsetUsable) {
822  Plugin->call(urEnqueueMemBufferFill, Queue->getHandleRef(),
823  ur::cast<ur_mem_handle_t>(Mem), Pattern, PatternSize,
824  Offset[0] * ElementSize, RangeMultiplier * ElementSize,
825  DepEvents.size(), DepEvents.data(), &OutEvent);
826  return;
827  }
828  // The sycl::handler uses a parallel_for kernel in the case of unusable
829  // Range or Offset, not CG:Fill. So we should not be here.
830  throw exception(make_error_code(errc::runtime),
831  "Not supported configuration of fill requested");
832  } else {
833  if (OutEventImpl != nullptr)
834  OutEventImpl->setHostEnqueueTime();
835  // We don't have any backend implementations that support enqueueing a fill
836  // on non-buffer mem objects like this. The old UR function was a stub with
837  // an abort.
838  throw exception(make_error_code(errc::runtime),
839  "Fill operation not supported for the given mem object");
840  }
841 }
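
The buffer fill path only forwards to urEnqueueMemBufferFill when the accessed region is one contiguous span: for 2D/3D accessors that means the access range equals the full buffer range and the offset is zero. A standalone check of that predicate (same rule as the RangesUsable/OffsetUsable test above, written element-wise):

    #include <cassert>
    #include <cstddef>

    // Returns true when a fill over AccRange at Offset touches one contiguous
    // span of a buffer with MemRange elements.
    static bool isContiguousFill(unsigned Dim, const std::size_t MemRange[3],
                                 const std::size_t AccRange[3],
                                 const std::size_t Offset[3]) {
      bool RangesUsable = (Dim <= 1) || (MemRange[0] == AccRange[0] &&
                                         MemRange[1] == AccRange[1] &&
                                         MemRange[2] == AccRange[2]);
      bool OffsetUsable =
          (Dim <= 1) || (Offset[0] == 0 && Offset[1] == 0 && Offset[2] == 0);
      return RangesUsable && OffsetUsable;
    }

    int main() {
      const std::size_t Mem[3] = {64, 64, 1};
      const std::size_t Full[3] = {64, 64, 1};
      const std::size_t Part[3] = {64, 32, 1};
      const std::size_t Zero[3] = {0, 0, 0};
      assert(isContiguousFill(2, Mem, Full, Zero));  // whole 2D buffer: one span
      assert(!isContiguousFill(2, Mem, Part, Zero)); // sub-range: discontiguous
      assert(isContiguousFill(1, Mem, Part, Zero));  // 1D sub-range is always fine
      return 0;
    }
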
842 
843 void *MemoryManager::map(SYCLMemObjI *, void *Mem, QueueImplPtr Queue,
844  access::mode AccessMode, unsigned int, sycl::range<3>,
845  sycl::range<3> AccessRange, sycl::id<3> AccessOffset,
846  unsigned int ElementSize,
847  std::vector<ur_event_handle_t> DepEvents,
848  ur_event_handle_t &OutEvent) {
849  if (!Queue) {
850  throw exception(make_error_code(errc::runtime),
851  "Not supported configuration of map requested");
852  }
853 
854  ur_map_flags_t Flags = 0;
855 
856  switch (AccessMode) {
857  case access::mode::read:
858  Flags |= UR_MAP_FLAG_READ;
859  break;
860  case access::mode::write:
861  Flags |= UR_MAP_FLAG_WRITE;
862  break;
863  case access::mode::read_write:
864  case access::mode::atomic:
865  Flags = UR_MAP_FLAG_WRITE | UR_MAP_FLAG_READ;
866  break;
867  case access::mode::discard_write:
868  case access::mode::discard_read_write:
869  Flags |= UR_MAP_FLAG_WRITE_INVALIDATE_REGION;
870  break;
871  }
872 
873  AccessOffset[0] *= ElementSize;
874  AccessRange[0] *= ElementSize;
875 
876  // TODO: Handle offset
877  assert(AccessOffset[0] == 0 && "Handle offset");
878 
879  void *MappedPtr = nullptr;
880  const size_t BytesToMap = AccessRange[0] * AccessRange[1] * AccessRange[2];
881  const PluginPtr &Plugin = Queue->getPlugin();
882  memBufferMapHelper(Plugin, Queue->getHandleRef(),
883  ur::cast<ur_mem_handle_t>(Mem), false, Flags,
884  AccessOffset[0], BytesToMap, DepEvents.size(),
885  DepEvents.data(), &OutEvent, &MappedPtr);
886  return MappedPtr;
887 }
888 
889 void MemoryManager::unmap(SYCLMemObjI *, void *Mem, QueueImplPtr Queue,
890  void *MappedPtr,
891  std::vector<ur_event_handle_t> DepEvents,
892  ur_event_handle_t &OutEvent) {
893 
894  // Execution on host is not supported here.
895  if (!Queue) {
896  throw exception(make_error_code(errc::runtime),
897  "Not supported configuration of unmap requested");
898  }
899  // All DepEvents belong to the same Context.
900  // Use the plugin of the Queue.
901 
902  const PluginPtr &Plugin = Queue->getPlugin();
903  memUnmapHelper(Plugin, Queue->getHandleRef(), ur::cast<ur_mem_handle_t>(Mem),
904  MappedPtr, DepEvents.size(), DepEvents.data(), &OutEvent);
905 }
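
In user code, map/unmap are typically driven by a host accessor: constructing it hands the host a view of the buffer for the requested access mode, and destroying it releases that view. A minimal usage sketch with the standard SYCL API follows; whether the runtime maps the buffer in place or stages a copy depends on the backend and how the buffer was created, so the mapping behavior noted in the comments is an assumption about the common UR-backed case.

    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue Q;
      sycl::buffer<int, 1> Buf{sycl::range<1>{16}};

      // Device write: makes the device copy the most recent one.
      Q.submit([&](sycl::handler &CGH) {
        sycl::accessor Acc{Buf, CGH, sycl::write_only};
        CGH.parallel_for(sycl::range<1>{16},
                         [=](sycl::id<1> I) { Acc[I] = static_cast<int>(I[0]); });
      });

      {
        // Read-only host access: on UR-backed devices this commonly reaches
        // MemoryManager::map with UR_MAP_FLAG_READ after the device work.
        sycl::host_accessor HostAcc{Buf, sycl::read_only};
        int First = HostAcc[0];
        (void)First;
      } // Destroying the host accessor releases the mapping (unmap).

      return 0;
    }
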
906 
907 void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue,
908  size_t Len, void *DstMem,
909  std::vector<ur_event_handle_t> DepEvents,
910  ur_event_handle_t *OutEvent,
911  const detail::EventImplPtr &OutEventImpl) {
912  assert(SrcQueue && "USM copy must be called with a valid device queue");
913  if (!Len) { // no-op, but ensure DepEvents will still be waited on
914  if (!DepEvents.empty()) {
915  if (OutEventImpl != nullptr)
916  OutEventImpl->setHostEnqueueTime();
917  SrcQueue->getPlugin()->call(urEnqueueEventsWait, SrcQueue->getHandleRef(),
918  DepEvents.size(), DepEvents.data(), OutEvent);
919  }
920  return;
921  }
922 
923  if (!SrcMem || !DstMem)
924  throw exception(make_error_code(errc::invalid),
925  "NULL pointer argument in memory copy operation.");
926 
927  const PluginPtr &Plugin = SrcQueue->getPlugin();
928  if (OutEventImpl != nullptr)
929  OutEventImpl->setHostEnqueueTime();
930  Plugin->call(urEnqueueUSMMemcpy, SrcQueue->getHandleRef(),
931  /* blocking */ false, DstMem, SrcMem, Len, DepEvents.size(),
932  DepEvents.data(), OutEvent);
933 }
934 
935 void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length,
936  const std::vector<unsigned char> &Pattern,
937  std::vector<ur_event_handle_t> DepEvents,
938  ur_event_handle_t *OutEvent,
939  const detail::EventImplPtr &OutEventImpl) {
940  assert(Queue && "USM fill must be called with a valid device queue");
941  if (!Length) { // no-op, but ensure DepEvents will still be waited on
942  if (!DepEvents.empty()) {
943  if (OutEventImpl != nullptr)
944  OutEventImpl->setHostEnqueueTime();
945  Queue->getPlugin()->call(urEnqueueEventsWait, Queue->getHandleRef(),
946  DepEvents.size(), DepEvents.data(), OutEvent);
947  }
948  return;
949  }
950 
951  if (!Mem)
952  throw exception(make_error_code(errc::invalid),
953  "NULL pointer argument in memory fill operation.");
954  if (OutEventImpl != nullptr)
955  OutEventImpl->setHostEnqueueTime();
956  const PluginPtr &Plugin = Queue->getPlugin();
957  Plugin->call(urEnqueueUSMFill, Queue->getHandleRef(), Mem, Pattern.size(),
958  Pattern.data(), Length, DepEvents.size(), DepEvents.data(),
959  OutEvent);
960 }
961 
962 void MemoryManager::prefetch_usm(void *Mem, QueueImplPtr Queue, size_t Length,
963  std::vector<ur_event_handle_t> DepEvents,
964  ur_event_handle_t *OutEvent,
965  const detail::EventImplPtr &OutEventImpl) {
966  assert(Queue && "USM prefetch must be called with a valid device queue");
967  const PluginPtr &Plugin = Queue->getPlugin();
968  if (OutEventImpl != nullptr)
969  OutEventImpl->setHostEnqueueTime();
970  Plugin->call(urEnqueueUSMPrefetch, Queue->getHandleRef(), Mem, Length, 0,
971  DepEvents.size(), DepEvents.data(), OutEvent);
972 }
973 
974 void MemoryManager::advise_usm(const void *Mem, QueueImplPtr Queue,
975  size_t Length, ur_usm_advice_flags_t Advice,
976  std::vector<ur_event_handle_t> /*DepEvents*/,
977  ur_event_handle_t *OutEvent,
978  const detail::EventImplPtr &OutEventImpl) {
979  assert(Queue && "USM advise must be called with a valid device queue");
980  const PluginPtr &Plugin = Queue->getPlugin();
981  if (OutEventImpl != nullptr)
982  OutEventImpl->setHostEnqueueTime();
983  Plugin->call(urEnqueueUSMAdvise, Queue->getHandleRef(), Mem, Length, Advice,
984  OutEvent);
985 }
986 
987 void MemoryManager::copy_2d_usm(const void *SrcMem, size_t SrcPitch,
988  QueueImplPtr Queue, void *DstMem,
989  size_t DstPitch, size_t Width, size_t Height,
990  std::vector<ur_event_handle_t> DepEvents,
991  ur_event_handle_t *OutEvent,
992  const detail::EventImplPtr &OutEventImpl) {
993  assert(Queue && "USM copy 2d must be called with a valid device queue");
994  if (Width == 0 || Height == 0) {
995  // no-op, but ensure DepEvents will still be waited on
996  if (!DepEvents.empty()) {
997  if (OutEventImpl != nullptr)
998  OutEventImpl->setHostEnqueueTime();
999  Queue->getPlugin()->call(urEnqueueEventsWait, Queue->getHandleRef(),
1000  DepEvents.size(), DepEvents.data(), OutEvent);
1001  }
1002  return;
1003  }
1004 
1005  if (!DstMem || !SrcMem)
1006  throw sycl::exception(sycl::make_error_code(errc::invalid),
1007  "NULL pointer argument in 2D memory copy operation.");
1008 
1009  const PluginPtr &Plugin = Queue->getPlugin();
1010 
1011  bool SupportsUSMMemcpy2D = false;
1012  Plugin->call(urContextGetInfo, Queue->getContextImplPtr()->getHandleRef(),
1013  UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT, sizeof(bool),
1014  &SupportsUSMMemcpy2D, nullptr);
1015 
1016  if (SupportsUSMMemcpy2D) {
1017  if (OutEventImpl != nullptr)
1018  OutEventImpl->setHostEnqueueTime();
1019  // Direct memcpy2D is supported so we use this function.
1020  Plugin->call(urEnqueueUSMMemcpy2D, Queue->getHandleRef(),
1021  /*blocking=*/false, DstMem, DstPitch, SrcMem, SrcPitch, Width,
1022  Height, DepEvents.size(), DepEvents.data(), OutEvent);
1023  return;
1024  }
1025 
1026  // Otherwise we allow the special case where the copy is to or from host.
1027 #ifndef NDEBUG
1028  context Ctx = createSyclObjFromImpl<context>(Queue->getContextImplPtr());
1029  usm::alloc SrcAllocType = get_pointer_type(SrcMem, Ctx);
1030  usm::alloc DstAllocType = get_pointer_type(DstMem, Ctx);
1031  bool SrcIsHost =
1032  SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
1033  bool DstIsHost =
1034  DstAllocType == usm::alloc::unknown || DstAllocType == usm::alloc::host;
1035  assert((SrcIsHost || DstIsHost) && "In fallback path for copy_2d_usm either "
1036  "source or destination must be on host.");
1037 #endif // NDEBUG
1038 
1039  // The fallback in this case is to insert a copy per row.
1040  std::vector<OwnedUrEvent> CopyEventsManaged;
1041  CopyEventsManaged.reserve(Height);
1042  // We'll need a contiguous range of events for a wait later as well.
1043  std::vector<ur_event_handle_t> CopyEvents(Height);
1044  if (OutEventImpl != nullptr)
1045  OutEventImpl->setHostEnqueueTime();
1046 
1047  for (size_t I = 0; I < Height; ++I) {
1048  char *DstItBegin = static_cast<char *>(DstMem) + I * DstPitch;
1049  const char *SrcItBegin = static_cast<const char *>(SrcMem) + I * SrcPitch;
1050  Plugin->call(urEnqueueUSMMemcpy, Queue->getHandleRef(),
1051  /* blocking */ false, DstItBegin, SrcItBegin, Width,
1052  DepEvents.size(), DepEvents.data(), CopyEvents.data() + I);
1053  CopyEventsManaged.emplace_back(CopyEvents[I], Plugin,
1054  /*TakeOwnership=*/true);
1055  }
1056  if (OutEventImpl != nullptr)
1057  OutEventImpl->setHostEnqueueTime();
1058  // Then insert a wait to coalesce the copy events.
1059  Queue->getPlugin()->call(urEnqueueEventsWait, Queue->getHandleRef(),
1060  CopyEvents.size(), CopyEvents.data(), OutEvent);
1061 }
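
When the backend reports no native 2D memcpy support, the fallback above issues one 1D copy per row, addressing row I at Src + I * SrcPitch and Dst + I * DstPitch. The addressing is the same as in this plain-memory sketch with example pitches:

    #include <cassert>
    #include <cstring>
    #include <vector>

    int main() {
      const std::size_t Width = 8, Height = 4;        // bytes per row, row count
      const std::size_t SrcPitch = 16, DstPitch = 10; // allocation strides >= Width

      std::vector<char> Src(SrcPitch * Height, 'x');
      std::vector<char> Dst(DstPitch * Height, 0);

      // One contiguous copy of Width bytes per row, stepping by the pitches.
      for (std::size_t I = 0; I < Height; ++I)
        std::memcpy(Dst.data() + I * DstPitch, Src.data() + I * SrcPitch, Width);

      assert(Dst[0] == 'x');        // start of row 0 was copied
      assert(Dst[DstPitch] == 'x'); // start of row 1 was copied
      assert(Dst[Width] == 0);      // padding between rows stays untouched
      return 0;
    }
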
1062 
1063 void MemoryManager::fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch,
1064  size_t Width, size_t Height,
1065  const std::vector<unsigned char> &Pattern,
1066  std::vector<ur_event_handle_t> DepEvents,
1067  ur_event_handle_t *OutEvent,
1068  const detail::EventImplPtr &OutEventImpl) {
1069  assert(Queue && "USM fill 2d must be called with a valid device queue");
1070  if (Width == 0 || Height == 0) {
1071  // no-op, but ensure DepEvents will still be waited on
1072  if (!DepEvents.empty()) {
1073  if (OutEventImpl != nullptr)
1074  OutEventImpl->setHostEnqueueTime();
1075  Queue->getPlugin()->call(urEnqueueEventsWait, Queue->getHandleRef(),
1076  DepEvents.size(), DepEvents.data(), OutEvent);
1077  }
1078  return;
1079  }
1080 
1081  if (!DstMem)
1082  throw sycl::exception(sycl::make_error_code(errc::invalid),
1083  "NULL pointer argument in 2D memory fill operation.");
1084  if (OutEventImpl != nullptr)
1085  OutEventImpl->setHostEnqueueTime();
1086  const PluginPtr &Plugin = Queue->getPlugin();
1087  Plugin->call(urEnqueueUSMFill2D, Queue->getHandleRef(), DstMem, Pitch,
1088  Pattern.size(), Pattern.data(), Width, Height, DepEvents.size(),
1089  DepEvents.data(), OutEvent);
1090 }
1091 
1092 void MemoryManager::memset_2d_usm(void *DstMem, QueueImplPtr Queue,
1093  size_t Pitch, size_t Width, size_t Height,
1094  char Value,
1095  std::vector<ur_event_handle_t> DepEvents,
1096  ur_event_handle_t *OutEvent,
1097  const detail::EventImplPtr &OutEventImpl) {
1098  assert(Queue && "USM memset 2d must be called with a valid device queue");
1099  if (Width == 0 || Height == 0) {
1100  // no-op, but ensure DepEvents will still be waited on
1101  if (!DepEvents.empty()) {
1102  if (OutEventImpl != nullptr)
1103  OutEventImpl->setHostEnqueueTime();
1104  Queue->getPlugin()->call(urEnqueueEventsWait, Queue->getHandleRef(),
1105  DepEvents.size(), DepEvents.data(), OutEvent);
1106  }
1107  return;
1108  }
1109 
1110  if (!DstMem)
1111  throw sycl::exception(
1112  sycl::make_error_code(errc::invalid),
1113  "NULL pointer argument in 2D memory memset operation.");
1114  if (OutEventImpl != nullptr)
1115  OutEventImpl->setHostEnqueueTime();
1116  MemoryManager::fill_2d_usm(DstMem, Queue, Pitch, Width, Height,
1117  {static_cast<unsigned char>(Value)}, DepEvents,
1118  OutEvent, nullptr);
1119 }
1120 
1121 static void memcpyToDeviceGlobalUSM(
1122  QueueImplPtr Queue, DeviceGlobalMapEntry *DeviceGlobalEntry,
1123  size_t NumBytes, size_t Offset, const void *Src,
1124  const std::vector<ur_event_handle_t> &DepEvents,
1125  ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl) {
1126  assert(Queue &&
1127  "Copy to device global USM must be called with a valid device queue");
1128  // Get or allocate USM memory for the device_global.
1129  DeviceGlobalUSMMem &DeviceGlobalUSM =
1130  DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue);
1131  void *Dest = DeviceGlobalUSM.getPtr();
1132 
1133  // OwnedUrEvent will keep the initialization event alive for the duration
1134  // of this function call.
1135  OwnedUrEvent ZIEvent = DeviceGlobalUSM.getInitEvent(Queue->getPlugin());
1136 
1137  // We may need additional events, so create a non-const dependency event
1138  // list to use if we need to modify it.
1139  std::vector<ur_event_handle_t> AuxDepEventsStorage;
1140  const std::vector<ur_event_handle_t> &ActualDepEvents =
1141  ZIEvent ? AuxDepEventsStorage : DepEvents;
1142 
1143  // If there is a zero-initializer event the memory operation should wait for
1144  // it.
1145  if (ZIEvent) {
1146  AuxDepEventsStorage = DepEvents;
1147  AuxDepEventsStorage.push_back(ZIEvent.GetEvent());
1148  }
1149 
1150  MemoryManager::copy_usm(Src, Queue, NumBytes,
1151  reinterpret_cast<char *>(Dest) + Offset,
1152  ActualDepEvents, OutEvent, OutEventImpl);
1153 }
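
Both device_global USM copies use the same pattern: if the allocation has an outstanding zero-initialization event, the caller's dependency list is copied and that event is appended; otherwise the original list is used untouched. A type-agnostic sketch of that pattern (plain C++; Event here is a hypothetical stand-in for ur_event_handle_t):

    #include <cassert>
    #include <optional>
    #include <vector>

    using Event = int; // stand-in for ur_event_handle_t

    // Returns the list the enqueue should actually wait on, extending it with
    // InitEvent only when one exists (mirrors the AuxDepEventsStorage pattern).
    static const std::vector<Event> &
    withInitEvent(const std::vector<Event> &Deps, std::optional<Event> InitEvent,
                  std::vector<Event> &Storage) {
      if (!InitEvent)
        return Deps;  // fast path: no copy, no extra dependency
      Storage = Deps; // copy, then append the initialization event
      Storage.push_back(*InitEvent);
      return Storage;
    }

    int main() {
      std::vector<Event> Deps{1, 2};
      std::vector<Event> Storage;
      const auto &NoExtra = withInitEvent(Deps, std::nullopt, Storage);
      assert(&NoExtra == &Deps);
      const auto &WithExtra = withInitEvent(Deps, Event{7}, Storage);
      assert(WithExtra.size() == 3 && WithExtra.back() == 7);
      return 0;
    }
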
1154 
1155 static void memcpyFromDeviceGlobalUSM(
1156  QueueImplPtr Queue, DeviceGlobalMapEntry *DeviceGlobalEntry,
1157  size_t NumBytes, size_t Offset, void *Dest,
1158  const std::vector<ur_event_handle_t> &DepEvents,
1159  ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl) {
1160  assert(Queue && "Copying from device global USM must be called with a valid "
1161  "device queue");
1162  // Get or allocate USM memory for the device_global. Since we are reading from
1163  // it, we need it initialized if it has not been already.
1164  DeviceGlobalUSMMem &DeviceGlobalUSM =
1165  DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue);
1166  void *Src = DeviceGlobalUSM.getPtr();
1167 
1168  // OwnedUrEvent will keep the initialization event alive for the duration
1169  // of this function call.
1170  OwnedUrEvent ZIEvent = DeviceGlobalUSM.getInitEvent(Queue->getPlugin());
1171 
1172  // We may need additional events, so create a non-const dependency event
1173  // list to use if we need to modify it.
1174  std::vector<ur_event_handle_t> AuxDepEventsStorage;
1175  const std::vector<ur_event_handle_t> &ActualDepEvents =
1176  ZIEvent ? AuxDepEventsStorage : DepEvents;
1177 
1178  // If there is a zero-initializer event the memory operation should wait for
1179  // it.
1180  if (ZIEvent) {
1181  AuxDepEventsStorage = DepEvents;
1182  AuxDepEventsStorage.push_back(ZIEvent.GetEvent());
1183  }
1184 
1185  MemoryManager::copy_usm(reinterpret_cast<const char *>(Src) + Offset, Queue,
1186  NumBytes, Dest, ActualDepEvents, OutEvent,
1187  OutEventImpl);
1188 }
1189 
1190 static ur_program_handle_t
1191 getOrBuildProgramForDeviceGlobal(QueueImplPtr Queue,
1192  DeviceGlobalMapEntry *DeviceGlobalEntry) {
1193  assert(DeviceGlobalEntry->MIsDeviceImageScopeDecorated &&
1194  "device_global is not device image scope decorated.");
1195 
1196  // If the device global is used in multiple device images we cannot proceed.
1197  if (DeviceGlobalEntry->MImageIdentifiers.size() > 1)
1198  throw sycl::exception(make_error_code(errc::invalid),
1199  "More than one image exists with the device_global.");
1200 
1201  // If there are no kernels using the device_global we cannot proceed.
1202  if (DeviceGlobalEntry->MImageIdentifiers.size() == 0)
1203  throw sycl::exception(make_error_code(errc::invalid),
1204  "No image exists with the device_global.");
1205 
1206  // Look for cached programs with the device_global.
1207  device Device = Queue->get_device();
1208  ContextImplPtr ContextImpl = Queue->getContextImplPtr();
1209  std::optional<ur_program_handle_t> CachedProgram =
1210  ContextImpl->getProgramForDeviceGlobal(Device, DeviceGlobalEntry);
1211  if (CachedProgram)
1212  return *CachedProgram;
1213 
1214  // If there was no cached program, build one.
1215  auto Context = createSyclObjFromImpl<context>(ContextImpl);
1216  ProgramManager &PM = ProgramManager::getInstance();
1217  RTDeviceBinaryImage &Img =
1218  PM.getDeviceImage(DeviceGlobalEntry->MImages, Context, Device);
1219  device_image_plain DeviceImage =
1220  PM.getDeviceImageFromBinaryImage(&Img, Context, Device);
1221  device_image_plain BuiltImage = PM.build(DeviceImage, {Device}, {});
1222  return getSyclObjImpl(BuiltImage)->get_ur_program_ref();
1223 }
1224 
1225 static void
1226 memcpyToDeviceGlobalDirect(QueueImplPtr Queue,
1227  DeviceGlobalMapEntry *DeviceGlobalEntry,
1228  size_t NumBytes, size_t Offset, const void *Src,
1229  const std::vector<ur_event_handle_t> &DepEvents,
1230  ur_event_handle_t *OutEvent) {
1231  assert(
1232  Queue &&
1233  "Direct copy to device global must be called with a valid device queue");
1234  ur_program_handle_t Program =
1235  getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry);
1236  const PluginPtr &Plugin = Queue->getPlugin();
1237  Plugin->call(urEnqueueDeviceGlobalVariableWrite, Queue->getHandleRef(),
1238  Program, DeviceGlobalEntry->MUniqueId.c_str(), false, NumBytes,
1239  Offset, Src, DepEvents.size(), DepEvents.data(), OutEvent);
1240 }
1241 
1242 static void
1243 memcpyFromDeviceGlobalDirect(QueueImplPtr Queue,
1244  DeviceGlobalMapEntry *DeviceGlobalEntry,
1245  size_t NumBytes, size_t Offset, void *Dest,
1246  const std::vector<ur_event_handle_t> &DepEvents,
1247  ur_event_handle_t *OutEvent) {
1248  assert(Queue && "Direct copy from device global must be called with a valid "
1249  "device queue");
1250  ur_program_handle_t Program =
1251  getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry);
1252  const PluginPtr &Plugin = Queue->getPlugin();
1253  Plugin->call(urEnqueueDeviceGlobalVariableRead, Queue->getHandleRef(),
1254  Program, DeviceGlobalEntry->MUniqueId.c_str(), false, NumBytes,
1255  Offset, Dest, DepEvents.size(), DepEvents.data(), OutEvent);
1256 }
1257 
1258 void MemoryManager::copy_to_device_global(
1259  const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue,
1260  size_t NumBytes, size_t Offset, const void *SrcMem,
1261  const std::vector<ur_event_handle_t> &DepEvents,
1262  ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl) {
1263  DeviceGlobalMapEntry *DGEntry =
1264  detail::ProgramManager::getInstance().getDeviceGlobalEntry(
1265  DeviceGlobalPtr);
1266  assert(DGEntry &&
1267  DGEntry->MIsDeviceImageScopeDecorated == IsDeviceImageScoped &&
1268  "Invalid copy operation for device_global.");
1269  assert(DGEntry->MDeviceGlobalTSize >= Offset + NumBytes &&
1270  "Copy to device_global is out of bounds.");
1271 
1272  if (IsDeviceImageScoped)
1273  memcpyToDeviceGlobalDirect(Queue, DGEntry, NumBytes, Offset, SrcMem,
1274  DepEvents, OutEvent);
1275  else
1276  memcpyToDeviceGlobalUSM(Queue, DGEntry, NumBytes, Offset, SrcMem, DepEvents,
1277  OutEvent, OutEventImpl);
1278 }
1279 
1280 void MemoryManager::copy_from_device_global(
1281  const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue,
1282  size_t NumBytes, size_t Offset, void *DstMem,
1283  const std::vector<ur_event_handle_t> &DepEvents,
1284  ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl) {
1285  DeviceGlobalMapEntry *DGEntry =
1286  detail::ProgramManager::getInstance().getDeviceGlobalEntry(
1287  DeviceGlobalPtr);
1288  assert(DGEntry &&
1289  DGEntry->MIsDeviceImageScopeDecorated == IsDeviceImageScoped &&
1290  "Invalid copy operation for device_global.");
1291  assert(DGEntry->MDeviceGlobalTSize >= Offset + NumBytes &&
1292  "Copy from device_global is out of bounds.");
1293 
1294  if (IsDeviceImageScoped)
1295  memcpyFromDeviceGlobalDirect(Queue, DGEntry, NumBytes, Offset, DstMem,
1296  DepEvents, OutEvent);
1297  else
1298  memcpyFromDeviceGlobalUSM(Queue, DGEntry, NumBytes, Offset, DstMem,
1299  DepEvents, OutEvent, OutEventImpl);
1300 }
1301 
1302 // Command buffer methods
1303 void MemoryManager::ext_oneapi_copyD2D_cmd_buffer(
1304  sycl::detail::ContextImplPtr Context,
1305  ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj,
1306  void *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize,
1307  sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
1308  unsigned int SrcElemSize, void *DstMem, unsigned int DimDst,
1309  sycl::range<3> DstSize, sycl::range<3> DstAccessRange,
1310  sycl::id<3> DstOffset, unsigned int DstElemSize,
1311  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1312  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1313  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
1314  (void)DstAccessRange;
1315 
1316  const PluginPtr &Plugin = Context->getPlugin();
1317 
1318  detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
1319  TermPositions SrcPos, DstPos;
1320  prepTermPositions(SrcPos, DimSrc, MemType);
1321  prepTermPositions(DstPos, DimDst, MemType);
1322 
1323  size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
1324  size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
1325  size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize;
1326  size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
1327  size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;
1328 
1329  if (MemType != detail::SYCLMemObjI::MemObjType::Buffer) {
1330  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
1331  "Images are not supported in Graphs");
1332  }
1333 
1334  if (1 == DimDst && 1 == DimSrc) {
1335  Plugin->call(urCommandBufferAppendMemBufferCopyExp, CommandBuffer,
1336  sycl::detail::ur::cast<ur_mem_handle_t>(SrcMem),
1337  sycl::detail::ur::cast<ur_mem_handle_t>(DstMem), SrcXOffBytes,
1338  DstXOffBytes, SrcAccessRangeWidthBytes, Deps.size(),
1339  Deps.data(), OutSyncPoint);
1340  } else {
1341  // Passing 0 for the pitches is not allowed, because clEnqueueCopyBufferRect
1342  // will calculate both the src and dest pitch using region[0], which is not
1343  // correct if src and dest are not the same size.
1344  size_t SrcRowPitch = SrcSzWidthBytes;
1345  size_t SrcSlicePitch = (DimSrc <= 1)
1346  ? SrcSzWidthBytes
1347  : SrcSzWidthBytes * SrcSize[SrcPos.YTerm];
1348  size_t DstRowPitch = DstSzWidthBytes;
1349  size_t DstSlicePitch = (DimDst <= 1)
1350  ? DstSzWidthBytes
1351  : DstSzWidthBytes * DstSize[DstPos.YTerm];
1352 
1353  ur_rect_offset_t SrcOrigin{SrcXOffBytes, SrcOffset[SrcPos.YTerm],
1354  SrcOffset[SrcPos.ZTerm]};
1355  ur_rect_offset_t DstOrigin{DstXOffBytes, DstOffset[DstPos.YTerm],
1356  DstOffset[DstPos.ZTerm]};
1357  ur_rect_region_t Region{SrcAccessRangeWidthBytes,
1358  SrcAccessRange[SrcPos.YTerm],
1359  SrcAccessRange[SrcPos.ZTerm]};
1360 
1361  Plugin->call(urCommandBufferAppendMemBufferCopyRectExp, CommandBuffer,
1362  sycl::detail::ur::cast<ur_mem_handle_t>(SrcMem),
1363  sycl::detail::ur::cast<ur_mem_handle_t>(DstMem), SrcOrigin,
1364  DstOrigin, Region, SrcRowPitch, SrcSlicePitch, DstRowPitch,
1365  DstSlicePitch, Deps.size(), Deps.data(), OutSyncPoint);
1366  }
1367 }
1368 
1369 void MemoryManager::ext_oneapi_copyD2H_cmd_buffer(
1370  sycl::detail::ContextImplPtr Context,
1371  ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj,
1372  void *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize,
1373  sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
1374  unsigned int SrcElemSize, char *DstMem, unsigned int DimDst,
1375  sycl::range<3> DstSize, sycl::id<3> DstOffset, unsigned int DstElemSize,
1376  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1377  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1378  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
1379 
1380  const PluginPtr &Plugin = Context->getPlugin();
1381 
1382  detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
1383  TermPositions SrcPos, DstPos;
1384  prepTermPositions(SrcPos, DimSrc, MemType);
1385  prepTermPositions(DstPos, DimDst, MemType);
1386 
1387  size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
1388  size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
1389  size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize;
1390  size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
1391  size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;
1392 
1393  if (MemType != detail::SYCLMemObjI::MemObjType::Buffer) {
1394  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
1395  "Images are not supported in Graphs");
1396  }
1397 
1398  if (1 == DimDst && 1 == DimSrc) {
1399  ur_result_t Result = Plugin->call_nocheck(
1400  urCommandBufferAppendMemBufferReadExp, CommandBuffer,
1401  sycl::detail::ur::cast<ur_mem_handle_t>(SrcMem), SrcXOffBytes,
1402  SrcAccessRangeWidthBytes, DstMem + DstXOffBytes, Deps.size(),
1403  Deps.data(), OutSyncPoint);
1404 
1405  if (Result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
1406  throw sycl::exception(
1407  sycl::make_error_code(sycl::errc::feature_not_supported),
1408  "Device-to-host buffer copy command not supported by graph backend");
1409  } else {
1410  Plugin->checkUrResult(Result);
1411  }
1412  } else {
1413  size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
1414  size_t BufferSlicePitch =
1415  (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;
1416  size_t HostRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
1417  size_t HostSlicePitch =
1418  (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;
1419 
1420  ur_rect_offset_t BufferOffset{SrcXOffBytes, SrcOffset[SrcPos.YTerm],
1421  SrcOffset[SrcPos.ZTerm]};
1422  ur_rect_offset_t HostOffset{DstXOffBytes, DstOffset[DstPos.YTerm],
1423  DstOffset[DstPos.ZTerm]};
1424  ur_rect_region_t RectRegion{SrcAccessRangeWidthBytes,
1425  SrcAccessRange[SrcPos.YTerm],
1426  SrcAccessRange[SrcPos.ZTerm]};
1427 
1428  ur_result_t Result = Plugin->call_nocheck(
1429  urCommandBufferAppendMemBufferReadRectExp, CommandBuffer,
1430  sycl::detail::ur::cast<ur_mem_handle_t>(SrcMem), BufferOffset,
1431  HostOffset, RectRegion, BufferRowPitch, BufferSlicePitch, HostRowPitch,
1432  HostSlicePitch, DstMem, Deps.size(), Deps.data(), OutSyncPoint);
1433  if (Result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
1434  throw sycl::exception(
1435  sycl::make_error_code(sycl::errc::feature_not_supported),
1436  "Device-to-host buffer copy command not supported by graph backend");
1437  } else {
1438  Plugin->checkUrResult(Result);
1439  }
1440  }
1441 }
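
Both branches above (and the host-to-device path that follows) share one error-handling pattern: the append is issued with call_nocheck, UR_RESULT_ERROR_UNSUPPORTED_FEATURE is remapped to errc::feature_not_supported, and every other failure goes through the normal result check. A hedged sketch of that mapping as a free function; checkGraphAppendResult and the generic failure message are illustrative names, not symbols from this file:

    #include <sycl/sycl.hpp>
    #include <ur_api.h>

    // Map a command-buffer append result to the SYCL error model used above.
    inline void checkGraphAppendResult(ur_result_t AppendResult,
                                       const char *NotSupportedMessage) {
      if (AppendResult == UR_RESULT_ERROR_UNSUPPORTED_FEATURE)
        throw sycl::exception(
            sycl::make_error_code(sycl::errc::feature_not_supported),
            NotSupportedMessage);
      if (AppendResult != UR_RESULT_SUCCESS)
        throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
                              "Unified Runtime command-buffer append failed");
    }
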
1442 
1443 void MemoryManager::ext_oneapi_copyH2D_cmd_buffer(
 1444  sycl::detail::ContextImplPtr Context,
 1445  ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj,
1446  char *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize,
1447  sycl::id<3> SrcOffset, unsigned int SrcElemSize, void *DstMem,
1448  unsigned int DimDst, sycl::range<3> DstSize, sycl::range<3> DstAccessRange,
1449  sycl::id<3> DstOffset, unsigned int DstElemSize,
1450  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1451  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1452  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
1453 
1454  const PluginPtr &Plugin = Context->getPlugin();
1455 
1456  detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
1457  TermPositions SrcPos, DstPos;
1458  prepTermPositions(SrcPos, DimSrc, MemType);
1459  prepTermPositions(DstPos, DimDst, MemType);
1460 
1461  size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
1462  size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
1463  size_t DstAccessRangeWidthBytes = DstAccessRange[DstPos.XTerm] * DstElemSize;
1464  size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
1465  size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;
1466 
1467  if (MemType != detail::SYCLMemObjI::MemObjType::Buffer) {
1468  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
1469  "Images are not supported in Graphs");
1470  }
1471 
1472  if (1 == DimDst && 1 == DimSrc) {
1473  ur_result_t Result = Plugin->call_nocheck(
1474  urCommandBufferAppendMemBufferWriteExp, CommandBuffer,
1475  sycl::detail::ur::cast<ur_mem_handle_t>(DstMem), DstXOffBytes,
1476  DstAccessRangeWidthBytes, SrcMem + SrcXOffBytes, Deps.size(),
1477  Deps.data(), OutSyncPoint);
1478 
1479  if (Result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
1480  throw sycl::exception(
1481  sycl::make_error_code(sycl::errc::feature_not_supported),
1482  "Host-to-device buffer copy command not supported by graph backend");
1483  } else {
1484  Plugin->checkUrResult(Result);
1485  }
1486  } else {
1487  size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
1488  size_t BufferSlicePitch =
1489  (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;
1490  size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
1491  size_t HostSlicePitch =
1492  (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;
1493 
1494  ur_rect_offset_t BufferOffset{DstXOffBytes, DstOffset[DstPos.YTerm],
1495  DstOffset[DstPos.ZTerm]};
1496  ur_rect_offset_t HostOffset{SrcXOffBytes, SrcOffset[SrcPos.YTerm],
1497  SrcOffset[SrcPos.ZTerm]};
1498  ur_rect_region_t RectRegion{DstAccessRangeWidthBytes,
1499  DstAccessRange[DstPos.YTerm],
1500  DstAccessRange[DstPos.ZTerm]};
1501 
1502  ur_result_t Result = Plugin->call_nocheck(
1503  urCommandBufferAppendMemBufferWriteRectExp, CommandBuffer,
1504  sycl::detail::ur::cast<ur_mem_handle_t>(DstMem), BufferOffset,
1505  HostOffset, RectRegion, BufferRowPitch, BufferSlicePitch, HostRowPitch,
1506  HostSlicePitch, SrcMem, Deps.size(), Deps.data(), OutSyncPoint);
1507 
1508  if (Result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
1509  throw sycl::exception(
1510  sycl::make_error_code(sycl::errc::feature_not_supported),
1511  "Host-to-device buffer copy command not supported by graph backend");
1512  } else {
1513  Plugin->checkUrResult(Result);
1514  }
1515  }
1516 }
1517 
1518 void MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
1519  ContextImplPtr Context, const void *SrcMem,
1520  ur_exp_command_buffer_handle_t CommandBuffer, size_t Len, void *DstMem,
1521  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1522  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1523  if (!SrcMem || !DstMem)
1524  throw exception(make_error_code(errc::invalid),
1525  "NULL pointer argument in memory copy operation.");
1526 
1527  const PluginPtr &Plugin = Context->getPlugin();
1528  ur_result_t Result = Plugin->call_nocheck(
1529  urCommandBufferAppendUSMMemcpyExp, CommandBuffer, DstMem, SrcMem, Len,
1530  Deps.size(), Deps.data(), OutSyncPoint);
1531  if (Result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
1532  throw sycl::exception(
1533  sycl::make_error_code(sycl::errc::feature_not_supported),
1534  "USM copy command not supported by graph backend");
1535  } else {
1536  Plugin->checkUrResult(Result);
1537  }
1538 }
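
For context, the usual way a USM memcpy ends up in a command buffer is through the sycl_ext_oneapi_graph extension: each command group added to a modifiable graph becomes a node, and finalization lowers memcpy nodes to appends like the one above. A hedged user-level sketch, assuming the extension API as described in its specification and a backend with graph support:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    namespace sycl_exp = sycl::ext::oneapi::experimental;

    void record_and_run_memcpy(sycl::queue &Q, void *Dst, const void *Src,
                               std::size_t Bytes) {
      // Build a modifiable graph on the queue's context and device.
      sycl_exp::command_graph Graph{Q.get_context(), Q.get_device()};

      // The memcpy command group becomes a graph node.
      Graph.add([&](sycl::handler &CGH) { CGH.memcpy(Dst, Src, Bytes); });

      // Finalization produces an executable graph backed by a command buffer.
      auto Exec = Graph.finalize();
      Q.ext_oneapi_graph(Exec).wait();
    }
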
1539 
1540 void MemoryManager::ext_oneapi_fill_usm_cmd_buffer(
 1541  sycl::detail::ContextImplPtr Context,
 1542  ur_exp_command_buffer_handle_t CommandBuffer, void *DstMem, size_t Len,
1543  const std::vector<unsigned char> &Pattern,
1544  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1545  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1546 
1547  if (!DstMem)
1548  throw exception(make_error_code(errc::invalid),
1549  "NULL pointer argument in memory fill operation.");
1550 
1551  const PluginPtr &Plugin = Context->getPlugin();
1552  Plugin->call(urCommandBufferAppendUSMFillExp, CommandBuffer, DstMem,
1553  Pattern.data(), Pattern.size(), Len, Deps.size(), Deps.data(),
1554  OutSyncPoint);
1555 }
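
Here Pattern carries the byte image of the fill value and Len is the total number of bytes to write, so a typed fill of Count elements would typically lower to a pattern of sizeof(T) bytes with Len = Count * sizeof(T). A small sketch of that packing, with makeFillPattern as an illustrative helper rather than a runtime symbol:

    #include <cstddef>
    #include <cstring>
    #include <utility>
    #include <vector>

    // Pack a typed fill value into the (pattern bytes, byte length) form used
    // by the USM fill append above.
    template <typename T>
    std::pair<std::vector<unsigned char>, std::size_t>
    makeFillPattern(const T &Value, std::size_t Count) {
      std::vector<unsigned char> Pattern(sizeof(T));
      std::memcpy(Pattern.data(), &Value, sizeof(T));
      return {Pattern, Count * sizeof(T)};
    }
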
1556 
1557 void MemoryManager::ext_oneapi_fill_cmd_buffer(
 1558  sycl::detail::ContextImplPtr Context,
 1559  ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj,
1560  void *Mem, size_t PatternSize, const unsigned char *Pattern,
1561  unsigned int Dim, sycl::range<3> Size, sycl::range<3> AccessRange,
1562  sycl::id<3> AccessOffset, unsigned int ElementSize,
1563  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1564  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1565  assert(SYCLMemObj && "The SYCLMemObj is nullptr");
1566 
1567  const PluginPtr &Plugin = Context->getPlugin();
1568  if (SYCLMemObj->getType() != detail::SYCLMemObjI::MemObjType::Buffer) {
1569  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
1570  "Images are not supported in Graphs");
1571  }
1572 
 1573  // Accessors to 2D and 3D buffers can't have a custom range, or the data
 1574  // will likely be discontiguous.
1575  bool RangesUsable = (Dim <= 1) || (Size == AccessRange);
1576  // For 2D and 3D buffers, the offset must be 0, or the data will be
1577  // discontiguous.
1578  bool OffsetUsable = (Dim <= 1) || (AccessOffset == sycl::id<3>{0, 0, 0});
1579  size_t RangeMultiplier = AccessRange[0] * AccessRange[1] * AccessRange[2];
1580 
1581  if (RangesUsable && OffsetUsable) {
1582  Plugin->call(urCommandBufferAppendMemBufferFillExp, CommandBuffer,
1583  ur::cast<ur_mem_handle_t>(Mem), Pattern, PatternSize,
1584  AccessOffset[0] * ElementSize, RangeMultiplier * ElementSize,
1585  Deps.size(), Deps.data(), OutSyncPoint);
1586  return;
1587  }
 1588  // For an unusable Range or Offset, the sycl::handler uses a parallel_for
 1589  // kernel rather than CG:Fill, so this point should be unreachable.
1590  throw exception(make_error_code(errc::runtime),
1591  "Not supported configuration of fill requested");
1592 }
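
The RangesUsable / OffsetUsable checks above encode a simple contiguity rule: a linear MemBufferFill is only correct when the accessed elements form one contiguous byte run, i.e. for 1D accessors, or for 2D/3D accessors that cover the whole buffer starting at offset zero. A standalone sketch of the predicate, using plain std::array in place of sycl::range and sycl::id:

    #include <array>
    #include <cstddef>

    // True when a fill over (Size, AccessRange, AccessOffset) touches one
    // contiguous run of bytes and can be expressed as a linear buffer fill.
    bool fillIsContiguous(unsigned Dim,
                          const std::array<std::size_t, 3> &Size,
                          const std::array<std::size_t, 3> &AccessRange,
                          const std::array<std::size_t, 3> &AccessOffset) {
      const bool RangesUsable = (Dim <= 1) || (Size == AccessRange);
      const bool OffsetUsable =
          (Dim <= 1) || (AccessOffset == std::array<std::size_t, 3>{0, 0, 0});
      return RangesUsable && OffsetUsable;
    }
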
1593 
1594 void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer(
 1595  sycl::detail::ContextImplPtr Context,
 1596  ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length,
1597  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1598  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1599  const PluginPtr &Plugin = Context->getPlugin();
1600  Plugin->call(urCommandBufferAppendUSMPrefetchExp, CommandBuffer, Mem, Length,
1601  ur_usm_migration_flags_t(0), Deps.size(), Deps.data(),
1602  OutSyncPoint);
1603 }
1604 
1605 void MemoryManager::ext_oneapi_advise_usm_cmd_buffer(
 1606  sycl::detail::ContextImplPtr Context,
 1607  ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem,
1608  size_t Length, ur_usm_advice_flags_t Advice,
1609  std::vector<ur_exp_command_buffer_sync_point_t> Deps,
1610  ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
1611  const PluginPtr &Plugin = Context->getPlugin();
1612  Plugin->call(urCommandBufferAppendUSMAdviseExp, CommandBuffer, Mem, Length,
1613  Advice, Deps.size(), Deps.data(), OutSyncPoint);
1614 }
1615 
1616 void MemoryManager::copy_image_bindless(
1617  QueueImplPtr Queue, const void *Src, void *Dst,
1618  const ur_image_desc_t &SrcDesc, const ur_image_desc_t &DstDesc,
1619  const ur_image_format_t &SrcFormat, const ur_image_format_t &DstFormat,
1620  const ur_exp_image_copy_flags_t Flags, ur_rect_offset_t SrcOffset,
1621  ur_rect_offset_t DstOffset, ur_rect_region_t CopyExtent,
1622  const std::vector<ur_event_handle_t> &DepEvents,
1623  ur_event_handle_t *OutEvent) {
1624  assert(Queue &&
1625  "Copy image bindless must be called with a valid device queue");
1626  assert((Flags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE ||
1627  Flags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST ||
1628  Flags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE) &&
1629  "Invalid flags passed to copy_image_bindless.");
1630  if (!Dst || !Src)
1631  throw sycl::exception(
1632  sycl::make_error_code(errc::invalid),
1633  "NULL pointer argument in bindless image copy operation.");
1634 
1635  const detail::PluginPtr &Plugin = Queue->getPlugin();
1636 
1637  ur_exp_image_copy_region_t CopyRegion{};
1638  CopyRegion.stype = UR_STRUCTURE_TYPE_EXP_IMAGE_COPY_REGION;
1639  CopyRegion.copyExtent = CopyExtent;
1640  CopyRegion.srcOffset = SrcOffset;
1641  CopyRegion.dstOffset = DstOffset;
1642 
1643  Plugin->call(urBindlessImagesImageCopyExp, Queue->getHandleRef(), Src, Dst,
1644  &SrcDesc, &DstDesc, &SrcFormat, &DstFormat, &CopyRegion, Flags,
1645  DepEvents.size(), DepEvents.data(), OutEvent);
1646 }
1647 
1648 } // namespace detail
1649 } // namespace _V1
1650 } // namespace sycl