DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_level_zero.cpp
Go to the documentation of this file.
1 //===-------- pi_level_zero.cpp - Level Zero Plugin --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===------------------------------------------------------------------===//
8 
13 
14 #include "pi_level_zero.hpp"
15 #include <algorithm>
16 #include <cstdarg>
17 #include <cstdio>
18 #include <cstring>
19 #include <memory>
20 #include <set>
21 #include <sstream>
22 #include <string>
23 #include <sycl/detail/pi.h>
24 #include <sycl/detail/spinlock.hpp>
25 #include <thread>
26 #include <utility>
27 
28 #include <zet_api.h>
29 
31 #include "ur_bindings.hpp"
32 
33 extern "C" {
34 // Forward declarartions.
37 static pi_result EventCreate(pi_context Context, pi_queue Queue,
38  bool HostVisible, pi_event *RetEvent);
39 }
40 
41 // Defined in tracing.cpp
42 void enableZeTracing();
43 void disableZeTracing();
44 
45 namespace {
46 
47 // This is an experimental option to test performance of device to device copy
48 // operations on copy engines (versus compute engine)
49 static const bool UseCopyEngineForD2DCopy = [] {
50  const char *CopyEngineForD2DCopy =
51  std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY");
52  return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0));
53 }();
54 
55 // This is an experimental option that allows the use of copy engine, if
56 // available in the device, in Level Zero plugin for copy operations submitted
57 // to an in-order queue. The default is 1.
58 static const bool UseCopyEngineForInOrderQueue = [] {
59  const char *CopyEngineForInOrderQueue =
60  std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
61  return (!CopyEngineForInOrderQueue ||
62  (std::stoi(CopyEngineForInOrderQueue) != 0));
63 }();
64 
65 // This is an experimental option that allows the use of multiple command lists
66 // when submitting barriers. The default is 0.
67 static const bool UseMultipleCmdlistBarriers = [] {
68  const char *UseMultipleCmdlistBarriersFlag =
69  std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS");
70  if (!UseMultipleCmdlistBarriersFlag)
71  return true;
72  return std::stoi(UseMultipleCmdlistBarriersFlag) > 0;
73 }();
74 
75 // This is an experimental option that allows to disable caching of events in
76 // the context.
77 static const bool DisableEventsCaching = [] {
78  const char *DisableEventsCachingFlag =
79  std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
80  if (!DisableEventsCachingFlag)
81  return false;
82  return std::stoi(DisableEventsCachingFlag) != 0;
83 }();
84 
85 // This is an experimental option that allows reset and reuse of uncompleted
86 // events in the in-order queue with discard_events property.
87 static const bool ReuseDiscardedEvents = [] {
88  const char *ReuseDiscardedEventsFlag =
89  std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS");
90  if (!ReuseDiscardedEventsFlag)
91  return true;
92  return std::stoi(ReuseDiscardedEventsFlag) > 0;
93 }();
94 
95 // Controls support of the indirect access kernels and deferred memory release.
96 static const bool IndirectAccessTrackingEnabled = [] {
97  return std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY") !=
98  nullptr;
99 }();
100 
101 // Due to a bug with 2D memory copy to and from non-USM pointers, this option is
102 // disabled by default.
103 static const bool UseMemcpy2DOperations = [] {
104  const char *UseMemcpy2DOperationsFlag =
105  std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D");
106  if (!UseMemcpy2DOperationsFlag)
107  return false;
108  return std::stoi(UseMemcpy2DOperationsFlag) > 0;
109 }();
110 
111 static usm_settings::USMAllocatorConfig USMAllocatorConfigInstance;
112 
113 // Map from L0 to PI result.
114 static inline pi_result mapError(ze_result_t Result) {
115  return ur2piResult(ze2urResult(Result));
116 }
117 
118 // Trace a call to Level-Zero RT
119 #define ZE_CALL(ZeName, ZeArgs) \
120  { \
121  ze_result_t ZeResult = ZeName ZeArgs; \
122  if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \
123  return mapError(Result); \
124  }
125 
126 // Trace an internal PI call; returns in case of an error.
127 #define PI_CALL(Call) \
128  { \
129  if (PrintTrace) \
130  fprintf(stderr, "PI ---> %s\n", #Call); \
131  pi_result Result = (Call); \
132  if (Result != PI_SUCCESS) \
133  return Result; \
134  }
135 
136 // Controls if we should choose doing eager initialization
137 // to make it happen on warmup paths and have the reportable
138 // paths be less likely affected.
139 //
140 static bool doEagerInit = [] {
141  const char *EagerInit = std::getenv("SYCL_EAGER_INIT");
142  return EagerInit ? std::atoi(EagerInit) != 0 : false;
143 }();
144 
145 // Maximum number of events that can be present in an event ZePool is captured
146 // here. Setting it to 256 gave best possible performance for several
147 // benchmarks.
148 static const pi_uint32 MaxNumEventsPerPool = [] {
149  const auto MaxNumEventsPerPoolEnv =
150  std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
151  pi_uint32 Result =
152  MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256;
153  if (Result <= 0)
154  Result = 256;
155  return Result;
156 }();
157 
158 // Helper function to implement zeHostSynchronize.
159 // The behavior is to avoid infinite wait during host sync under ZE_DEBUG.
160 // This allows for a much more responsive debugging of hangs.
161 //
162 template <typename T, typename Func>
163 ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) {
164  if (!ZeDebug) {
165  return Api(Handle, UINT64_MAX);
166  }
167 
168  ze_result_t R;
169  while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY)
170  ;
171  return R;
172 }
173 
174 // Template function to do various types of host synchronizations.
175 // This is intended to be used instead of direct calls to specific
176 // Level-Zero synchronization APIs.
177 //
178 template <typename T> ze_result_t zeHostSynchronize(T Handle);
179 template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) {
180  return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle);
181 }
182 template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) {
183  return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle);
184 }
185 
186 } // anonymous namespace
187 
188 // SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
189 // which case all compute commands will be submitted to the command-queue
190 // with the given index in the compute command group. If it is instead set
191 // to negative then all available compute engines may be used.
192 //
193 // The default value is "0".
194 //
195 static const std::pair<int, int> getRangeOfAllowedComputeEngines() {
196  static const char *EnvVar =
197  std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
198  // If the environment variable is not set only use "0" CCS for now.
199  // TODO: allow all CCSs when HW support is complete.
200  if (!EnvVar)
201  return std::pair<int, int>(0, 0);
202 
203  auto EnvVarValue = std::atoi(EnvVar);
204  if (EnvVarValue >= 0) {
205  return std::pair<int, int>(EnvVarValue, EnvVarValue);
206  }
207 
208  return std::pair<int, int>(0, INT_MAX);
209 }
210 
211 pi_platform _pi_context::getPlatform() const { return Devices[0]->Platform; }
212 
214  while (Device) {
215  if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end())
216  return true;
217  Device = Device->RootDevice;
218  }
219  return false;
220 }
221 
222 pi_result
223 _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
224  size_t &Index, bool HostVisible,
225  bool ProfilingEnabled) {
226  // Lock while updating event pool machinery.
227  std::scoped_lock<pi_mutex> Lock(ZeEventPoolCacheMutex);
228 
229  std::list<ze_event_pool_handle_t> *ZePoolCache =
230  getZeEventPoolCache(HostVisible, ProfilingEnabled);
231 
232  if (!ZePoolCache->empty()) {
233  if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
234  if (DisableEventsCaching) {
235  // Remove full pool from the cache if events caching is disabled.
236  ZePoolCache->erase(ZePoolCache->begin());
237  } else {
238  // If event caching is enabled then we don't destroy events so there is
239  // no need to remove pool from the cache and add it back when it has
240  // available slots. Just keep it in the tail of the cache so that all
241  // pools can be destroyed during context destruction.
242  ZePoolCache->push_front(nullptr);
243  }
244  }
245  }
246  if (ZePoolCache->empty()) {
247  ZePoolCache->push_back(nullptr);
248  }
249 
250  // We shall be adding an event to the front pool.
251  ze_event_pool_handle_t *ZePool = &ZePoolCache->front();
252  Index = 0;
253  // Create one event ZePool per MaxNumEventsPerPool events
254  if (*ZePool == nullptr) {
255  ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
256  ZeEventPoolDesc.count = MaxNumEventsPerPool;
257  ZeEventPoolDesc.flags = 0;
258  if (HostVisible)
259  ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
260  if (ProfilingEnabled)
261  ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
262  zePrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);
263 
264  std::vector<ze_device_handle_t> ZeDevices;
265  std::for_each(Devices.begin(), Devices.end(), [&](const pi_device &D) {
266  ZeDevices.push_back(D->ZeDevice);
267  });
268 
269  ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(),
270  &ZeDevices[0], ZePool));
271  NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1;
272  NumEventsUnreleasedInEventPool[*ZePool] = 1;
273  } else {
274  Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool];
275  --NumEventsAvailableInEventPool[*ZePool];
276  ++NumEventsUnreleasedInEventPool[*ZePool];
277  }
278  Pool = *ZePool;
279  return PI_SUCCESS;
280 }
281 
283  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex, std::defer_lock);
284  std::scoped_lock<pi_mutex, std::shared_lock<pi_shared_mutex>> LockAll(
285  ZeEventPoolCacheMutex, EventLock);
286  if (!Event->ZeEventPool) {
287  // This must be an interop event created on a users's pool.
288  // Do nothing.
289  return PI_SUCCESS;
290  }
291 
292  std::list<ze_event_pool_handle_t> *ZePoolCache =
293  getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled());
294 
295  // Put the empty pool to the cache of the pools.
296  if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
297  die("Invalid event release: event pool doesn't have unreleased events");
298  if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) {
299  if (ZePoolCache->front() != Event->ZeEventPool) {
300  ZePoolCache->push_back(Event->ZeEventPool);
301  }
302  NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
303  }
304 
305  return PI_SUCCESS;
306 }
307 
308 // Forward declarations
310  pi_queue Queue, void *Dst,
311  pi_bool BlockingWrite, size_t Size,
312  const void *Src,
313  pi_uint32 NumEventsInWaitList,
314  const pi_event *EventWaitList,
315  pi_event *Event, bool PreferCopyEngine);
316 
318  pi_command_type CommandType, pi_queue Queue, const void *SrcBuffer,
319  void *DstBuffer, pi_buff_rect_offset SrcOrigin,
320  pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region,
321  size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch,
322  size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList,
323  const pi_event *EventWaitList, pi_event *Event,
324  bool PreferCopyEngine = false);
325 
327  return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents();
328 }
329 
332  ZE_CALL(zeCommandListAppendBarrier,
333  (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent)));
334  ZE_CALL(zeCommandListAppendEventReset,
335  (CommandList->first, LastCommandEvent->ZeEvent));
336 
337  // Create new pi_event but with the same ze_event_handle_t. We are going
338  // to use this pi_event for the next command with discarded event.
340  try {
343  PI_COMMAND_TYPE_USER, true);
344  } catch (const std::bad_alloc &) {
345  return PI_ERROR_OUT_OF_HOST_MEMORY;
346  } catch (...) {
347  return PI_ERROR_UNKNOWN;
348  }
349 
352 
354  }
355 
356  return PI_SUCCESS;
357 }
358 
359 // This helper function creates a pi_event and associate a pi_queue.
360 // Note that the caller of this function must have acquired lock on the Queue
361 // that is passed in.
362 // \param Queue pi_queue to associate with a new event.
363 // \param Event a pointer to hold the newly created pi_event
364 // \param CommandType various command type determined by the caller
365 // \param CommandList is the command list where the event is added
366 // \param IsInternal tells if the event is internal, i.e. visible in the L0
367 // plugin only.
368 // \param ForceHostVisible tells if the event must be created in
369 // the host-visible pool
371  pi_queue Queue, pi_event *Event, pi_command_type CommandType,
372  pi_command_list_ptr_t CommandList, bool IsInternal = false,
373  bool ForceHostVisible = false) {
374 
375  if (!ForceHostVisible)
376  ForceHostVisible = Queue->Device->ZeEventsScope == AllHostVisible;
377 
378  // If event is discarded then try to get event from the queue cache.
379  *Event =
380  IsInternal ? Queue->getEventFromQueueCache(ForceHostVisible) : nullptr;
381 
382  if (*Event == nullptr)
383  PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event));
384 
385  (*Event)->Queue = Queue;
386  (*Event)->CommandType = CommandType;
387  (*Event)->IsDiscarded = IsInternal;
388  (*Event)->CommandList = CommandList;
389  // Discarded event doesn't own ze_event, it is used by multiple pi_event
390  // objects. We destroy corresponding ze_event by releasing events from the
391  // events cache at queue destruction. Event in the cache owns the Level Zero
392  // event.
393  if (IsInternal)
394  (*Event)->OwnZeEvent = false;
395 
396  // Append this Event to the CommandList, if any
397  if (CommandList != Queue->CommandListMap.end()) {
398  CommandList->second.append(*Event);
399  (*Event)->RefCount.increment();
400  }
401 
402  // We need to increment the reference counter here to avoid pi_queue
403  // being released before the associated pi_event is released because
404  // piEventRelease requires access to the associated pi_queue.
405  // In piEventRelease, the reference counter of the Queue is decremented
406  // to release it.
407  Queue->RefCount.increment();
408 
409  // SYCL RT does not track completion of the events, so it could
410  // release a PI event as soon as that's not being waited in the app.
411  // But we have to ensure that the event is not destroyed before
412  // it is really signalled, so retain it explicitly here and
413  // release in CleanupCompletedEvent(Event).
414  // If the event is internal then don't increment the reference count as this
415  // event will not be waited/released by SYCL RT, so it must be destroyed by
416  // EventRelease in resetCommandList.
417  if (!IsInternal)
418  PI_CALL(piEventRetain(*Event));
419 
420  return PI_SUCCESS;
421 }
422 
424  pi_command_list_ptr_t CommandList) {
425  // We signal new event at the end of command list only if we have queue with
426  // discard_events property and the last command event is discarded.
429  return PI_SUCCESS;
430 
431  pi_event Event;
433  this, &Event, PI_COMMAND_TYPE_USER, CommandList,
434  /* IsDiscarded */ false, /* ForceHostVisible */ false))
436  LastCommandEvent = Event;
437 
438  ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, Event->ZeEvent));
439  return PI_SUCCESS;
440 }
441 
443  auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1];
444 
445  // If we don't have any events, return nullptr.
446  // If we have only a single event then it was used by the last command and we
447  // can't use it now because we have to enforce round robin between two events.
448  if (Cache->size() < 2)
449  return nullptr;
450 
451  // If there are two events then return an event from the beginning of the list
452  // since event of the last command is added to the end of the list.
453  auto It = Cache->begin();
454  pi_event RetEvent = *It;
455  Cache->erase(It);
456  return RetEvent;
457 }
458 
460  auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1];
461  Cache->emplace_back(Event);
462  return PI_SUCCESS;
463 }
464 
465 // Get value of the threshold for number of events in immediate command lists.
466 // If number of events in the immediate command list exceeds this threshold then
467 // cleanup process for those events is executed.
468 static const size_t ImmCmdListsEventCleanupThreshold = [] {
469  const char *ImmCmdListsEventCleanupThresholdStr = std::getenv(
470  "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD");
471  static constexpr int Default = 20;
472  if (!ImmCmdListsEventCleanupThresholdStr)
473  return Default;
474 
475  int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr);
476 
477  // Basically disable threshold if negative value is provided.
478  if (Threshold < 0)
479  return INT_MAX;
480 
481  return Threshold;
482 }();
483 
484 pi_device _pi_context::getRootDevice() const {
485  assert(Devices.size() > 0);
486 
487  if (Devices.size() == 1)
488  return Devices[0];
489 
490  // Check if we have context with subdevices of the same device (context
491  // may include root device itself as well)
492  pi_device ContextRootDevice =
493  Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];
494 
495  // For context with sub subdevices, the ContextRootDevice might still
496  // not be the root device.
497  // Check whether the ContextRootDevice is the subdevice or root device.
498  if (ContextRootDevice->isSubDevice()) {
499  ContextRootDevice = ContextRootDevice->RootDevice;
500  }
501 
502  for (auto &Device : Devices) {
503  if ((!Device->RootDevice && Device != ContextRootDevice) ||
504  (Device->RootDevice && Device->RootDevice != ContextRootDevice)) {
505  ContextRootDevice = nullptr;
506  break;
507  }
508  }
509  return ContextRootDevice;
510 }
511 
513 
514  // Helper lambda to create various USM allocators for a device.
515  // Note that the CCS devices and their respective subdevices share a
516  // common ze_device_handle and therefore, also share USM allocators.
517  auto createUSMAllocators = [this](pi_device Device) {
518  SharedMemAllocContexts.emplace(
519  std::piecewise_construct, std::make_tuple(Device->ZeDevice),
521  std::unique_ptr<SystemMemory>(
522  new USMSharedMemoryAlloc(this, Device)),
523  USMAllocatorConfigInstance.Configs[usm_settings::MemType::Shared]));
524 
526  std::piecewise_construct, std::make_tuple(Device->ZeDevice),
527  std::make_tuple(std::unique_ptr<SystemMemory>(
529  USMAllocatorConfigInstance
531 
532  DeviceMemAllocContexts.emplace(
533  std::piecewise_construct, std::make_tuple(Device->ZeDevice),
535  std::unique_ptr<SystemMemory>(
536  new USMDeviceMemoryAlloc(this, Device)),
537  USMAllocatorConfigInstance.Configs[usm_settings::MemType::Device]));
538  };
539 
540  // Recursive helper to call createUSMAllocators for all sub-devices
541  std::function<void(pi_device)> createUSMAllocatorsRecursive;
542  createUSMAllocatorsRecursive =
543  [createUSMAllocators,
544  &createUSMAllocatorsRecursive](pi_device Device) -> void {
545  createUSMAllocators(Device);
546  for (auto &SubDevice : Device->SubDevices)
547  createUSMAllocatorsRecursive(SubDevice);
548  };
549 
550  // Create USM allocator context for each pair (device, context).
551  //
552  for (auto &Device : Devices) {
553  createUSMAllocatorsRecursive(Device);
554  }
555  // Create USM allocator context for host. Device and Shared USM allocations
556  // are device-specific. Host allocations are not device-dependent therefore
557  // we don't need a map with device as key.
558  HostMemAllocContext = std::make_unique<USMAllocContext>(
559  std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)),
560  USMAllocatorConfigInstance.Configs[usm_settings::MemType::Host]);
561 
562  // We may allocate memory to this root device so create allocators.
563  if (SingleRootDevice &&
564  DeviceMemAllocContexts.find(SingleRootDevice->ZeDevice) ==
565  DeviceMemAllocContexts.end()) {
566  createUSMAllocators(SingleRootDevice);
567  }
568 
569  // Create the immediate command list to be used for initializations.
570  // Created as synchronous so level-zero performs implicit synchronization and
571  // there is no need to query for completion in the plugin
572  //
573  // TODO: we use Device[0] here as the single immediate command-list
574  // for buffer creation and migration. Initialization is in
575  // in sync and is always performed to Devices[0] as well but
576  // D2D migartion, if no P2P, is broken since it should use
577  // immediate command-list for the specfic devices, and this single one.
578  //
580 
581  // Prefer to use copy engine for initialization copies,
582  // if available and allowed (main copy engine with index 0).
583  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
584  const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device);
585  ZeCommandQueueDesc.ordinal =
586  Device->QueueGroup[_pi_device::queue_group_info_t::Compute].ZeOrdinal;
587  if (Range.first >= 0 &&
588  Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal !=
589  -1)
590  ZeCommandQueueDesc.ordinal =
591  Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal;
592 
593  ZeCommandQueueDesc.index = 0;
594  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
595  ZE_CALL(
596  zeCommandListCreateImmediate,
597  (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit));
598  return PI_SUCCESS;
599 }
600 
602  // This function is called when pi_context is deallocated, piContextRelease.
603  // There could be some memory that may have not been deallocated.
604  // For example, event and event pool caches would be still alive.
605 
606  if (!DisableEventsCaching) {
607  std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
608  for (auto &EventCache : EventCaches) {
609  for (auto &Event : EventCache) {
610  ZE_CALL(zeEventDestroy, (Event->ZeEvent));
611  delete Event;
612  }
613  EventCache.clear();
614  }
615  }
616  {
617  std::scoped_lock<pi_mutex> Lock(ZeEventPoolCacheMutex);
618  for (auto &ZePoolCache : ZeEventPoolCache) {
619  for (auto &ZePool : ZePoolCache)
620  ZE_CALL(zeEventPoolDestroy, (ZePool));
621  ZePoolCache.clear();
622  }
623  }
624 
625  // Destroy the command list used for initializations
626  ZE_CALL(zeCommandListDestroy, (ZeCommandListInit));
627 
628  std::scoped_lock<pi_mutex> Lock(ZeCommandListCacheMutex);
629  for (auto &List : ZeComputeCommandListCache) {
630  for (ze_command_list_handle_t &ZeCommandList : List.second) {
631  if (ZeCommandList)
632  ZE_CALL(zeCommandListDestroy, (ZeCommandList));
633  }
634  }
635  for (auto &List : ZeCopyCommandListCache) {
636  for (ze_command_list_handle_t &ZeCommandList : List.second) {
637  if (ZeCommandList)
638  ZE_CALL(zeCommandListDestroy, (ZeCommandList));
639  }
640  }
641  return PI_SUCCESS;
642 }
643 
645  return ZeQueueGroupOrdinal !=
646  (uint32_t)Queue->Device
647  ->QueueGroup[_pi_device::queue_group_info_t::type::Compute]
648  .ZeOrdinal;
649 }
650 
652  // If out-of-order queue property is not set, then this is a in-order queue.
654  0);
655 }
656 
658  return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0);
659 }
660 
662  return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0);
663 }
664 
666  return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0);
667 }
668 
670  bool MakeAvailable,
671  std::vector<pi_event> &EventListToCleanup,
672  bool CheckStatus) {
673  bool UseCopyEngine = CommandList->second.isCopy(this);
674 
675  // Immediate commandlists do not have an associated fence.
676  if (CommandList->second.ZeFence != nullptr) {
677  // Fence had been signalled meaning the associated command-list completed.
678  // Reset the fence and put the command list into a cache for reuse in PI
679  // calls.
680  ZE_CALL(zeFenceReset, (CommandList->second.ZeFence));
681  ZE_CALL(zeCommandListReset, (CommandList->first));
682  CommandList->second.ZeFenceInUse = false;
683  CommandList->second.IsClosed = false;
684  }
685 
686  auto &EventList = CommandList->second.EventList;
687  // Check if standard commandlist or fully synced in-order queue.
688  // If one of those conditions is met then we are sure that all events are
689  // completed so we don't need to check event status.
690  if (!CheckStatus || CommandList->second.ZeFence != nullptr ||
692  // Remember all the events in this command list which needs to be
693  // released/cleaned up and clear event list associated with command list.
694  std::move(std::begin(EventList), std::end(EventList),
695  std::back_inserter(EventListToCleanup));
696  EventList.clear();
697  } else if (!isDiscardEvents()) {
698  // For immediate commandlist reset only those events that have signalled.
699  // If events in the queue are discarded then we can't check their status.
700  for (auto it = EventList.begin(); it != EventList.end();) {
701  std::scoped_lock<pi_shared_mutex> EventLock((*it)->Mutex);
702  ze_result_t ZeResult =
703  (*it)->Completed
704  ? ZE_RESULT_SUCCESS
705  : ZE_CALL_NOCHECK(zeEventQueryStatus, ((*it)->ZeEvent));
706  // Break early as soon as we found first incomplete event because next
707  // events are submitted even later. We are not trying to find all
708  // completed events here because it may be costly. I.e. we are checking
709  // only elements which are most likely completed because they were
710  // submitted earlier. It is guaranteed that all events will be eventually
711  // cleaned up at queue sync/release.
712  if (ZeResult == ZE_RESULT_NOT_READY)
713  break;
714 
715  if (ZeResult != ZE_RESULT_SUCCESS)
716  return mapError(ZeResult);
717 
718  EventListToCleanup.push_back(std::move((*it)));
719  it = EventList.erase(it);
720  }
721  }
722 
723  // Standard commandlists move in and out of the cache as they are recycled.
724  // Immediate commandlists are always available.
725  if (CommandList->second.ZeFence != nullptr && MakeAvailable) {
726  std::scoped_lock<pi_mutex> Lock(this->Context->ZeCommandListCacheMutex);
727  auto &ZeCommandListCache =
728  UseCopyEngine
729  ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
730  : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];
731  ZeCommandListCache.push_back(CommandList->first);
732  }
733 
734  return PI_SUCCESS;
735 }
736 
737 // Configuration of the command-list batching.
739  // Default value of 0. This specifies to use dynamic batch size adjustment.
740  // Other values will try to collect specified amount of commands.
742 
743  // If doing dynamic batching, specifies start batch size.
745 
746  // The maximum size for dynamic batch.
748 
749  // The step size for dynamic batch increases.
751 
752  // Thresholds for when increase batch size (number of closed early is small
753  // and number of closed full is high).
756 
757  // Tells the starting size of a batch.
758  pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
759  // Tells is we are doing dynamic batch size adjustment.
760  bool dynamic() const { return Size == 0; }
761 };
762 
763 // Helper function to initialize static variables that holds batch config info
764 // for compute and copy command batching.
766  zeCommandListBatchConfig Config{}; // default initialize
767 
768  // Default value of 0. This specifies to use dynamic batch size adjustment.
769  const auto BatchSizeStr =
770  (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
771  : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
772  if (BatchSizeStr) {
773  pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
774  // Level Zero may only support a limted number of commands per command
775  // list. The actual upper limit is not specified by the Level Zero
776  // Specification. For now we allow an arbitrary upper limit.
777  if (BatchSizeStrVal > 0) {
778  Config.Size = BatchSizeStrVal;
779  } else if (BatchSizeStrVal == 0) {
780  Config.Size = 0;
781  // We are requested to do dynamic batching. Collect specifics, if any.
782  // The extended format supported is ":" separated values.
783  //
784  // NOTE: these extra settings are experimental and are intended to
785  // be used only for finding a better default heuristic.
786  //
787  std::string BatchConfig(BatchSizeStr);
788  size_t Ord = 0;
789  size_t Pos = 0;
790  while (true) {
791  if (++Ord > 5)
792  break;
793 
794  Pos = BatchConfig.find(":", Pos);
795  if (Pos == std::string::npos)
796  break;
797  ++Pos; // past the ":"
798 
799  pi_uint32 Val;
800  try {
801  Val = std::stoi(BatchConfig.substr(Pos));
802  } catch (...) {
803  if (IsCopy)
804  zePrint(
805  "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
806  else
807  zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
808  break;
809  }
810  switch (Ord) {
811  case 1:
812  Config.DynamicSizeStart = Val;
813  break;
814  case 2:
815  Config.DynamicSizeMax = Val;
816  break;
817  case 3:
818  Config.DynamicSizeStep = Val;
819  break;
820  case 4:
821  Config.NumTimesClosedEarlyThreshold = Val;
822  break;
823  case 5:
824  Config.NumTimesClosedFullThreshold = Val;
825  break;
826  default:
827  die("Unexpected batch config");
828  }
829  if (IsCopy)
830  zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
831  "#%d: %d\n",
832  (int)Ord, (int)Val);
833  else
834  zePrint(
835  "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
836  (int)Ord, (int)Val);
837  };
838 
839  } else {
840  // Negative batch sizes are silently ignored.
841  if (IsCopy)
842  zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
843  else
844  zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
845  }
846  }
847  return Config;
848 }
849 
850 // Static variable that holds batch config info for compute command batching.
852  using IsCopy = bool;
853  return ZeCommandListBatchConfig(IsCopy{false});
854 }();
855 
856 // Static variable that holds batch config info for copy command batching.
858  using IsCopy = bool;
859  return ZeCommandListBatchConfig(IsCopy{true});
860 }();
861 
862 _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
863  std::vector<ze_command_queue_handle_t> &CopyQueues,
864  pi_context Context, pi_device Device,
865  bool OwnZeCommandQueue,
867  int ForceComputeIndex)
868  : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
869  Properties(PiQueueProperties) {
870 
871  // Compute group initialization.
872  // First, see if the queue's device allows for round-robin or it is
873  // fixed to one particular compute CCS (it is so for sub-sub-devices).
874  auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute];
875  pi_queue_group_t ComputeQueueGroup{this, queue_type::Compute};
876  ComputeQueueGroup.ZeQueues = ComputeQueues;
877  // Create space to hold immediate commandlists corresponding to the
878  // ZeQueues
879  if (Device->ImmCommandListUsed) {
880  ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
881  ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
882  }
883  if (ComputeQueueGroupInfo.ZeIndex >= 0) {
884  // Sub-sub-device
885 
886  // sycl::ext::intel::property::queue::compute_index works with any
887  // backend/device by allowing single zero index if multiple compute CCSes
888  // are not supported. Sub-sub-device falls into the same bucket.
889  assert(ForceComputeIndex <= 0);
890  ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex;
891  ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex;
892  ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex;
893  } else if (ForceComputeIndex >= 0) {
894  ComputeQueueGroup.LowerIndex = ForceComputeIndex;
895  ComputeQueueGroup.UpperIndex = ForceComputeIndex;
896  ComputeQueueGroup.NextIndex = ForceComputeIndex;
897  } else {
898  // Set-up to round-robin across allowed range of engines.
899  uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines().first;
900  uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines().second;
901  FilterUpperIndex = std::min((size_t)FilterUpperIndex,
902  FilterLowerIndex + ComputeQueues.size() - 1);
903  if (FilterLowerIndex <= FilterUpperIndex) {
904  ComputeQueueGroup.LowerIndex = FilterLowerIndex;
905  ComputeQueueGroup.UpperIndex = FilterUpperIndex;
906  ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
907  } else {
908  die("No compute queue available/allowed.");
909  }
910  }
911  if (Device->ImmCommandListUsed) {
912  // Create space to hold immediate commandlists corresponding to the
913  // ZeQueues
914  ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
915  ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
916  }
917 
918  // Thread id will be used to create separate queue groups per thread.
919  auto TID = std::this_thread::get_id();
920  ComputeQueueGroupsByTID.insert({TID, ComputeQueueGroup});
921 
922  // Copy group initialization.
923  pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy};
924  const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device);
925  if (Range.first < 0 || Range.second < 0) {
926  // We are asked not to use copy engines, just do nothing.
927  // Leave CopyQueueGroup.ZeQueues empty, and it won't be used.
928  } else {
929  uint32_t FilterLowerIndex = Range.first;
930  uint32_t FilterUpperIndex = Range.second;
931  FilterUpperIndex = std::min((size_t)FilterUpperIndex,
932  FilterLowerIndex + CopyQueues.size() - 1);
933  if (FilterLowerIndex <= FilterUpperIndex) {
934  CopyQueueGroup.ZeQueues = CopyQueues;
935  CopyQueueGroup.LowerIndex = FilterLowerIndex;
936  CopyQueueGroup.UpperIndex = FilterUpperIndex;
937  CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex;
938  // Create space to hold immediate commandlists corresponding to the
939  // ZeQueues
940  if (Device->ImmCommandListUsed) {
941  CopyQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
942  CopyQueueGroup.ZeQueues.size(), CommandListMap.end());
943  }
944  }
945  }
946  CopyQueueGroupsByTID.insert({TID, CopyQueueGroup});
947 
948  // Initialize compute/copy command batches.
949  ComputeCommandBatch.OpenCommandList = CommandListMap.end();
950  CopyCommandBatch.OpenCommandList = CommandListMap.end();
951  ComputeCommandBatch.QueueBatchSize =
953  CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize();
954 }
955 
957  bool QueueLocked = false);
958 
959 // Helper function to perform the necessary cleanup of the events from reset cmd
960 // list.
961 static pi_result
962 CleanupEventListFromResetCmdList(std::vector<pi_event> &EventListToCleanup,
963  bool QueueLocked = false) {
964  for (auto &Event : EventListToCleanup) {
965  // We don't need to synchronize the events since the fence associated with
966  // the command list was synchronized.
967  {
968  std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
969  Event->Completed = true;
970  }
971  PI_CALL(CleanupCompletedEvent(Event, QueueLocked));
972  // This event was removed from the command list, so decrement ref count
973  // (it was incremented when they were added to the command list).
975  }
976  return PI_SUCCESS;
977 }
978 
990  bool QueueLocked = false,
991  bool QueueSynced = false,
992  pi_event CompletedEvent = nullptr) {
993  // Handle only immediate command lists here.
994  if (!Queue || !Queue->Device->ImmCommandListUsed)
995  return PI_SUCCESS;
996 
997  std::vector<pi_event> EventListToCleanup;
998  {
999  std::unique_lock<pi_shared_mutex> QueueLock(Queue->Mutex, std::defer_lock);
1000  if (!QueueLocked)
1001  QueueLock.lock();
1002  // If queue is locked and fully synchronized then cleanup all events.
1003  // If queue is not locked then by this time there may be new submitted
1004  // commands so we can't do full cleanup.
1005  if (QueueLocked &&
1006  (QueueSynced || (Queue->isInOrderQueue() &&
1007  (CompletedEvent == Queue->LastCommandEvent ||
1008  !Queue->LastCommandEvent)))) {
1009  Queue->LastCommandEvent = nullptr;
1010  for (auto &&It = Queue->CommandListMap.begin();
1011  It != Queue->CommandListMap.end(); ++It) {
1012  PI_CALL(Queue->resetCommandList(It, true, EventListToCleanup,
1013  /* CheckStatus */ false));
1014  }
1015  } else if (Queue->isInOrderQueue() && CompletedEvent) {
1016  // If the queue is in-order and we have information about completed event
1017  // then cleanup all events in the command list preceding to CompletedEvent
1018  // including itself.
1019 
1020  // Check that the comleted event has associated command list.
1021  if (!(CompletedEvent->CommandList &&
1022  CompletedEvent->CommandList.value() != Queue->CommandListMap.end()))
1023  return PI_SUCCESS;
1024 
1025  auto &CmdListEvents =
1026  CompletedEvent->CommandList.value()->second.EventList;
1027  auto CompletedEventIt =
1028  std::find(CmdListEvents.begin(), CmdListEvents.end(), CompletedEvent);
1029  if (CompletedEventIt != CmdListEvents.end()) {
1030  // We can cleanup all events prior to the completed event in this
1031  // command list and completed event itself.
1032  // TODO: we can potentially cleanup more events here by finding
1033  // completed events on another command lists, but it is currently not
1034  // implemented.
1035  std::move(std::begin(CmdListEvents), CompletedEventIt + 1,
1036  std::back_inserter(EventListToCleanup));
1037  CmdListEvents.erase(CmdListEvents.begin(), CompletedEventIt + 1);
1038  }
1039  } else {
1040  // Fallback to resetCommandList over all command lists.
1041  for (auto &&It = Queue->CommandListMap.begin();
1042  It != Queue->CommandListMap.end(); ++It) {
1043  PI_CALL(Queue->resetCommandList(It, true, EventListToCleanup,
1044  /* CheckStatus */ true));
1045  }
1046  }
1047  }
1048  PI_CALL(CleanupEventListFromResetCmdList(EventListToCleanup, QueueLocked));
1049  return PI_SUCCESS;
1050 }
1051 
1059  // Handle immediate command lists here, they don't need to be reset and we
1060  // only need to cleanup events.
1061  if (Queue->Device->ImmCommandListUsed) {
1063  return PI_SUCCESS;
1064  }
1065 
1066  // We need events to be cleaned up out of scope where queue is locked to avoid
1067  // nested locks, because event cleanup requires event to be locked. Nested
1068  // locks are hard to control and can cause deadlocks if mutexes are locked in
1069  // different order.
1070  std::vector<pi_event> EventListToCleanup;
1071  {
1072  // We check for command lists that have been already signalled, but have not
1073  // been added to the available list yet. Each command list has a fence
1074  // associated which tracks if a command list has completed dispatch of its
1075  // commands and is ready for reuse. If a command list is found to have been
1076  // signalled, then the command list & fence are reset and command list is
1077  // returned to the command list cache. All events associated with command
1078  // list are cleaned up if command list was reset.
1079  std::unique_lock<pi_shared_mutex> QueueLock(Queue->Mutex);
1080  for (auto &&it = Queue->CommandListMap.begin();
1081  it != Queue->CommandListMap.end(); ++it) {
1082  // Immediate commandlists don't use a fence and are handled separately
1083  // above.
1084  assert(it->second.ZeFence != nullptr);
1085  // It is possible that the fence was already noted as signalled and
1086  // reset. In that case the ZeFenceInUse flag will be false.
1087  if (it->second.ZeFenceInUse) {
1088  ze_result_t ZeResult =
1089  ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
1090  if (ZeResult == ZE_RESULT_SUCCESS)
1091  PI_CALL(Queue->resetCommandList(it, true, EventListToCleanup));
1092  }
1093  }
1094  }
1095  CleanupEventListFromResetCmdList(EventListToCleanup);
1096  return PI_SUCCESS;
1097 }
1098 
1099 // Retrieve an available command list to be used in a PI call.
1101  pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
1102  bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {
1103  // Immediate commandlists have been pre-allocated and are always available.
1104  if (Queue->Device->ImmCommandListUsed) {
1105  CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
1106  if (CommandList->second.EventList.size() >
1108  std::vector<pi_event> EventListToCleanup;
1109  Queue->resetCommandList(CommandList, false, EventListToCleanup);
1110  CleanupEventListFromResetCmdList(EventListToCleanup, true);
1111  }
1112  PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
1113  if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
1114  return Res;
1115  return PI_SUCCESS;
1116  }
1117 
1118  auto &CommandBatch =
1119  UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
1120  // Handle batching of commands
1121  // First see if there is an command-list open for batching commands
1122  // for this queue.
1123  if (Queue->hasOpenCommandList(UseCopyEngine)) {
1124  if (AllowBatching) {
1125  CommandList = CommandBatch.OpenCommandList;
1126  PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
1127  return PI_SUCCESS;
1128  }
1129  // If this command isn't allowed to be batched or doesn't match the forced
1130  // command queue, then we need to go ahead and execute what is already in
1131  // the batched list, and then go on to process this. On exit from
1132  // executeOpenCommandList OpenCommandList will be invalidated.
1133  if (auto Res = Queue->executeOpenCommandList(UseCopyEngine))
1134  return Res;
1135  // Note that active barriers do not need to be inserted here as they will
1136  // have been enqueued into the command-list when they were created.
1137  }
1138 
1139  // Create/Reuse the command list, because in Level Zero commands are added to
1140  // the command lists, and later are then added to the command queue.
1141  // Each command list is paired with an associated fence to track when the
1142  // command list is available for reuse.
1143  _pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES;
1144 
1145  // Initally, we need to check if a command list has already been created
1146  // on this device that is available for use. If so, then reuse that
1147  // Level-Zero Command List and Fence for this PI call.
1148  {
1149  // Make sure to acquire the lock before checking the size, or there
1150  // will be a race condition.
1151  std::scoped_lock<pi_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
1152  // Under mutex since operator[] does insertion on the first usage for every
1153  // unique ZeDevice.
1154  auto &ZeCommandListCache =
1155  UseCopyEngine
1156  ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
1157  : Queue->Context
1158  ->ZeComputeCommandListCache[Queue->Device->ZeDevice];
1159 
1160  for (auto ZeCommandListIt = ZeCommandListCache.begin();
1161  ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
1162  auto &ZeCommandList = *ZeCommandListIt;
1163  auto it = Queue->CommandListMap.find(ZeCommandList);
1164  if (it != Queue->CommandListMap.end()) {
1165  if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
1166  continue;
1167  CommandList = it;
1168  if (CommandList->second.ZeFence != nullptr)
1169  CommandList->second.ZeFenceInUse = true;
1170  } else {
1171  // If there is a command list available on this context, but it
1172  // wasn't yet used in this queue then create a new entry in this
1173  // queue's map to hold the fence and other associated command
1174  // list information.
1175  auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
1176  uint32_t QueueGroupOrdinal;
1177  auto &ZeCommandQueue = ForcedCmdQueue
1178  ? *ForcedCmdQueue
1179  : QGroup.getZeQueue(&QueueGroupOrdinal);
1180  if (ForcedCmdQueue)
1181  QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
1182 
1183  ze_fence_handle_t ZeFence;
1184  ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1185  ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1186  CommandList = Queue->CommandListMap
1187  .emplace(ZeCommandList,
1188  pi_command_list_info_t{ZeFence, true, false,
1189  ZeCommandQueue,
1190  QueueGroupOrdinal})
1191  .first;
1192  }
1193  ZeCommandListCache.erase(ZeCommandListIt);
1194  if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
1195  return Res;
1196  if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
1197  return Res;
1198  return PI_SUCCESS;
1199  }
1200  }
1201 
1202  // If there are no available command lists in the cache, then we check for
1203  // command lists that have already signalled, but have not been added to the
1204  // available list yet. Each command list has a fence associated which tracks
1205  // if a command list has completed dispatch of its commands and is ready for
1206  // reuse. If a command list is found to have been signalled, then the
1207  // command list & fence are reset and we return.
1208  for (auto it = Queue->CommandListMap.begin();
1209  it != Queue->CommandListMap.end(); ++it) {
1210  // Make sure this is the command list type needed.
1211  if (UseCopyEngine != it->second.isCopy(Queue))
1212  continue;
1213 
1214  ze_result_t ZeResult =
1215  ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
1216  if (ZeResult == ZE_RESULT_SUCCESS) {
1217  std::vector<pi_event> EventListToCleanup;
1218  Queue->resetCommandList(it, false, EventListToCleanup);
1219  CleanupEventListFromResetCmdList(EventListToCleanup,
1220  true /* QueueLocked */);
1221  CommandList = it;
1222  CommandList->second.ZeFenceInUse = true;
1223  if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
1224  return Res;
1225  return PI_SUCCESS;
1226  }
1227  }
1228 
1229  // If there are no available command lists nor signalled command lists,
1230  // then we must create another command list.
1231  pi_result = Queue->createCommandList(UseCopyEngine, CommandList);
1232  CommandList->second.ZeFenceInUse = true;
1233  return pi_result;
1234 }
1235 
1237  auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID);
1238  auto &InitialGroup = Map.begin()->second;
1239 
1240  // Check if thread-specifc immediate commandlists are requested.
1241  if (Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue) {
1242  // Thread id is used to create separate imm cmdlists per thread.
1243  auto Result = Map.insert({std::this_thread::get_id(), InitialGroup});
1244  auto &QueueGroupRef = Result.first->second;
1245  // If an entry for this thread does not exists, create an entry.
1246  if (Result.second) {
1247  // Create space for immediate commandlists, which are created on demand.
1248  QueueGroupRef.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
1249  InitialGroup.ZeQueues.size(), CommandListMap.end());
1250  }
1251  return QueueGroupRef;
1252  }
1253 
1254  // If not PerThreadPerQueue then use the groups from Queue creation time.
1255  return InitialGroup;
1256 }
1257 
1258 // Helper function to create a new command-list to this queue and associated
1259 // fence tracking its completion. This command list & fence are added to the
1260 // map of command lists in this queue with ZeFenceInUse = false.
1261 // The caller must hold a lock of the queue already.
1262 pi_result
1263 _pi_queue::createCommandList(bool UseCopyEngine,
1264  pi_command_list_ptr_t &CommandList,
1265  ze_command_queue_handle_t *ForcedCmdQueue) {
1266 
1267  ze_fence_handle_t ZeFence;
1268  ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1269  ze_command_list_handle_t ZeCommandList;
1270 
1271  uint32_t QueueGroupOrdinal;
1272  auto &QGroup = getQueueGroup(UseCopyEngine);
1273  auto &ZeCommandQueue =
1274  ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal);
1275  if (ForcedCmdQueue)
1276  QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
1277 
1278  ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
1279  ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
1280 
1281  ZE_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice,
1282  &ZeCommandListDesc, &ZeCommandList));
1283 
1284  ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1285  std::tie(CommandList, std::ignore) = CommandListMap.insert(
1286  std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
1287  ZeCommandList,
1288  {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal}));
1289 
1291  PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine));
1292  return PI_SUCCESS;
1293 }
1294 
1296  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1297  auto &ZeCommandListBatchConfig =
1299  pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
1300  // QueueBatchSize of 0 means never allow batching.
1301  if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic())
1302  return;
1303  CommandBatch.NumTimesClosedFull += 1;
1304 
1305  // If the number of times the list has been closed early is low, and
1306  // the number of times it has been closed full is high, then raise
1307  // the batching size slowly. Don't raise it if it is already pretty
1308  // high.
1309  if (CommandBatch.NumTimesClosedEarly <=
1311  CommandBatch.NumTimesClosedFull >
1313  if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) {
1314  QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep;
1315  zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
1316  }
1317  CommandBatch.NumTimesClosedEarly = 0;
1318  CommandBatch.NumTimesClosedFull = 0;
1319  }
1320 }
1321 
1323  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1324  auto &ZeCommandListBatchConfig =
1326  pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
1327  // QueueBatchSize of 0 means never allow batching.
1328  if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic())
1329  return;
1330  CommandBatch.NumTimesClosedEarly += 1;
1331 
1332  // If we are closing early more than about 3x the number of times
1333  // it is closing full, lower the batch size to the value of the
1334  // current open command list. This is trying to quickly get to a
1335  // batch size that will be able to be closed full at least once
1336  // in a while.
1337  if (CommandBatch.NumTimesClosedEarly >
1338  (CommandBatch.NumTimesClosedFull + 1) * 3) {
1339  QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1;
1340  if (QueueBatchSize < 1)
1341  QueueBatchSize = 1;
1342  zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
1343  CommandBatch.NumTimesClosedEarly = 0;
1344  CommandBatch.NumTimesClosedFull = 0;
1345  }
1346 }
1347 
1349  for (auto &Kernel : KernelsToBeSubmitted) {
1350  if (!Kernel->hasIndirectAccess())
1351  continue;
1352 
1353  auto &Contexts = Device->Platform->Contexts;
1354  for (auto &Ctx : Contexts) {
1355  for (auto &Elem : Ctx->MemAllocs) {
1356  const auto &Pair = Kernel->MemAllocs.insert(&Elem);
1357  // Kernel is referencing this memory allocation from now.
1358  // If this memory allocation was already captured for this kernel, it
1359  // means that kernel is submitted several times. Increase reference
1360  // count only once because we release all allocations only when
1361  // SubmissionsCount turns to 0. We don't want to know how many times
1362  // allocation was retained by each submission.
1363  if (Pair.second)
1364  Elem.second.RefCount.increment();
1365  }
1366  }
1367  Kernel->SubmissionsCount++;
1368  }
1369  KernelsToBeSubmitted.clear();
1370 }
1371 
1373  bool IsBlocking,
1374  bool OKToBatchCommand) {
1375  // Do nothing if command list is already closed.
1376  if (CommandList->second.IsClosed)
1377  return PI_SUCCESS;
1378 
1379  bool UseCopyEngine = CommandList->second.isCopy(this);
1380 
1381  // If the current LastCommandEvent is the nullptr, then it means
1382  // either that no command has ever been issued to the queue
1383  // or it means that the LastCommandEvent has been signalled and
1384  // therefore that this Queue is idle.
1385  //
1386  // NOTE: this behavior adds some flakyness to the batching
1387  // since last command's event may or may not be completed by the
1388  // time we get here depending on timings and system/gpu load.
1389  // So, disable it for modes where we print PI traces. Printing
1390  // traces incurs much different timings than real execution
1391  // ansyway, and many regression tests use it.
1392  //
1393  bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr;
1394 
1395  // The list can be empty if command-list only contains signals of proxy
1396  // events. It is possible that executeCommandList is called twice for the same
1397  // command list without new appended command. We don't to want process the
1398  // same last command event twice that's why additionally check that new
1399  // command was appended to the command list.
1400  if (!CommandList->second.EventList.empty() &&
1401  this->LastCommandEvent != CommandList->second.EventList.back()) {
1402  this->LastCommandEvent = CommandList->second.EventList.back();
1403  if (doReuseDiscardedEvents()) {
1404  PI_CALL(resetDiscardedEvent(CommandList));
1405  }
1406  }
1407 
1408  this->LastUsedCommandList = CommandList;
1409 
1410  if (!Device->ImmCommandListUsed) {
1411  // Batch if allowed to, but don't batch if we know there are no kernels
1412  // from this queue that are currently executing. This is intended to get
1413  // kernels started as soon as possible when there are no kernels from this
1414  // queue awaiting execution, while allowing batching to occur when there
1415  // are kernels already executing. Also, if we are using fixed size batching,
1416  // as indicated by !ZeCommandListBatch.dynamic(), then just ignore
1417  // CurrentlyEmpty as we want to strictly follow the batching the user
1418  // specified.
1419  auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
1420  auto &ZeCommandListBatchConfig = UseCopyEngine
1423  if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) &&
1424  (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) {
1425 
1426  if (hasOpenCommandList(UseCopyEngine) &&
1427  CommandBatch.OpenCommandList != CommandList)
1428  die("executeCommandList: OpenCommandList should be equal to"
1429  "null or CommandList");
1430 
1431  if (CommandList->second.size() < CommandBatch.QueueBatchSize) {
1432  CommandBatch.OpenCommandList = CommandList;
1433  return PI_SUCCESS;
1434  }
1435 
1436  adjustBatchSizeForFullBatch(UseCopyEngine);
1437  CommandBatch.OpenCommandList = CommandListMap.end();
1438  }
1439  }
1440 
1441  auto &ZeCommandQueue = CommandList->second.ZeQueue;
1442  // Scope of the lock must be till the end of the function, otherwise new mem
1443  // allocs can be created between the moment when we made a snapshot and the
1444  // moment when command list is closed and executed. But mutex is locked only
1445  // if indirect access tracking enabled, because std::defer_lock is used.
1446  // unique_lock destructor at the end of the function will unlock the mutex
1447  // if it was locked (which happens only if IndirectAccessTrackingEnabled is
1448  // true).
1449  std::unique_lock<pi_shared_mutex> ContextsLock(
1450  Device->Platform->ContextsMutex, std::defer_lock);
1451 
1452  if (IndirectAccessTrackingEnabled) {
1453  // We are going to submit kernels for execution. If indirect access flag is
1454  // set for a kernel then we need to make a snapshot of existing memory
1455  // allocations in all contexts in the platform. We need to lock the mutex
1456  // guarding the list of contexts in the platform to prevent creation of new
1457  // memory alocations in any context before we submit the kernel for
1458  // execution.
1459  ContextsLock.lock();
1461  }
1462 
1463  if (!Device->ImmCommandListUsed) {
1464  // In this mode all inner-batch events have device visibility only,
1465  // and we want the last command in the batch to signal a host-visible
1466  // event that anybody waiting for any event in the batch will
1467  // really be using.
1468  // We need to create a proxy host-visible event only if the list of events
1469  // in the command list is not empty, otherwise we are going to just create
1470  // and remove proxy event right away and dereference deleted object
1471  // afterwards.
1472  if (Device->ZeEventsScope == LastCommandInBatchHostVisible &&
1473  !CommandList->second.EventList.empty()) {
1474  // If there are only internal events in the command list then we don't
1475  // need to create host proxy event.
1476  auto Result =
1477  std::find_if(CommandList->second.EventList.begin(),
1478  CommandList->second.EventList.end(),
1479  [](pi_event E) { return E->hasExternalRefs(); });
1480  if (Result != CommandList->second.EventList.end()) {
1481  // Create a "proxy" host-visible event.
1482  //
1483  pi_event HostVisibleEvent;
1484  auto Res = createEventAndAssociateQueue(
1485  this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList,
1486  /* IsInternal */ false, /* ForceHostVisible */ true);
1487  if (Res)
1488  return Res;
1489 
1490  // Update each command's event in the command-list to "see" this
1491  // proxy event as a host-visible counterpart.
1492  for (auto &Event : CommandList->second.EventList) {
1493  std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
1494  // Internal event doesn't need host-visible proxy.
1495  if (!Event->hasExternalRefs())
1496  continue;
1497 
1498  if (!Event->HostVisibleEvent) {
1499  Event->HostVisibleEvent = HostVisibleEvent;
1500  HostVisibleEvent->RefCount.increment();
1501  }
1502  }
1503 
1504  // Decrement the reference count of the event such that all the
1505  // remaining references are from the other commands in this batch and
1506  // from the command-list itself. This host-visible event will not be
1507  // waited/released by SYCL RT, so it must be destroyed after all events
1508  // in the batch are gone. We know that refcount is more than 2 because
1509  // we check that EventList of the command list is not empty above, i.e.
1510  // after createEventAndAssociateQueue ref count is 2 and then +1 for
1511  // each event in the EventList.
1512  PI_CALL(piEventReleaseInternal(HostVisibleEvent));
1513 
1514  if (doReuseDiscardedEvents()) {
1515  // If we have in-order queue with discarded events then we want to
1516  // treat this event as regular event. We insert a barrier in the next
1517  // command list to wait for this event.
1518  LastCommandEvent = HostVisibleEvent;
1519  } else {
1520  // For all other queues treat this as a special event and indicate no
1521  // cleanup is needed.
1522  // TODO: always treat this host event as a regular event.
1523  PI_CALL(piEventReleaseInternal(HostVisibleEvent));
1524  HostVisibleEvent->CleanedUp = true;
1525  }
1526 
1527  // Finally set to signal the host-visible event at the end of the
1528  // command-list after a barrier that waits for all commands
1529  // completion.
1532  // If we the last event is discarded then we already have a barrier
1533  // inserted, so just signal the event.
1534  ZE_CALL(zeCommandListAppendSignalEvent,
1535  (CommandList->first, HostVisibleEvent->ZeEvent));
1536  } else {
1537  ZE_CALL(zeCommandListAppendBarrier,
1538  (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr));
1539  }
1540  } else {
1541  // If we don't have host visible proxy then signal event if needed.
1542  this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
1543  }
1544  } else {
1545  // If we don't have host visible proxy then signal event if needed.
1546  this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
1547  }
1548 
1549  // Close the command list and have it ready for dispatch.
1550  ZE_CALL(zeCommandListClose, (CommandList->first));
1551  // Mark this command list as closed.
1552  CommandList->second.IsClosed = true;
1553  this->LastUsedCommandList = CommandListMap.end();
1554  // Offload command list to the GPU for asynchronous execution
1555  auto ZeCommandList = CommandList->first;
1556  auto ZeResult = ZE_CALL_NOCHECK(
1557  zeCommandQueueExecuteCommandLists,
1558  (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));
1559  if (ZeResult != ZE_RESULT_SUCCESS) {
1560  this->Healthy = false;
1561  if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1562  // Turn into a more informative end-user error.
1563  return PI_ERROR_COMMAND_EXECUTION_FAILURE;
1564  }
1565  return mapError(ZeResult);
1566  }
1567  }
1568 
1569  // Check global control to make every command blocking for debugging.
1570  if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
1571  if (Device->ImmCommandListUsed) {
1572  synchronize();
1573  } else {
1574  // Wait until command lists attached to the command queue are executed.
1575  ZE_CALL(zeHostSynchronize, (ZeCommandQueue));
1576  }
1577  }
1578  return PI_SUCCESS;
1579 }
1580 
1581 bool _pi_queue::isBatchingAllowed(bool IsCopy) const {
1582  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1583  return (CommandBatch.QueueBatchSize > 0 &&
1584  ((ZeSerialize & ZeSerializeBlock) == 0));
1585 }
1586 
1587 // Return the index of the next queue to use based on a
1588 // round robin strategy and the queue group ordinal.
1589 uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal,
1590  uint32_t *QueueIndex,
1591  bool QueryOnly) {
1592  auto CurrentIndex = NextIndex;
1593 
1594  if (!QueryOnly) {
1595  ++NextIndex;
1596  if (NextIndex > UpperIndex)
1598  }
1599 
1600  // Find out the right queue group ordinal (first queue might be "main" or
1601  // "link")
1602  auto QueueType = Type;
1603  if (QueueType != queue_type::Compute)
1604  QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine())
1605  ? queue_type::MainCopy
1606  : queue_type::LinkCopy;
1607 
1608  *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal;
1609  // Adjust the index to the L0 queue group since we represent "main" and
1610  // "link"
1611  // L0 groups with a single copy group ("main" would take "0" index).
1612  auto ZeCommandQueueIndex = CurrentIndex;
1613  if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) {
1614  ZeCommandQueueIndex -= 1;
1615  }
1616  *QueueIndex = ZeCommandQueueIndex;
1617 
1618  return CurrentIndex;
1619 }
1620 
1622  ze_command_queue_handle_t CmdQueue) {
1623  // Find out the right queue group ordinal (first queue might be "main" or
1624  // "link")
1625  auto QueueType = Type;
1626  if (QueueType != queue_type::Compute)
1627  QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine())
1628  ? queue_type::MainCopy
1629  : queue_type::LinkCopy;
1630  return Queue->Device->QueueGroup[QueueType].ZeOrdinal;
1631 }
1632 
1633 // This function will return one of possibly multiple available native
1634 // queues and the value of the queue group ordinal.
1636 _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) {
1637 
1638  // QueueIndex is the proper L0 index.
1639  // Index is the plugins concept of index, with main and link copy engines in
1640  // one range.
1641  uint32_t QueueIndex;
1642  auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex);
1643 
1644  ze_command_queue_handle_t &ZeQueue = ZeQueues[Index];
1645  if (ZeQueue)
1646  return ZeQueue;
1647 
1648  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
1649  ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal;
1650  ZeCommandQueueDesc.index = QueueIndex;
1651  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1652  const char *Priority = "Normal";
1653  if (Queue->isPriorityLow()) {
1654  ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW;
1655  Priority = "Low";
1656  } else if (Queue->isPriorityHigh()) {
1657  ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
1658  Priority = "High";
1659  }
1660 
1661  // Evaluate performance of explicit usage for "0" index.
1662  if (QueueIndex != 0) {
1663  ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1664  }
1665 
1666  zePrint("[getZeQueue]: create queue ordinal = %d, index = %d "
1667  "(round robin in [%d, %d]) priority = %s\n",
1668  ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1669  UpperIndex, Priority);
1670 
1671  auto ZeResult = ZE_CALL_NOCHECK(
1672  zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1673  &ZeCommandQueueDesc, &ZeQueue));
1674  if (ZeResult) {
1675  die("[L0] getZeQueue: failed to create queue");
1676  }
1677 
1678  return ZeQueue;
1679 }
1680 
1681 // This function will return one of possibly multiple available
1682 // immediate commandlists associated with this Queue.
1684 
1685  uint32_t QueueIndex, QueueOrdinal;
1686  auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
1687 
1688  if (ImmCmdLists[Index] != Queue->CommandListMap.end())
1689  return ImmCmdLists[Index];
1690 
1691  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
1692  ZeCommandQueueDesc.ordinal = QueueOrdinal;
1693  ZeCommandQueueDesc.index = QueueIndex;
1694  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1695  const char *Priority = "Normal";
1696  if (Queue->isPriorityLow()) {
1697  ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW;
1698  Priority = "Low";
1699  } else if (Queue->isPriorityHigh()) {
1700  ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
1701  Priority = "High";
1702  }
1703 
1704  // Evaluate performance of explicit usage for "0" index.
1705  if (QueueIndex != 0) {
1706  ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1707  }
1708 
1709  zePrint("[getZeQueue]: create queue ordinal = %d, index = %d "
1710  "(round robin in [%d, %d]) priority = %s\n",
1711  ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1712  UpperIndex, Priority);
1713 
1714  ze_command_list_handle_t ZeCommandList;
1715  ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
1716  (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1717  &ZeCommandQueueDesc, &ZeCommandList));
1718  ImmCmdLists[Index] =
1719  Queue->CommandListMap
1720  .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
1721  ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}})
1722  .first;
1723  // Add this commandlist to the cache so it can be destroyed as part of
1724  // piQueueReleaseInternal
1725  auto QueueType = Type;
1726  std::scoped_lock<pi_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
1727  auto &ZeCommandListCache =
1728  QueueType == queue_type::Compute
1729  ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
1730  : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
1731  ZeCommandListCache.push_back(ZeCommandList);
1732 
1733  return ImmCmdLists[Index];
1734 }
1735 
1737  using IsCopy = bool;
1738 
1739  if (Device->ImmCommandListUsed) {
1740  // When using immediate commandlists there are no open command lists.
1741  return CommandListMap.end();
1742  }
1743 
1744  if (hasOpenCommandList(IsCopy{false})) {
1745  const auto &ComputeEventList =
1746  ComputeCommandBatch.OpenCommandList->second.EventList;
1747  if (std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) !=
1748  ComputeEventList.end())
1750  }
1751  if (hasOpenCommandList(IsCopy{true})) {
1752  const auto &CopyEventList =
1753  CopyCommandBatch.OpenCommandList->second.EventList;
1754  if (std::find(CopyEventList.begin(), CopyEventList.end(), Event) !=
1755  CopyEventList.end())
1757  }
1758  return CommandListMap.end();
1759 }
1760 
1762  pi_command_list_ptr_t &CmdList) {
1763  // If current command list is different from the last command list then insert
1764  // a barrier waiting for the last command event.
1765  if (doReuseDiscardedEvents() && CmdList != LastUsedCommandList &&
1766  LastCommandEvent) {
1767  ZE_CALL(zeCommandListAppendBarrier,
1768  (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent)));
1769  LastCommandEvent = nullptr;
1770  }
1771  return PI_SUCCESS;
1772 }
1773 
1775  bool UseCopyEngine) {
1776  // Early exit if there are no active barriers.
1777  if (ActiveBarriers.empty())
1778  return PI_SUCCESS;
1779 
1780  // Create a wait-list and retain events.
1781  _pi_ze_event_list_t ActiveBarriersWaitList;
1782  if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList(
1783  ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), this,
1784  UseCopyEngine))
1785  return Res;
1786 
1787  // We can now replace active barriers with the ones in the wait list.
1788  if (auto Res = ActiveBarriers.clear())
1789  return Res;
1790 
1791  if (ActiveBarriersWaitList.Length == 0) {
1792  return PI_SUCCESS;
1793  }
1794 
1795  for (pi_uint32 I = 0; I < ActiveBarriersWaitList.Length; ++I) {
1796  auto &Event = ActiveBarriersWaitList.PiEventList[I];
1797  ActiveBarriers.add(Event);
1798  }
1799 
1800  pi_event Event = nullptr;
1801  if (auto Res = createEventAndAssociateQueue(
1802  this, &Event, PI_COMMAND_TYPE_USER, CmdList, /*IsInternal*/ true))
1803  return Res;
1804 
1805  Event->WaitList = ActiveBarriersWaitList;
1806  Event->OwnZeEvent = true;
1807 
1808  // If there are more active barriers, insert a barrier on the command-list. We
1809  // do not need an event for finishing so we pass nullptr.
1810  ZE_CALL(zeCommandListAppendBarrier,
1811  (CmdList->first, nullptr, ActiveBarriersWaitList.Length,
1812  ActiveBarriersWaitList.ZeEventList));
1813  return PI_SUCCESS;
1814 }
1815 
1817  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1818  // If there are any commands still in the open command list for this
1819  // queue, then close and execute that command list now.
1820  if (hasOpenCommandList(IsCopy)) {
1822  auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false);
1823  CommandBatch.OpenCommandList = CommandListMap.end();
1824  return Res;
1825  }
1826 
1827  return PI_SUCCESS;
1828 }
1829 
1830 static const bool FilterEventWaitList = [] {
1831  const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST");
1832  const bool RetVal = Ret ? std::stoi(Ret) : 0;
1833  return RetVal;
1834 }();
1835 
1837  pi_uint32 EventListLength, const pi_event *EventList, pi_queue CurQueue,
1838  bool UseCopyEngine) {
1839  this->Length = 0;
1840  this->ZeEventList = nullptr;
1841  this->PiEventList = nullptr;
1842 
1843  if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) {
1844  if (CurQueue->Device->ImmCommandListUsed) {
1845  if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) {
1846  // If queue is in-order with discarded events and if
1847  // new command list is different from the last used command list then
1848  // signal new event from the last immediate command list. We are going
1849  // to insert a barrier in the new command list waiting for that event.
1850  auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine);
1851  uint32_t QueueGroupOrdinal, QueueIndex;
1852  auto NextIndex =
1853  QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex,
1854  /*QueryOnly */ true);
1855  auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex];
1856  if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() &&
1857  CurQueue->LastUsedCommandList != NextImmCmdList) {
1859  CurQueue->LastUsedCommandList);
1860  }
1861  }
1862  } else {
1863  // Ensure LastCommandEvent's batch is submitted if it is differrent
1864  // from the one this command is going to. If we reuse discarded events
1865  // then signalEventFromCmdListIfLastEventDiscarded will be called at batch
1866  // close if needed.
1867  const auto &OpenCommandList =
1868  CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent);
1869  if (OpenCommandList != CurQueue->CommandListMap.end() &&
1870  OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) {
1871 
1872  if (auto Res = CurQueue->executeOpenCommandList(
1873  OpenCommandList->second.isCopy(CurQueue)))
1874  return Res;
1875  }
1876  }
1877  }
1878 
1879  // For in-order queues, every command should be executed only after the
1880  // previous command has finished. The event associated with the last
1881  // enqueued command is added into the waitlist to ensure in-order semantics.
1882  bool IncludeLastCommandEvent =
1883  CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr;
1884 
1885  // If the last event is discarded then we already have a barrier waiting for
1886  // that event, so must not include the last command event into the wait
1887  // list because it will cause waiting for event which was reset.
1888  if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() &&
1889  CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded)
1890  IncludeLastCommandEvent = false;
1891 
1892  try {
1893  pi_uint32 TmpListLength = 0;
1894 
1895  if (IncludeLastCommandEvent) {
1896  this->ZeEventList = new ze_event_handle_t[EventListLength + 1];
1897  this->PiEventList = new pi_event[EventListLength + 1];
1898  std::shared_lock<pi_shared_mutex> Lock(CurQueue->LastCommandEvent->Mutex);
1899  this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent;
1900  this->PiEventList[0] = CurQueue->LastCommandEvent;
1901  TmpListLength = 1;
1902  } else if (EventListLength > 0) {
1903  this->ZeEventList = new ze_event_handle_t[EventListLength];
1904  this->PiEventList = new pi_event[EventListLength];
1905  }
1906 
1907  if (EventListLength > 0) {
1908  for (pi_uint32 I = 0; I < EventListLength; I++) {
1909  PI_ASSERT(EventList[I] != nullptr, PI_ERROR_INVALID_VALUE);
1910  {
1911  std::shared_lock<pi_shared_mutex> Lock(EventList[I]->Mutex);
1912  if (EventList[I]->Completed)
1913  continue;
1914 
1915  // Poll of the host-visible events.
1916  auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
1917  if (FilterEventWaitList && HostVisibleEvent) {
1918  auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus,
1919  (HostVisibleEvent->ZeEvent));
1920  if (Res == ZE_RESULT_SUCCESS) {
1921  // Event has already completed, don't put it into the list
1922  continue;
1923  }
1924  }
1925  }
1926 
1927  auto Queue = EventList[I]->Queue;
1928  if (Queue) {
1929  // The caller of createAndRetainPiZeEventList must already hold
1930  // a lock of the CurQueue. Additionally lock the Queue if it
1931  // is different from CurQueue.
1932  // TODO: rework this to avoid deadlock when another thread is
1933  // locking the same queues but in a different order.
1934  auto Lock = ((Queue == CurQueue)
1935  ? std::unique_lock<pi_shared_mutex>()
1936  : std::unique_lock<pi_shared_mutex>(Queue->Mutex));
1937 
1938  // If the event that is going to be waited is in an open batch
1939  // different from where this next command is going to be added,
1940  // then we have to force execute of that open command-list
1941  // to avoid deadlocks.
1942  //
1943  const auto &OpenCommandList =
1944  Queue->eventOpenCommandList(EventList[I]);
1945  if (OpenCommandList != Queue->CommandListMap.end()) {
1946 
1947  if (Queue == CurQueue &&
1948  OpenCommandList->second.isCopy(Queue) == UseCopyEngine) {
1949  // Don't force execute the batch yet since the new command
1950  // is going to the same open batch as the dependent event.
1951  } else {
1952  if (auto Res = Queue->executeOpenCommandList(
1953  OpenCommandList->second.isCopy(Queue)))
1954  return Res;
1955  }
1956  }
1957  } else {
1958  // There is a dependency on an interop-event.
1959  // Similarily to the above to avoid dead locks ensure that
1960  // execution of all prior commands in the current command-
1961  // batch is visible to the host. This may not be the case
1962  // when we intended to have only last command in the batch
1963  // produce host-visible event, e.g.
1964  //
1965  // event0 = interop event
1966  // event1 = command1 (already in batch, no deps)
1967  // event2 = command2 (is being added, dep on event0)
1968  // event3 = signal host-visible event for the batch
1969  // event1.wait()
1970  // event0.signal()
1971  //
1972  // Make sure that event1.wait() will wait for a host-visible
1973  // event that is signalled before the command2 is enqueued.
1974  if (CurQueue->Device->ZeEventsScope != AllHostVisible) {
1975  CurQueue->executeAllOpenCommandLists();
1976  }
1977  }
1978 
1979  std::shared_lock<pi_shared_mutex> Lock(EventList[I]->Mutex);
1980  this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent;
1981  this->PiEventList[TmpListLength] = EventList[I];
1982  TmpListLength += 1;
1983  }
1984  }
1985 
1986  this->Length = TmpListLength;
1987 
1988  } catch (...) {
1989  return PI_ERROR_OUT_OF_HOST_MEMORY;
1990  }
1991 
1992  for (pi_uint32 I = 0; I < this->Length; I++) {
1993  this->PiEventList[I]->RefCount.increment();
1994  }
1995 
1996  return PI_SUCCESS;
1997 }
1998 
1999 static void printZeEventList(const _pi_ze_event_list_t &PiZeEventList) {
2000  zePrint(" NumEventsInWaitList %d:", PiZeEventList.Length);
2001 
2002  for (pi_uint32 I = 0; I < PiZeEventList.Length; I++) {
2003  zePrint(" %#llx", pi_cast<std::uintptr_t>(PiZeEventList.ZeEventList[I]));
2004  }
2005 
2006  zePrint("\n");
2007 }
2008 
2010  std::list<pi_event> &EventsToBeReleased) {
2011  // acquire a lock before reading the length and list fields.
2012  // Acquire the lock, copy the needed data locally, and reset
2013  // the fields, then release the lock.
2014  // Only then do we do the actual actions to release and destroy,
2015  // holding the lock for the minimum time necessary.
2016  pi_uint32 LocLength = 0;
2017  ze_event_handle_t *LocZeEventList = nullptr;
2018  pi_event *LocPiEventList = nullptr;
2019 
2020  {
2021  // acquire the lock and copy fields locally
2022  // Lock automatically releases when this goes out of scope.
2023  std::scoped_lock<pi_mutex> lock(this->PiZeEventListMutex);
2024 
2025  LocLength = Length;
2026  LocZeEventList = ZeEventList;
2027  LocPiEventList = PiEventList;
2028 
2029  Length = 0;
2030  ZeEventList = nullptr;
2031  PiEventList = nullptr;
2032 
2033  // release lock by ending scope.
2034  }
2035 
2036  for (pi_uint32 I = 0; I < LocLength; I++) {
2037  // Add the event to be released to the list
2038  EventsToBeReleased.push_back(LocPiEventList[I]);
2039  }
2040 
2041  if (LocZeEventList != nullptr) {
2042  delete[] LocZeEventList;
2043  }
2044  if (LocPiEventList != nullptr) {
2045  delete[] LocPiEventList;
2046  }
2047 
2048  return PI_SUCCESS;
2049 }
2050 
2051 extern "C" {
2052 
2053 // Forward declarations
2055 
2056 static ze_result_t
2058  ze_module_build_log_handle_t *ZeBuildLog);
2059 
2061  pi_uint32 *NumPlatforms) {
2062  return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms);
2063 }
2064 
2066  size_t ParamValueSize, void *ParamValue,
2067  size_t *ParamValueSizeRet) {
2068  zePrint("==========================\n");
2069  zePrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str());
2070  zePrint("==========================\n");
2071 
2072  // To distinguish this L0 platform from Unified Runtime one.
2073  if (ParamName == PI_PLATFORM_INFO_NAME) {
2074  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2075  return ReturnValue("Intel(R) Level-Zero");
2076  }
2077  return pi2ur::piPlatformGetInfo(Platform, ParamName, ParamValueSize,
2078  ParamValue, ParamValueSizeRet);
2079 }
2080 
2082  pi_native_handle *NativeHandle) {
2083  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2084  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2085 
2086  auto ZeDriver = pi_cast<ze_driver_handle_t *>(NativeHandle);
2087  // Extract the Level Zero driver handle from the given PI platform
2088  *ZeDriver = Platform->ZeDriver;
2089  return PI_SUCCESS;
2090 }
2091 
2093  pi_platform *Platform) {
2094  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2095  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2096 
2097  auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
2098 
2099  pi_uint32 NumPlatforms = 0;
2100  pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms);
2101  if (Res != PI_SUCCESS) {
2102  return Res;
2103  }
2104 
2105  if (NumPlatforms) {
2106  std::vector<pi_platform> Platforms(NumPlatforms);
2107  PI_CALL(piPlatformsGet(NumPlatforms, Platforms.data(), nullptr));
2108 
2109  // The SYCL spec requires that the set of platforms must remain fixed for
2110  // the duration of the application's execution. We assume that we found all
2111  // of the Level Zero drivers when we initialized the platform cache, so the
2112  // "NativeHandle" must already be in the cache. If it is not, this must not
2113  // be a valid Level Zero driver.
2114  for (const pi_platform &CachedPlatform : Platforms) {
2115  if (CachedPlatform->ZeDriver == ZeDriver) {
2116  *Platform = CachedPlatform;
2117  return PI_SUCCESS;
2118  }
2119  }
2120  }
2121 
2122  return PI_ERROR_INVALID_VALUE;
2123 }
2124 
2126  return pi2ur::piPluginGetLastError(message);
2127 }
2128 
2130  pi_uint32 NumEntries, pi_device *Devices,
2131  pi_uint32 *NumDevices) {
2132  return pi2ur::piDevicesGet(Platform, DeviceType, NumEntries, Devices,
2133  NumDevices);
2134 }
2135 
2137  return pi2ur::piDeviceRetain(Device);
2138 }
2139 
2142 }
2143 
2145  size_t ParamValueSize, void *ParamValue,
2146  size_t *ParamValueSizeRet) {
2147  return pi2ur::piDeviceGetInfo(Device, ParamName, ParamValueSize, ParamValue,
2148  ParamValueSizeRet);
2149 }
2150 
2153  pi_uint32 NumDevices, pi_device *OutDevices,
2154  pi_uint32 *OutNumDevices) {
2155  return pi2ur::piDevicePartition(Device, Properties, NumDevices, OutDevices,
2156  OutNumDevices);
2157 }
2158 
2159 pi_result
2160 piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context?
2161  pi_device_binary *Binaries, pi_uint32 NumBinaries,
2162  pi_uint32 *SelectedBinaryInd) {
2163 
2164  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2165  PI_ASSERT(SelectedBinaryInd, PI_ERROR_INVALID_VALUE);
2166  PI_ASSERT(NumBinaries == 0 || Binaries, PI_ERROR_INVALID_VALUE);
2167 
2168  // TODO: this is a bare-bones implementation for choosing a device image
2169  // that would be compatible with the targeted device. An AOT-compiled
2170  // image is preferred over SPIR-V for known devices (i.e. Intel devices)
2171  // The implementation makes no effort to differentiate between multiple images
2172  // for the given device, and simply picks the first one compatible.
2173  //
2174  // Real implementation will use the same mechanism OpenCL ICD dispatcher
2175  // uses. Something like:
2176  // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT);
2177  // return context->dispatch->piextDeviceSelectIR(
2178  // ctx, images, num_images, selected_image);
2179  // where context->dispatch is set to the dispatch table provided by PI
2180  // plugin for platform/device the ctx was created for.
2181 
2182  // Look for GEN binary, which we known can only be handled by Level-Zero now.
2183  const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN;
2184 
2185  // Find the appropriate device image, fallback to spirv if not found
2186  constexpr pi_uint32 InvalidInd = std::numeric_limits<pi_uint32>::max();
2187  pi_uint32 Spirv = InvalidInd;
2188 
2189  for (pi_uint32 i = 0; i < NumBinaries; ++i) {
2190  if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) {
2191  *SelectedBinaryInd = i;
2192  return PI_SUCCESS;
2193  }
2194  if (strcmp(Binaries[i]->DeviceTargetSpec,
2196  Spirv = i;
2197  }
2198  // Points to a spirv image, if such indeed was found
2199  if ((*SelectedBinaryInd = Spirv) != InvalidInd)
2200  return PI_SUCCESS;
2201 
2202  // No image can be loaded for the given device
2203  return PI_ERROR_INVALID_BINARY;
2204 }
2205 
2207  pi_native_handle *NativeHandle) {
2208  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2209  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2210 
2211  auto ZeDevice = pi_cast<ze_device_handle_t *>(NativeHandle);
2212  // Extract the Level Zero module handle from the given PI device
2213  *ZeDevice = Device->ZeDevice;
2214  return PI_SUCCESS;
2215 }
2216 
2218  pi_platform Platform,
2219  pi_device *Device) {
2220  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2221  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2222 
2223  auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
2224 
2225  // The SYCL spec requires that the set of devices must remain fixed for the
2226  // duration of the application's execution. We assume that we found all of the
2227  // Level Zero devices when we initialized the platforms/devices cache, so the
2228  // "NativeHandle" must already be in the cache. If it is not, this must not be
2229  // a valid Level Zero device.
2230  //
2231  // TODO: maybe we should populate cache of platforms if it wasn't already.
2232  // For now assert that is was populated.
2233  PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE);
2234  const std::lock_guard<SpinLock> Lock{*PiPlatformsCacheMutex};
2235 
2236  pi_device Dev = nullptr;
2237  for (pi_platform ThePlatform : *PiPlatformsCache) {
2238  Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice);
2239  if (Dev) {
2240  // Check that the input Platform, if was given, matches the found one.
2241  PI_ASSERT(!Platform || Platform == ThePlatform,
2242  PI_ERROR_INVALID_PLATFORM);
2243  break;
2244  }
2245  }
2246 
2247  if (Dev == nullptr)
2248  return PI_ERROR_INVALID_VALUE;
2249 
2250  *Device = Dev;
2251  return PI_SUCCESS;
2252 }
2253 
2255  pi_uint32 NumDevices, const pi_device *Devices,
2256  void (*PFnNotify)(const char *ErrInfo,
2257  const void *PrivateInfo, size_t CB,
2258  void *UserData),
2259  void *UserData, pi_context *RetContext) {
2260  (void)Properties;
2261  (void)PFnNotify;
2262  (void)UserData;
2263  PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
2264  PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
2265  PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
2266 
2267  pi_platform Platform = (*Devices)->Platform;
2268  ZeStruct<ze_context_desc_t> ContextDesc;
2269  ContextDesc.flags = 0;
2270 
2271  ze_context_handle_t ZeContext;
2272  ZE_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext));
2273  try {
2274  *RetContext = new _pi_context(ZeContext, NumDevices, Devices, true);
2275  (*RetContext)->initialize();
2276  if (IndirectAccessTrackingEnabled) {
2277  std::scoped_lock<pi_shared_mutex> Lock(Platform->ContextsMutex);
2278  Platform->Contexts.push_back(*RetContext);
2279  }
2280  } catch (const std::bad_alloc &) {
2281  return PI_ERROR_OUT_OF_HOST_MEMORY;
2282  } catch (...) {
2283  return PI_ERROR_UNKNOWN;
2284  }
2285 
2286  return PI_SUCCESS;
2287 }
2288 
2290  size_t ParamValueSize, void *ParamValue,
2291  size_t *ParamValueSizeRet) {
2292 
2293  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2294 
2295  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
2296  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2297  switch (ParamName) {
2299  return ReturnValue(&Context->Devices[0], Context->Devices.size());
2301  return ReturnValue(pi_uint32(Context->Devices.size()));
2303  return ReturnValue(pi_uint32{Context->RefCount.load()});
2305  // 2D USM memcpy is supported unless disabled through
2306  // SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D.
2307  return ReturnValue(pi_bool{UseMemcpy2DOperations});
2310  // 2D USM fill and memset is not supported.
2311  return ReturnValue(pi_bool{false});
2316  // These queries should be dealt with in context_impl.cpp by calling the
2317  // queries of each device separately and building the intersection set.
2318  setErrorMessage("These queries should have never come here.",
2319  UR_RESULT_ERROR_INVALID_VALUE);
2320  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
2321  }
2322  default:
2323  // TODO: implement other parameters
2324  die("piGetContextInfo: unsuppported ParamName.");
2325  }
2326 
2327  return PI_SUCCESS;
2328 }
2329 
2330 // FIXME: Dummy implementation to prevent link fail
2332  pi_context_extended_deleter Function,
2333  void *UserData) {
2334  (void)Context;
2335  (void)Function;
2336  (void)UserData;
2337  die("piextContextSetExtendedDeleter: not supported");
2338  return PI_SUCCESS;
2339 }
2340 
2342  pi_native_handle *NativeHandle) {
2343  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2344  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2345 
2346  auto ZeContext = pi_cast<ze_context_handle_t *>(NativeHandle);
2347  // Extract the Level Zero queue handle from the given PI queue
2348  *ZeContext = Context->ZeContext;
2349  return PI_SUCCESS;
2350 }
2351 
2353  pi_uint32 NumDevices,
2354  const pi_device *Devices,
2355  bool OwnNativeHandle,
2356  pi_context *RetContext) {
2357  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2358  PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
2359  PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
2360  PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
2361 
2362  try {
2363  *RetContext = new _pi_context(pi_cast<ze_context_handle_t>(NativeHandle),
2364  NumDevices, Devices, OwnNativeHandle);
2365  (*RetContext)->initialize();
2366  } catch (const std::bad_alloc &) {
2367  return PI_ERROR_OUT_OF_HOST_MEMORY;
2368  } catch (...) {
2369  return PI_ERROR_UNKNOWN;
2370  }
2371 
2372  return PI_SUCCESS;
2373 }
2374 
2376 
2377  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2378 
2379  Context->RefCount.increment();
2380  return PI_SUCCESS;
2381 }
2382 
2383 // Helper function to release the context, a caller must lock the platform-level
2384 // mutex guarding the container with contexts because the context can be removed
2385 // from the list of tracked contexts.
2387 
2388  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2389 
2390  if (!Context->RefCount.decrementAndTest())
2391  return PI_SUCCESS;
2392 
2393  if (IndirectAccessTrackingEnabled) {
2394  pi_platform Plt = Context->getPlatform();
2395  auto &Contexts = Plt->Contexts;
2396  auto It = std::find(Contexts.begin(), Contexts.end(), Context);
2397  if (It != Contexts.end())
2398  Contexts.erase(It);
2399  }
2400  ze_context_handle_t DestoryZeContext =
2401  Context->OwnZeContext ? Context->ZeContext : nullptr;
2402 
2403  // Clean up any live memory associated with Context
2404  pi_result Result = Context->finalize();
2405 
2406  // We must delete Context first and then destroy zeContext because
2407  // Context deallocation requires ZeContext in some member deallocation of
2408  // pi_context.
2409  delete Context;
2410 
2411  // Destruction of some members of pi_context uses L0 context
2412  // and therefore it must be valid at that point.
2413  // Technically it should be placed to the destructor of pi_context
2414  // but this makes API error handling more complex.
2415  if (DestoryZeContext)
2416  ZE_CALL(zeContextDestroy, (DestoryZeContext));
2417 
2418  return Result;
2419 }
2420 
2422  pi_platform Plt = Context->getPlatform();
2423  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
2424  std::defer_lock);
2425  if (IndirectAccessTrackingEnabled)
2426  ContextsLock.lock();
2427 
2428  return ContextReleaseHelper(Context);
2429 }
2430 
2432  pi_queue_properties Flags, pi_queue *Queue) {
2434  return piextQueueCreate(Context, Device, Properties, Queue);
2435 }
2438  PI_ASSERT(Properties, PI_ERROR_INVALID_VALUE);
2439  // Expect flags mask to be passed first.
2440  PI_ASSERT(Properties[0] == PI_QUEUE_FLAGS, PI_ERROR_INVALID_VALUE);
2441  pi_queue_properties Flags = Properties[1];
2442 
2443  PI_ASSERT(Properties[2] == 0 ||
2444  (Properties[2] == PI_QUEUE_COMPUTE_INDEX && Properties[4] == 0),
2445  PI_ERROR_INVALID_VALUE);
2446  auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX
2447  ? static_cast<int>(Properties[3])
2448  : -1; // Use default/round-robin.
2449 
2450  // Check that unexpected bits are not set.
2451  PI_ASSERT(
2458  PI_ERROR_INVALID_VALUE);
2459 
2460  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2461  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2462  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2463  PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_DEVICE);
2464 
2465  // Create placeholder queues in the compute queue group.
2466  // Actual L0 queues will be created at first use.
2467  std::vector<ze_command_queue_handle_t> ZeComputeCommandQueues(
2468  Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues,
2469  nullptr);
2470 
2471  // Create placeholder queues in the copy queue group (main and link
2472  // native groups are combined into one group).
2473  // Actual L0 queues will be created at first use.
2474  size_t NumCopyGroups = 0;
2475  if (Device->hasMainCopyEngine()) {
2476  NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::MainCopy]
2477  .ZeProperties.numQueues;
2478  }
2479  if (Device->hasLinkCopyEngine()) {
2480  NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::LinkCopy]
2481  .ZeProperties.numQueues;
2482  }
2483  std::vector<ze_command_queue_handle_t> ZeCopyCommandQueues(NumCopyGroups,
2484  nullptr);
2485 
2486  try {
2487  *Queue = new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues, Context,
2488  Device, true, Flags, ForceComputeIndex);
2489  } catch (const std::bad_alloc &) {
2490  return PI_ERROR_OUT_OF_HOST_MEMORY;
2491  } catch (...) {
2492  return PI_ERROR_UNKNOWN;
2493  }
2494 
2495  // Do eager initialization of Level Zero handles on request.
2496  if (doEagerInit) {
2497  pi_queue Q = *Queue;
2498  // Creates said number of command-lists.
2499  auto warmupQueueGroup = [Q](bool UseCopyEngine,
2500  uint32_t RepeatCount) -> pi_result {
2501  pi_command_list_ptr_t CommandList;
2502  while (RepeatCount--) {
2503  if (Q->Device->ImmCommandListUsed) {
2504  CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList();
2505  } else {
2506  // Heuristically create some number of regular command-list to reuse.
2507  for (int I = 0; I < 10; ++I) {
2508  PI_CALL(Q->createCommandList(UseCopyEngine, CommandList));
2509  // Immediately return them to the cache of available command-lists.
2510  std::vector<pi_event> EventsUnused;
2511  PI_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */,
2512  EventsUnused));
2513  }
2514  }
2515  }
2516  return PI_SUCCESS;
2517  };
2518  // Create as many command-lists as there are queues in the group.
2519  // With this the underlying round-robin logic would initialize all
2520  // native queues, and create command-lists and their fences.
2521  // At this point only the thread creating the queue will have associated
2522  // command-lists. Other threads have not accessed the queue yet. So we can
2523  // only warmup the initial thread's command-lists.
2524  auto InitialGroup = Q->ComputeQueueGroupsByTID.begin()->second;
2525  PI_CALL(warmupQueueGroup(false, InitialGroup.UpperIndex -
2526  InitialGroup.LowerIndex + 1));
2527  if (Q->useCopyEngine()) {
2528  auto InitialGroup = Q->CopyQueueGroupsByTID.begin()->second;
2529  PI_CALL(warmupQueueGroup(true, InitialGroup.UpperIndex -
2530  InitialGroup.LowerIndex + 1));
2531  }
2532  // TODO: warmup event pools. Both host-visible and device-only.
2533  }
2534  return PI_SUCCESS;
2535 }
2536 
2538  size_t ParamValueSize, void *ParamValue,
2539  size_t *ParamValueSizeRet) {
2540 
2541  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2542 
2543  std::shared_lock<pi_shared_mutex> Lock(Queue->Mutex);
2544  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2545  // TODO: consider support for queue properties and size
2546  switch (ParamName) {
2547  case PI_QUEUE_INFO_CONTEXT:
2548  return ReturnValue(Queue->Context);
2549  case PI_QUEUE_INFO_DEVICE:
2550  return ReturnValue(Queue->Device);
2552  return ReturnValue(pi_uint32{Queue->RefCount.load()});
2554  die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n");
2555  break;
2556  case PI_QUEUE_INFO_SIZE:
2557  die("PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n");
2558  break;
2560  die("PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo not implemented\n");
2561  break;
2563  // We can exit early if we have in-order queue.
2564  if (Queue->isInOrderQueue()) {
2565  if (!Queue->LastCommandEvent)
2566  return ReturnValue(pi_bool{true});
2567 
2568  // We can check status of the event only if it isn't discarded otherwise
2569  // it may be reset (because we are free to reuse such events) and
2570  // zeEventQueryStatus will hang.
2571  // TODO: use more robust way to check that ZeEvent is not owned by
2572  // LastCommandEvent.
2573  if (!Queue->LastCommandEvent->IsDiscarded) {
2574  ze_result_t ZeResult = ZE_CALL_NOCHECK(
2575  zeEventQueryStatus, (Queue->LastCommandEvent->ZeEvent));
2576  if (ZeResult == ZE_RESULT_NOT_READY) {
2577  return ReturnValue(pi_bool{false});
2578  } else if (ZeResult != ZE_RESULT_SUCCESS) {
2579  return mapError(ZeResult);
2580  }
2581  return ReturnValue(pi_bool{true});
2582  }
2583  // For immediate command lists we have to check status of the event
2584  // because immediate command lists are not associated with level zero
2585  // queue. Conservatively return false in this case because last event is
2586  // discarded and we can't check its status.
2587  if (Queue->Device->ImmCommandListUsed)
2588  return ReturnValue(pi_bool{false});
2589  }
2590 
2591  // If we have any open command list which is not empty then return false
2592  // because it means that there are commands which are not even submitted for
2593  // execution yet.
2594  using IsCopy = bool;
2595  if (Queue->hasOpenCommandList(IsCopy{true}) ||
2596  Queue->hasOpenCommandList(IsCopy{false}))
2597  return ReturnValue(pi_bool{false});
2598 
2599  for (const auto &QueueMap :
2600  {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) {
2601  for (const auto &QueueGroup : QueueMap) {
2602  if (Queue->Device->ImmCommandListUsed) {
2603  // Immediate command lists are not associated with any Level Zero
2604  // queue, that's why we have to check status of events in each
2605  // immediate command list. Start checking from the end and exit early
2606  // if some event is not completed.
2607  for (const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) {
2608  if (ImmCmdList == Queue->CommandListMap.end())
2609  continue;
2610 
2611  auto EventList = ImmCmdList->second.EventList;
2612  for (auto It = EventList.crbegin(); It != EventList.crend(); It++) {
2613  ze_result_t ZeResult =
2614  ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent));
2615  if (ZeResult == ZE_RESULT_NOT_READY) {
2616  return ReturnValue(pi_bool{false});
2617  } else if (ZeResult != ZE_RESULT_SUCCESS) {
2618  return mapError(ZeResult);
2619  }
2620  }
2621  }
2622  } else {
2623  for (const auto &ZeQueue : QueueGroup.second.ZeQueues) {
2624  if (!ZeQueue)
2625  continue;
2626  // Provide 0 as the timeout parameter to immediately get the status
2627  // of the Level Zero queue.
2628  ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize,
2629  (ZeQueue, /* timeout */ 0));
2630  if (ZeResult == ZE_RESULT_NOT_READY) {
2631  return ReturnValue(pi_bool{false});
2632  } else if (ZeResult != ZE_RESULT_SUCCESS) {
2633  return mapError(ZeResult);
2634  }
2635  }
2636  }
2637  }
2638  }
2639  return ReturnValue(pi_bool{true});
2640  }
2641  default:
2642  zePrint("Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n",
2643  ParamName, ParamName);
2644  return PI_ERROR_INVALID_VALUE;
2645  }
2646 
2647  return PI_SUCCESS;
2648 }
2649 
2651  {
2652  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2653  Queue->RefCountExternal++;
2654  }
2655  Queue->RefCount.increment();
2656  return PI_SUCCESS;
2657 }
2658 
2660  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2661  std::vector<pi_event> EventListToCleanup;
2662 
2663  {
2664  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2665 
2666  if ((--Queue->RefCountExternal) != 0)
2667  return PI_SUCCESS;
2668 
2669  // When external reference count goes to zero it is still possible
2670  // that internal references still exists, e.g. command-lists that
2671  // are not yet completed. So do full queue synchronization here
2672  // and perform proper cleanup.
2673  //
2674  // It is possible to get to here and still have an open command list
2675  // if no wait or finish ever occurred for this queue.
2676  if (auto Res = Queue->executeAllOpenCommandLists())
2677  return Res;
2678 
2679  // Make sure all commands get executed.
2680  Queue->synchronize();
2681 
2682  // Destroy all the fences created associated with this queue.
2683  for (auto it = Queue->CommandListMap.begin();
2684  it != Queue->CommandListMap.end(); ++it) {
2685  // This fence wasn't yet signalled when we polled it for recycling
2686  // the command-list, so need to release the command-list too.
2687  // For immediate commandlists we don't need to do an L0 reset of the
2688  // commandlist but do need to do event cleanup which is also in the
2689  // resetCommandList function.
2690  // If the fence is a nullptr we are using immediate commandlists,
2691  // otherwise regular commandlists which use a fence.
2692  if (it->second.ZeFence == nullptr || it->second.ZeFenceInUse) {
2693  Queue->resetCommandList(it, true, EventListToCleanup);
2694  }
2695  // TODO: remove "if" when the problem is fixed in the level zero
2696  // runtime. Destroy only if a queue is healthy. Destroying a fence may
2697  // cause a hang otherwise.
2698  // If the fence is a nullptr we are using immediate commandlists.
2699  if (Queue->Healthy && it->second.ZeFence != nullptr)
2700  ZE_CALL(zeFenceDestroy, (it->second.ZeFence));
2701  }
2702  Queue->CommandListMap.clear();
2703  }
2704 
2705  for (auto &Event : EventListToCleanup) {
2706  // We don't need to synchronize the events since the queue
2707  // synchronized above already does that.
2708  {
2709  std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
2710  Event->Completed = true;
2711  }
2713  // This event was removed from the command list, so decrement ref count
2714  // (it was incremented when they were added to the command list).
2716  }
2718  return PI_SUCCESS;
2719 }
2720 
2722  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2723 
2724  if (!Queue->RefCount.decrementAndTest())
2725  return PI_SUCCESS;
2726 
2727  for (auto &Cache : Queue->EventCaches)
2728  for (auto &Event : Cache)
2730 
2731  if (Queue->OwnZeCommandQueue) {
2732  for (auto &QueueMap :
2734  for (auto &QueueGroup : QueueMap)
2735  for (auto &ZeQueue : QueueGroup.second.ZeQueues)
2736  if (ZeQueue)
2737  ZE_CALL(zeCommandQueueDestroy, (ZeQueue));
2738  }
2739 
2740  zePrint("piQueueRelease(compute) NumTimesClosedFull %d, "
2741  "NumTimesClosedEarly %d\n",
2744  zePrint("piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly "
2745  "%d\n",
2748 
2749  delete Queue;
2750 
2751  return PI_SUCCESS;
2752 }
2753 
2755  // Wait until command lists attached to the command queue are executed.
2756  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2757 
2758  if (Queue->Device->ImmCommandListUsed) {
2759  // Lock automatically releases when this goes out of scope.
2760  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2761 
2762  Queue->synchronize();
2763  } else {
2764  std::unique_lock<pi_shared_mutex> Lock(Queue->Mutex);
2765  std::vector<ze_command_queue_handle_t> ZeQueues;
2766 
2767  // execute any command list that may still be open.
2768  if (auto Res = Queue->executeAllOpenCommandLists())
2769  return Res;
2770 
2771  // Make a copy of queues to sync and release the lock.
2772  for (auto &QueueMap :
2774  for (auto &QueueGroup : QueueMap)
2775  std::copy(QueueGroup.second.ZeQueues.begin(),
2776  QueueGroup.second.ZeQueues.end(),
2777  std::back_inserter(ZeQueues));
2778 
2779  // Remember the last command's event.
2780  auto LastCommandEvent = Queue->LastCommandEvent;
2781 
2782  // Don't hold a lock to the queue's mutex while waiting.
2783  // This allows continue working with the queue from other threads.
2784  // TODO: this currently exhibits some issues in the driver, so
2785  // we control this with an env var. Remove this control when
2786  // we settle one way or the other.
2787  static bool HoldLock =
2788  std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr;
2789  if (!HoldLock) {
2790  Lock.unlock();
2791  }
2792 
2793  for (auto &ZeQueue : ZeQueues) {
2794  if (ZeQueue)
2795  ZE_CALL(zeHostSynchronize, (ZeQueue));
2796  }
2797 
2798  // Prevent unneeded already finished events to show up in the wait list.
2799  // We can only do so if nothing else was submitted to the queue
2800  // while we were synchronizing it.
2801  if (!HoldLock) {
2802  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2803  if (LastCommandEvent == Queue->LastCommandEvent) {
2804  Queue->LastCommandEvent = nullptr;
2805  }
2806  } else {
2807  Queue->LastCommandEvent = nullptr;
2808  }
2809  }
2810  // Reset signalled command lists and return them back to the cache of
2811  // available command lists. Events in the immediate command lists are cleaned
2812  // up in synchronize().
2813  if (!Queue->Device->ImmCommandListUsed)
2814  resetCommandLists(Queue);
2815  return PI_SUCCESS;
2816 }
2817 
2818 // Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
2819 // so this can be left as a no-op.
2821  (void)Queue;
2822  return PI_SUCCESS;
2823 }
2824 
2826  pi_native_handle *NativeHandle) {
2827  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2828  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2829 
2830  // Lock automatically releases when this goes out of scope.
2831  std::shared_lock<pi_shared_mutex> lock(Queue->Mutex);
2832 
2833  auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
2834 
2835  // Extract a Level Zero compute queue handle from the given PI queue
2836  uint32_t QueueGroupOrdinalUnused;
2837  auto TID = std::this_thread::get_id();
2838  auto &InitialGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
2839  const auto &Result =
2840  Queue->ComputeQueueGroupsByTID.insert({TID, InitialGroup});
2841  auto &ComputeQueueGroupRef = Result.first->second;
2842 
2843  *ZeQueue = ComputeQueueGroupRef.getZeQueue(&QueueGroupOrdinalUnused);
2844  return PI_SUCCESS;
2845 }
2846 
2849  bool OwnNativeHandle,
2850  pi_queue *Queue) {
2851  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2852  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2853  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2854  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2855 
2856  auto ZeQueue = pi_cast<ze_command_queue_handle_t>(NativeHandle);
2857  // Assume this is the "0" index queue in the compute command-group.
2858  std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
2859 
2860  // TODO: see what we can do to correctly initialize PI queue for
2861  // compute vs. copy Level-Zero queue. Currently we will send
2862  // all commands to the "ZeQueue".
2863  std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
2864  *Queue =
2865  new _pi_queue(ZeQueues, ZeroCopyQueues, Context, Device, OwnNativeHandle);
2866  return PI_SUCCESS;
2867 }
2868 
2869 // If indirect access tracking is enabled then performs reference counting,
2870 // otherwise just calls zeMemAllocDevice.
2872  pi_device Device, size_t Size) {
2873  pi_platform Plt = Device->Platform;
2874  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
2875  std::defer_lock);
2876  if (IndirectAccessTrackingEnabled) {
2877  // Lock the mutex which is guarding contexts container in the platform.
2878  // This prevents new kernels from being submitted in any context while
2879  // we are in the process of allocating a memory, this is needed to
2880  // properly capture allocations by kernels with indirect access.
2881  ContextsLock.lock();
2882  // We are going to defer memory release if there are kernels with
2883  // indirect access, that is why explicitly retain context to be sure
2884  // that it is released after all memory allocations in this context are
2885  // released.
2887  }
2888 
2889  ze_device_mem_alloc_desc_t ZeDesc = {};
2890  ZeDesc.flags = 0;
2891  ZeDesc.ordinal = 0;
2892  ZE_CALL(zeMemAllocDevice,
2893  (Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr));
2894 
2895  if (IndirectAccessTrackingEnabled) {
2896  // Keep track of all memory allocations in the context
2897  Context->MemAllocs.emplace(std::piecewise_construct,
2898  std::forward_as_tuple(*ResultPtr),
2899  std::forward_as_tuple(Context));
2900  }
2901  return PI_SUCCESS;
2902 }
2903 
2904 // If indirect access tracking is enabled then performs reference counting,
2905 // otherwise just calls zeMemAllocHost.
2907  size_t Size) {
2908  pi_platform Plt = Context->getPlatform();
2909  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
2910  std::defer_lock);
2911  if (IndirectAccessTrackingEnabled) {
2912  // Lock the mutex which is guarding contexts container in the platform.
2913  // This prevents new kernels from being submitted in any context while
2914  // we are in the process of allocating a memory, this is needed to
2915  // properly capture allocations by kernels with indirect access.
2916  ContextsLock.lock();
2917  // We are going to defer memory release if there are kernels with
2918  // indirect access, that is why explicitly retain context to be sure
2919  // that it is released after all memory allocations in this context are
2920  // released.
2922  }
2923 
2925  ZeDesc.flags = 0;
2926  ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr));
2927 
2928  if (IndirectAccessTrackingEnabled) {
2929  // Keep track of all memory allocations in the context
2930  Context->MemAllocs.emplace(std::piecewise_construct,
2931  std::forward_as_tuple(*ResultPtr),
2932  std::forward_as_tuple(Context));
2933  }
2934  return PI_SUCCESS;
2935 }
2936 
2938  void *HostPtr, pi_mem *RetMem,
2939  const pi_mem_properties *properties) {
2940 
2941  // TODO: implement support for more access modes
2942  if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) ||
2943  (Flags & PI_MEM_ACCESS_READ_ONLY))) {
2944  die("piMemBufferCreate: Level-Zero supports read-write and read-only "
2945  "buffer,"
2946  "but not other accesses (such as write-only) yet.");
2947  }
2948 
2949  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
2950  PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE);
2951 
2952  if (properties != nullptr) {
2953  die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet");
2954  }
2955 
2956  if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
2957  // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of
2958  // pinned host memory, see:
2959  // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc
2960  // We are however missing such functionality in Level Zero, so we just
2961  // ignore the flag for now.
2962  //
2963  }
2964 
2965  // If USM Import feature is enabled and hostptr is supplied,
2966  // import the hostptr if not already imported into USM.
2967  // Data transfer rate is maximized when both source and destination
2968  // are USM pointers. Promotion of the host pointer to USM thus
2969  // optimizes data transfer performance.
2970  bool HostPtrImported = false;
2971  if (ZeUSMImport.Enabled && HostPtr != nullptr &&
2972  (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) {
2973  // Query memory type of the host pointer
2974  ze_device_handle_t ZeDeviceHandle;
2975  ZeStruct<ze_memory_allocation_properties_t> ZeMemoryAllocationProperties;
2976  ZE_CALL(zeMemGetAllocProperties,
2977  (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties,
2978  &ZeDeviceHandle));
2979 
2980  // If not shared of any type, we can import the ptr
2981  if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
2982  // Promote the host ptr to USM host memory
2983  ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver;
2984  ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size);
2985  HostPtrImported = true;
2986  }
2987  }
2988 
2989  pi_buffer Buffer = nullptr;
2990  auto HostPtrOrNull =
2991  (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr;
2992  try {
2993  Buffer = new _pi_buffer(Context, Size, HostPtrOrNull, HostPtrImported);
2994  } catch (const std::bad_alloc &) {
2995  return PI_ERROR_OUT_OF_HOST_MEMORY;
2996  } catch (...) {
2997  return PI_ERROR_UNKNOWN;
2998  }
2999 
3000  // Initialize the buffer with user data
3001  if (HostPtr) {
3002  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
3003  (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
3004 
3005  // We don't yet know which device needs this buffer, so make the first
3006  // device in the context be the master, and hold the initial valid
3007  // allocation.
3008  char *ZeHandleDst;
3009  PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only,
3010  Context->Devices[0]));
3011  if (Buffer->OnHost) {
3012  // Do a host to host copy.
3013  // For an imported HostPtr the copy is unneeded.
3014  if (!HostPtrImported)
3015  memcpy(ZeHandleDst, HostPtr, Size);
3016  } else {
3017  // Initialize the buffer synchronously with immediate offload
3018  // zeCommandListAppendMemoryCopy must not be called from simultaneous
3019  // threads with the same command list handle, so we need exclusive lock.
3020  std::scoped_lock<pi_mutex> Lock(Context->ImmediateCommandListMutex);
3021  ZE_CALL(zeCommandListAppendMemoryCopy,
3022  (Context->ZeCommandListInit, ZeHandleDst, HostPtr, Size,
3023  nullptr, 0, nullptr));
3024  }
3025  } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
3026  // Nothing more to do.
3027  } else {
3028  die("piMemBufferCreate: not implemented");
3029  }
3030  }
3031 
3032  *RetMem = Buffer;
3033  return PI_SUCCESS;
3034 }
3035 
3036 pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, size_t ParamValueSize,
3037  void *ParamValue, size_t *ParamValueSizeRet) {
3038  PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE);
3039  // piMemImageGetInfo must be used for images
3040  PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_VALUE);
3041 
3042  std::shared_lock<pi_shared_mutex> Lock(Mem->Mutex);
3043  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3044 
3045  switch (ParamName) {
3046  case PI_MEM_CONTEXT:
3047  return ReturnValue(Mem->Context);
3048  case PI_MEM_SIZE: {
3049  // Get size of the allocation
3050  auto Buffer = pi_cast<pi_buffer>(Mem);
3051  return ReturnValue(size_t{Buffer->Size});
3052  }
3053  default:
3054  die("piMemGetInfo: Parameter is not implemented");
3055  }
3056 
3057  return {};
3058 }
3059 
3061  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3062 
3063  Mem->RefCount.increment();
3064  return PI_SUCCESS;
3065 }
3066 
3067 // If indirect access tracking is not enabled then this functions just performs
3068 // zeMemFree. If indirect access tracking is enabled then reference counting is
3069 // performed.
3071  pi_platform Plt = Context->getPlatform();
3072  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3073  std::defer_lock);
3074  if (IndirectAccessTrackingEnabled) {
3075  ContextsLock.lock();
3076  auto It = Context->MemAllocs.find(Ptr);
3077  if (It == std::end(Context->MemAllocs)) {
3078  die("All memory allocations must be tracked!");
3079  }
3080  if (!It->second.RefCount.decrementAndTest()) {
3081  // Memory can't be deallocated yet.
3082  return PI_SUCCESS;
3083  }
3084 
3085  // Reference count is zero, it is ok to free memory.
3086  // We don't need to track this allocation anymore.
3087  Context->MemAllocs.erase(It);
3088  }
3089 
3090  ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
3091 
3092  if (IndirectAccessTrackingEnabled)
3094 
3095  return PI_SUCCESS;
3096 }
3097 
3098 static pi_result USMFreeHelper(pi_context Context, void *Ptr,
3099  bool OwnZeMemHandle = true);
3100 
3102  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3103 
3104  if (!Mem->RefCount.decrementAndTest())
3105  return PI_SUCCESS;
3106 
3107  if (Mem->isImage()) {
3108  char *ZeHandleImage;
3109  PI_CALL(Mem->getZeHandle(ZeHandleImage, _pi_mem::write_only));
3110  ZE_CALL(zeImageDestroy, (pi_cast<ze_image_handle_t>(ZeHandleImage)));
3111  } else {
3112  auto Buffer = static_cast<pi_buffer>(Mem);
3113  Buffer->free();
3114  }
3115  delete Mem;
3116 
3117  return PI_SUCCESS;
3118 }
3119 
3121  const pi_image_format *ImageFormat,
3122  const pi_image_desc *ImageDesc, void *HostPtr,
3123  pi_mem *RetImage) {
3124 
3125  // TODO: implement read-only, write-only
3126  if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) {
3127  die("piMemImageCreate: Level-Zero implements only read-write buffer,"
3128  "no read-only or write-only yet.");
3129  }
3130  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3131  PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE);
3132  PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
3133 
3134  ze_image_format_type_t ZeImageFormatType;
3135  size_t ZeImageFormatTypeSize;
3136  switch (ImageFormat->image_channel_data_type) {
3138  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
3139  ZeImageFormatTypeSize = 32;
3140  break;
3142  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
3143  ZeImageFormatTypeSize = 16;
3144  break;
3146  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3147  ZeImageFormatTypeSize = 32;
3148  break;
3150  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3151  ZeImageFormatTypeSize = 16;
3152  break;
3154  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3155  ZeImageFormatTypeSize = 8;
3156  break;
3158  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
3159  ZeImageFormatTypeSize = 16;
3160  break;
3162  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
3163  ZeImageFormatTypeSize = 8;
3164  break;
3166  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3167  ZeImageFormatTypeSize = 32;
3168  break;
3170  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3171  ZeImageFormatTypeSize = 16;
3172  break;
3174  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3175  ZeImageFormatTypeSize = 8;
3176  break;
3178  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
3179  ZeImageFormatTypeSize = 16;
3180  break;
3182  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
3183  ZeImageFormatTypeSize = 8;
3184  break;
3185  default:
3186  zePrint("piMemImageCreate: unsupported image data type: data type = %d\n",
3187  ImageFormat->image_channel_data_type);
3188  return PI_ERROR_INVALID_VALUE;
3189  }
3190 
3191  // TODO: populate the layout mapping
3192  ze_image_format_layout_t ZeImageFormatLayout;
3193  switch (ImageFormat->image_channel_order) {
3195  switch (ZeImageFormatTypeSize) {
3196  case 8:
3197  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8;
3198  break;
3199  case 16:
3200  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16;
3201  break;
3202  case 32:
3203  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
3204  break;
3205  default:
3206  zePrint("piMemImageCreate: unexpected data type Size\n");
3207  return PI_ERROR_INVALID_VALUE;
3208  }
3209  break;
3210  default:
3211  zePrint("format layout = %d\n", ImageFormat->image_channel_order);
3212  die("piMemImageCreate: unsupported image format layout\n");
3213  break;
3214  }
3215 
3216  ze_image_format_t ZeFormatDesc = {
3217  ZeImageFormatLayout, ZeImageFormatType,
3218  // TODO: are swizzles deducted from image_format->image_channel_order?
3219  ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G,
3220  ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A};
3221 
3222  ze_image_type_t ZeImageType;
3223  switch (ImageDesc->image_type) {
3224  case PI_MEM_TYPE_IMAGE1D:
3225  ZeImageType = ZE_IMAGE_TYPE_1D;
3226  break;
3227  case PI_MEM_TYPE_IMAGE2D:
3228  ZeImageType = ZE_IMAGE_TYPE_2D;
3229  break;
3230  case PI_MEM_TYPE_IMAGE3D:
3231  ZeImageType = ZE_IMAGE_TYPE_3D;
3232  break;
3234  ZeImageType = ZE_IMAGE_TYPE_1DARRAY;
3235  break;
3237  ZeImageType = ZE_IMAGE_TYPE_2DARRAY;
3238  break;
3239  default:
3240  zePrint("piMemImageCreate: unsupported image type\n");
3241  return PI_ERROR_INVALID_VALUE;
3242  }
3243 
3244  ZeStruct<ze_image_desc_t> ZeImageDesc;
3245  ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
3246  ZeImageDesc.type = ZeImageType;
3247  ZeImageDesc.format = ZeFormatDesc;
3248  ZeImageDesc.width = pi_cast<uint32_t>(ImageDesc->image_width);
3249  ZeImageDesc.height = pi_cast<uint32_t>(ImageDesc->image_height);
3250  ZeImageDesc.depth = pi_cast<uint32_t>(ImageDesc->image_depth);
3251  ZeImageDesc.arraylevels = pi_cast<uint32_t>(ImageDesc->image_array_size);
3252  ZeImageDesc.miplevels = ImageDesc->num_mip_levels;
3253 
3254  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
3255 
3256  // Currently we have the "0" device in context with mutliple root devices to
3257  // own the image.
3258  // TODO: Implement explicit copying for acessing the image from other devices
3259  // in the context.
3261  : Context->Devices[0];
3262  ze_image_handle_t ZeHImage;
3263  ZE_CALL(zeImageCreate,
3264  (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeHImage));
3265 
3266  try {
3267  auto ZePIImage = new _pi_image(Context, ZeHImage);
3268  *RetImage = ZePIImage;
3269 
3270 #ifndef NDEBUG
3271  ZePIImage->ZeImageDesc = ZeImageDesc;
3272 #endif // !NDEBUG
3273 
3274  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
3275  (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
3276  // Initialize image synchronously with immediate offload.
3277  // zeCommandListAppendImageCopyFromMemory must not be called from
3278  // simultaneous threads with the same command list handle, so we need
3279  // exclusive lock.
3280  std::scoped_lock<pi_mutex> Lock(Context->ImmediateCommandListMutex);
3281  ZE_CALL(zeCommandListAppendImageCopyFromMemory,
3282  (Context->ZeCommandListInit, ZeHImage, HostPtr, nullptr, nullptr,
3283  0, nullptr));
3284  }
3285  } catch (const std::bad_alloc &) {
3286  return PI_ERROR_OUT_OF_HOST_MEMORY;
3287  } catch (...) {
3288  return PI_ERROR_UNKNOWN;
3289  }
3290  return PI_SUCCESS;
3291 }
3292 
3294  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3295  std::shared_lock<pi_shared_mutex> Guard(Mem->Mutex);
3296  char *ZeHandle;
3297  PI_CALL(Mem->getZeHandle(ZeHandle, _pi_mem::read_write));
3298  *NativeHandle = pi_cast<pi_native_handle>(ZeHandle);
3299  return PI_SUCCESS;
3300 }
3301 
3304  bool ownNativeHandle, pi_mem *Mem) {
3305  PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE);
3306  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3307  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3308 
3309  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
3310 
3311  // Get base of the allocation
3312  void *Base;
3313  size_t Size;
3314  void *Ptr = pi_cast<void *>(NativeHandle);
3315  ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size));
3316  PI_ASSERT(Ptr == Base, PI_ERROR_INVALID_VALUE);
3317 
3319  ze_device_handle_t ZeDevice = nullptr;
3320  ZE_CALL(zeMemGetAllocProperties,
3321  (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice));
3322 
3323  // Check type of the allocation
3324  switch (ZeMemProps.type) {
3325  case ZE_MEMORY_TYPE_HOST:
3326  case ZE_MEMORY_TYPE_SHARED:
3327  case ZE_MEMORY_TYPE_DEVICE:
3328  break;
3329  case ZE_MEMORY_TYPE_UNKNOWN:
3330  // Memory allocation is unrelated to the context
3331  return PI_ERROR_INVALID_CONTEXT;
3332  default:
3333  die("Unexpected memory type");
3334  }
3335 
3336  pi_device Device = nullptr;
3337  if (ZeDevice) {
3338  Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice);
3339  PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_CONTEXT);
3340  }
3341 
3342  try {
3343  *Mem = new _pi_buffer(Context, Size, Device, pi_cast<char *>(NativeHandle),
3344  ownNativeHandle);
3345 
3346  pi_platform Plt = Context->getPlatform();
3347  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3348  std::defer_lock);
3349  // If we don't own the native handle then we can't control deallocation of
3350  // that memory so there is no point of keeping track of the memory
3351  // allocation for deferred memory release in the mode when indirect access
3352  // tracking is enabled.
3353  if (IndirectAccessTrackingEnabled && ownNativeHandle) {
3354  // We need to keep track of all memory allocations in the context
3355  ContextsLock.lock();
3356  // Retain context to be sure that it is released after all memory
3357  // allocations in this context are released.
3359 
3360  Context->MemAllocs.emplace(
3361  std::piecewise_construct, std::forward_as_tuple(Ptr),
3362  std::forward_as_tuple(Context, ownNativeHandle));
3363  }
3364  } catch (const std::bad_alloc &) {
3365  return PI_ERROR_OUT_OF_HOST_MEMORY;
3366  } catch (...) {
3367  return PI_ERROR_UNKNOWN;
3368  }
3369 
3370  // Initialize the buffer as necessary
3371  auto Buffer = pi_cast<pi_buffer>(*Mem);
3372  if (Device) {
3373  // If this allocation is on a device, then we re-use it for the buffer.
3374  // Nothing to do.
3375  } else if (Buffer->OnHost) {
3376  // If this is host allocation and buffer always stays on host there
3377  // nothing more to do.
3378  } else {
3379  // In all other cases (shared allocation, or host allocation that cannot
3380  // represent the buffer in this context) copy the data to a newly
3381  // created device allocation.
3382  char *ZeHandleDst;
3383  PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Device));
3384 
3385  // zeCommandListAppendMemoryCopy must not be called from simultaneous
3386  // threads with the same command list handle, so we need exclusive lock.
3387  std::scoped_lock<pi_mutex> Lock(Context->ImmediateCommandListMutex);
3388  ZE_CALL(zeCommandListAppendMemoryCopy,
3389  (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0,
3390  nullptr));
3391  }
3392 
3393  return PI_SUCCESS;
3394 }
3395 
3397  size_t Length, pi_program *Program) {
3398 
3399  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3400  PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE);
3401  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3402 
3403  // NOTE: the Level Zero module creation is also building the program, so we
3404  // are deferring it until the program is ready to be built.
3405 
3406  try {
3407  *Program = new _pi_program(_pi_program::IL, Context, ILBytes, Length);
3408  } catch (const std::bad_alloc &) {
3409  return PI_ERROR_OUT_OF_HOST_MEMORY;
3410  } catch (...) {
3411  return PI_ERROR_UNKNOWN;
3412  }
3413  return PI_SUCCESS;
3414 }
3415 
3417  pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList,
3418  const size_t *Lengths, const unsigned char **Binaries,
3419  size_t NumMetadataEntries, const pi_device_binary_property *Metadata,
3420  pi_int32 *BinaryStatus, pi_program *Program) {
3421  (void)Metadata;
3422  (void)NumMetadataEntries;
3423 
3424  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3425  PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE);
3426  PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE);
3427  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3428 
3429  // For now we support only one device.
3430  if (NumDevices != 1) {
3431  zePrint("piProgramCreateWithBinary: level_zero supports only one device.");
3432  return PI_ERROR_INVALID_VALUE;
3433  }
3434  if (!Binaries[0] || !Lengths[0]) {
3435  if (BinaryStatus)
3436  *BinaryStatus = PI_ERROR_INVALID_VALUE;
3437  return PI_ERROR_INVALID_VALUE;
3438  }
3439 
3440  size_t Length = Lengths[0];
3441  auto Binary = Binaries[0];
3442 
3443  // In OpenCL, clCreateProgramWithBinary() can be used to load any of the
3444  // following: "program executable", "compiled program", or "library of
3445  // compiled programs". In addition, the loaded program can be either
3446  // IL (SPIR-v) or native device code. For now, we assume that
3447  // piProgramCreateWithBinary() is only used to load a "program executable"
3448  // as native device code.
3449  // If we wanted to support all the same cases as OpenCL, we would need to
3450  // somehow examine the binary image to distinguish the cases. Alternatively,
3451  // we could change the PI interface and have the caller pass additional
3452  // information to distinguish the cases.
3453 
3454  try {
3455  *Program = new _pi_program(_pi_program::Native, Context, Binary, Length);
3456  } catch (const std::bad_alloc &) {
3457  return PI_ERROR_OUT_OF_HOST_MEMORY;
3458  } catch (...) {
3459  return PI_ERROR_UNKNOWN;
3460  }
3461 
3462  if (BinaryStatus)
3463  *BinaryStatus = PI_SUCCESS;
3464  return PI_SUCCESS;
3465 }
3466 
3468  const char **Strings,
3469  const size_t *Lengths,
3470  pi_program *RetProgram) {
3471 
3472  (void)Context;
3473  (void)Count;
3474  (void)Strings;
3475  (void)Lengths;
3476  (void)RetProgram;
3477  zePrint("piclProgramCreateWithSource: not supported in Level Zero\n");
3478  return PI_ERROR_INVALID_OPERATION;
3479 }
3480 
3482  size_t ParamValueSize, void *ParamValue,
3483  size_t *ParamValueSizeRet) {
3484 
3485  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3486 
3487  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3488  switch (ParamName) {
3490  return ReturnValue(pi_uint32{Program->RefCount.load()});
3492  // TODO: return true number of devices this program exists for.
3493  return ReturnValue(pi_uint32{1});
3495  // TODO: return all devices this program exists for.
3496  return ReturnValue(Program->Context->Devices[0]);
3498  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3499  size_t SzBinary;
3500  if (Program->State == _pi_program::IL ||
3501  Program->State == _pi_program::Native ||
3502  Program->State == _pi_program::Object) {
3503  SzBinary = Program->CodeLength;
3504  } else if (Program->State == _pi_program::Exe) {
3505  ZE_CALL(zeModuleGetNativeBinary, (Program->ZeModule, &SzBinary, nullptr));
3506  } else {
3507  return PI_ERROR_INVALID_PROGRAM;
3508  }
3509  // This is an array of 1 element, initialized as if it were scalar.
3510  return ReturnValue(size_t{SzBinary});
3511  }
3512  case PI_PROGRAM_INFO_BINARIES: {
3513  // The caller sets "ParamValue" to an array of pointers, one for each
3514  // device. Since Level Zero supports only one device, there is only one
3515  // pointer. If the pointer is NULL, we don't do anything. Otherwise, we
3516  // copy the program's binary image to the buffer at that pointer.
3517  uint8_t **PBinary = pi_cast<uint8_t **>(ParamValue);
3518  if (!PBinary[0])
3519  break;
3520 
3521  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3522  if (Program->State == _pi_program::IL ||
3523  Program->State == _pi_program::Native ||
3524  Program->State == _pi_program::Object) {
3525  std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength);
3526  } else if (Program->State == _pi_program::Exe) {
3527  size_t SzBinary = 0;
3528  ZE_CALL(zeModuleGetNativeBinary,
3529  (Program->ZeModule, &SzBinary, PBinary[0]));
3530  } else {
3531  return PI_ERROR_INVALID_PROGRAM;
3532  }
3533  break;
3534  }
3536  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3537  uint32_t NumKernels;
3538  if (Program->State == _pi_program::IL ||
3539  Program->State == _pi_program::Native ||
3540  Program->State == _pi_program::Object) {
3541  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
3542  } else if (Program->State == _pi_program::Exe) {
3543  NumKernels = 0;
3544  ZE_CALL(zeModuleGetKernelNames,
3545  (Program->ZeModule, &NumKernels, nullptr));
3546  } else {
3547  return PI_ERROR_INVALID_PROGRAM;
3548  }
3549  return ReturnValue(size_t{NumKernels});
3550  }
3552  try {
3553  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3554  std::string PINames{""};
3555  if (Program->State == _pi_program::IL ||
3556  Program->State == _pi_program::Native ||
3557  Program->State == _pi_program::Object) {
3558  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
3559  } else if (Program->State == _pi_program::Exe) {
3560  uint32_t Count = 0;
3561  ZE_CALL(zeModuleGetKernelNames, (Program->ZeModule, &Count, nullptr));
3562  std::unique_ptr<const char *[]> PNames(new const char *[Count]);
3563  ZE_CALL(zeModuleGetKernelNames,
3564  (Program->ZeModule, &Count, PNames.get()));
3565  for (uint32_t I = 0; I < Count; ++I) {
3566  PINames += (I > 0 ? ";" : "");
3567  PINames += PNames[I];
3568  }
3569  } else {
3570  return PI_ERROR_INVALID_PROGRAM;
3571  }
3572  return ReturnValue(PINames.c_str());
3573  } catch (const std::bad_alloc &) {
3574  return PI_ERROR_OUT_OF_HOST_MEMORY;
3575  } catch (...) {
3576  return PI_ERROR_UNKNOWN;
3577  }
3578  default:
3579  die("piProgramGetInfo: not implemented");
3580  }
3581 
3582  return PI_SUCCESS;
3583 }
3584 
3586  const pi_device *DeviceList, const char *Options,
3587  pi_uint32 NumInputPrograms,
3588  const pi_program *InputPrograms,
3589  void (*PFnNotify)(pi_program Program, void *UserData),
3590  void *UserData, pi_program *RetProgram) {
3591  // We only support one device with Level Zero currently.
3592  if (NumDevices != 1) {
3593  zePrint("piProgramLink: level_zero supports only one device.");
3594  return PI_ERROR_INVALID_VALUE;
3595  }
3596 
3597  // We do not support any link flags at this time because the Level Zero API
3598  // does not have any way to pass flags that are specific to linking.
3599  if (Options && *Options != '\0') {
3600  std::string ErrorMessage(
3601  "Level Zero does not support kernel link flags: \"");
3602  ErrorMessage.append(Options);
3603  ErrorMessage.push_back('\"');
3604  pi_program Program =
3606  *RetProgram = Program;
3607  return PI_ERROR_LINK_PROGRAM_FAILURE;
3608  }
3609 
3610  // Validate input parameters.
3611  PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE);
3612  PI_ASSERT(Context->isValidDevice(DeviceList[0]), PI_ERROR_INVALID_DEVICE);
3613  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
3614  if (NumInputPrograms == 0 || InputPrograms == nullptr)
3615  return PI_ERROR_INVALID_VALUE;
3616 
3617  pi_result PiResult = PI_SUCCESS;
3618  try {
3619  // Acquire a "shared" lock on each of the input programs, and also validate
3620  // that they are all in Object state.
3621  //
3622  // There is no danger of deadlock here even if two threads call
3623  // piProgramLink simultaneously with the same input programs in a different
3624  // order. If we were acquiring these with "exclusive" access, this could
3625  // lead to a classic lock ordering deadlock. However, there is no such
3626  // deadlock potential with "shared" access. There could also be a deadlock
3627  // potential if there was some other code that holds more than one of these
3628  // locks simultaneously with "exclusive" access. However, there is no such
3629  // code like that, so this is also not a danger.
3630  std::vector<std::shared_lock<pi_shared_mutex>> Guards(NumInputPrograms);
3631  for (pi_uint32 I = 0; I < NumInputPrograms; I++) {
3632  std::shared_lock<pi_shared_mutex> Guard(InputPrograms[I]->Mutex);
3633  Guards[I].swap(Guard);
3634  if (InputPrograms[I]->State != _pi_program::Object) {
3635  return PI_ERROR_INVALID_OPERATION;
3636  }
3637  }
3638 
3639  // Previous calls to piProgramCompile did not actually compile the SPIR-V.
3640  // Instead, we postpone compilation until this point, when all the modules
3641  // are linked together. By doing compilation and linking together, the JIT
3642  // compiler is able see all modules and do cross-module optimizations.
3643  //
3644  // Construct a ze_module_program_exp_desc_t which contains information about
3645  // all of the modules that will be linked together.
3647  std::vector<size_t> CodeSizes(NumInputPrograms);
3648  std::vector<const uint8_t *> CodeBufs(NumInputPrograms);
3649  std::vector<const char *> BuildFlagPtrs(NumInputPrograms);
3650  std::vector<const ze_module_constants_t *> SpecConstPtrs(NumInputPrograms);
3651  std::vector<_pi_program::SpecConstantShim> SpecConstShims;
3652  SpecConstShims.reserve(NumInputPrograms);
3653 
3654  for (pi_uint32 I = 0; I < NumInputPrograms; I++) {
3655  pi_program Program = InputPrograms[I];
3656  CodeSizes[I] = Program->CodeLength;
3657  CodeBufs[I] = Program->Code.get();
3658  BuildFlagPtrs[I] = Program->BuildFlags.c_str();
3659  SpecConstShims.emplace_back(Program);
3660  SpecConstPtrs[I] = SpecConstShims[I].ze();
3661  }
3662 
3663  ZeExtModuleDesc.count = NumInputPrograms;
3664  ZeExtModuleDesc.inputSizes = CodeSizes.data();
3665  ZeExtModuleDesc.pInputModules = CodeBufs.data();
3666  ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data();
3667  ZeExtModuleDesc.pConstants = SpecConstPtrs.data();
3668 
3669  ZeStruct<ze_module_desc_t> ZeModuleDesc;
3670  ZeModuleDesc.pNext = &ZeExtModuleDesc;
3671  ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
3672 
3673  // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1",
3674  // the driver does validation of the API calls, and it expects
3675  // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This
3676  // validation is wrong when using the "ze_module_program_exp_desc_t"
3677  // extension because those fields are supposed to be ignored. As a
3678  // workaround, set both fields to 1.
3679  //
3680  // TODO: Remove this workaround when the driver is fixed.
3681  ZeModuleDesc.pInputModule = reinterpret_cast<const uint8_t *>(1);
3682  ZeModuleDesc.inputSize = 1;
3683 
3684  // We need a Level Zero extension to compile multiple programs together into
3685  // a single Level Zero module. However, we don't need that extension if
3686  // there happens to be only one input program.
3687  //
3688  // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the
3689  // Level Zero driver. The driver's "ze_module_program_exp_desc_t"
3690  // extension should work even in the case when there is just one input
3691  // module. However, there is currently a bug in the driver that leads to a
3692  // crash. As a workaround, do not use the extension when there is one
3693  // input module.
3694  //
3695  // TODO: Remove this workaround when the driver is fixed.
3696  if (!DeviceList[0]->Platform->ZeDriverModuleProgramExtensionFound ||
3697  (NumInputPrograms == 1)) {
3698  if (NumInputPrograms == 1) {
3699  ZeModuleDesc.pNext = nullptr;
3700  ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0];
3701  ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0];
3702  ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0];
3703  ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0];
3704  } else {
3705  zePrint("piProgramLink: level_zero driver does not have static linking "
3706  "support.");
3707  return PI_ERROR_INVALID_VALUE;
3708  }
3709  }
3710 
3711  // Call the Level Zero API to compile, link, and create the module.
3712  ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
3713  ze_context_handle_t ZeContext = Context->ZeContext;
3714  ze_module_handle_t ZeModule = nullptr;
3715  ze_module_build_log_handle_t ZeBuildLog = nullptr;
3716  ze_result_t ZeResult =
3717  ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc,
3718  &ZeModule, &ZeBuildLog));
3719 
3720  // We still create a _pi_program object even if there is a BUILD_FAILURE
3721  // because we need the object to hold the ZeBuildLog. There is no build
3722  // log created for other errors, so we don't create an object.
3723  PiResult = mapError(ZeResult);
3724  if (ZeResult != ZE_RESULT_SUCCESS &&
3725  ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) {
3726  return PiResult;
3727  }
3728 
3729  // The call to zeModuleCreate does not report an error if there are
3730  // unresolved symbols because it thinks these could be resolved later via a
3731  // call to zeModuleDynamicLink. However, modules created with piProgramLink
3732  // are supposed to be fully linked and ready to use. Therefore, do an extra
3733  // check now for unresolved symbols. Note that we still create a
3734  // _pi_program if there are unresolved symbols because the ZeBuildLog tells
3735  // which symbols are unresolved.
3736  if (ZeResult == ZE_RESULT_SUCCESS) {
3737  ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog);
3738  if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) {
3739  PiResult = PI_ERROR_LINK_PROGRAM_FAILURE;
3740  } else if (ZeResult != ZE_RESULT_SUCCESS) {
3741  return mapError(ZeResult);
3742  }
3743  }
3744 
3745  _pi_program::state State =
3746  (PiResult == PI_SUCCESS) ? _pi_program::Exe : _pi_program::Invalid;
3747  *RetProgram = new _pi_program(State, Context, ZeModule, ZeBuildLog);
3748  } catch (const std::bad_alloc &) {
3749  return PI_ERROR_OUT_OF_HOST_MEMORY;
3750  } catch (...) {
3751  return PI_ERROR_UNKNOWN;
3752  }
3753  return PiResult;
3754 }
3755 
3757  pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList,
3758  const char *Options, pi_uint32 NumInputHeaders,
3759  const pi_program *InputHeaders, const char **HeaderIncludeNames,
3760  void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) {
3761  (void)NumInputHeaders;
3762  (void)InputHeaders;
3763  (void)HeaderIncludeNames;
3764 
3765  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3766 
3767  if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
3768  return PI_ERROR_INVALID_VALUE;
3769 
3770  // These aren't supported.
3771  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
3772 
3773  std::scoped_lock<pi_shared_mutex> Guard(Program->Mutex);
3774 
3775  // It's only valid to compile a program created from IL (we don't support
3776  // programs created from source code).
3777  //
3778  // The OpenCL spec says that the header parameters are ignored when compiling
3779  // IL programs, so we don't validate them.
3780  if (Program->State != _pi_program::IL)
3781  return PI_ERROR_INVALID_OPERATION;
3782 
3783  // We don't compile anything now. Instead, we delay compilation until
3784  // piProgramLink, where we do both compilation and linking as a single step.
3785  // This produces better code because the driver can do cross-module
3786  // optimizations. Therefore, we just remember the compilation flags, so we
3787  // can use them later.
3788  if (Options)
3789  Program->BuildFlags = Options;
3790  Program->State = _pi_program::Object;
3791 
3792  return PI_SUCCESS;
3793 }
3794 
3796  const pi_device *DeviceList, const char *Options,
3797  void (*PFnNotify)(pi_program Program, void *UserData),
3798  void *UserData) {
3799 
3800  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3801  if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
3802  return PI_ERROR_INVALID_VALUE;
3803 
3804  // We only support build to one device with Level Zero now.
3805  // TODO: we should eventually build to the possibly multiple root
3806  // devices in the context.
3807  if (NumDevices != 1) {
3808  zePrint("piProgramBuild: level_zero supports only one device.");
3809  return PI_ERROR_INVALID_VALUE;
3810  }
3811 
3812  // These aren't supported.
3813  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
3814 
3815  std::scoped_lock<pi_shared_mutex> Guard(Program->Mutex);
3816  // Check if device belongs to associated context.
3817  PI_ASSERT(Program->Context, PI_ERROR_INVALID_PROGRAM);
3818  PI_ASSERT(Program->Context->isValidDevice(DeviceList[0]),
3819  PI_ERROR_INVALID_VALUE);
3820 
3821  // It is legal to build a program created from either IL or from native
3822  // device code.
3823  if (Program->State != _pi_program::IL &&
3824  Program->State != _pi_program::Native)
3825  return PI_ERROR_INVALID_OPERATION;
3826 
3827  // We should have either IL or native device code.
3828  PI_ASSERT(Program->Code, PI_ERROR_INVALID_PROGRAM);
3829 
3830  // Ask Level Zero to build and load the native code onto the device.
3831  ZeStruct<ze_module_desc_t> ZeModuleDesc;
3832  _pi_program::SpecConstantShim Shim(Program);
3833  ZeModuleDesc.format = (Program->State == _pi_program::IL)
3834  ? ZE_MODULE_FORMAT_IL_SPIRV
3835  : ZE_MODULE_FORMAT_NATIVE;
3836  ZeModuleDesc.inputSize = Program->CodeLength;
3837  ZeModuleDesc.pInputModule = Program->Code.get();
3838  ZeModuleDesc.pBuildFlags = Options;
3839  ZeModuleDesc.pConstants = Shim.ze();
3840 
3841  ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
3842  ze_context_handle_t ZeContext = Program->Context->ZeContext;
3843  ze_module_handle_t ZeModule = nullptr;
3844 
3845  pi_result Result = PI_SUCCESS;
3846  Program->State = _pi_program::Exe;
3847  ze_result_t ZeResult =
3848  ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc,
3849  &ZeModule, &Program->ZeBuildLog));
3850  if (ZeResult != ZE_RESULT_SUCCESS) {
3851  // We adjust pi_program below to avoid attempting to release zeModule when
3852  // RT calls piProgramRelease().
3853  Program->State = _pi_program::Invalid;
3854  Result = mapError(ZeResult);
3855  if (Program->ZeBuildLog) {
3856  ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog));
3857  Program->ZeBuildLog = nullptr;
3858  }
3859  if (ZeModule) {
3860  ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule));
3861  ZeModule = nullptr;
3862  }
3863  } else {
3864  // The call to zeModuleCreate does not report an error if there are
3865  // unresolved symbols because it thinks these could be resolved later via a
3866  // call to zeModuleDynamicLink. However, modules created with
3867  // piProgramBuild are supposed to be fully linked and ready to use.
3868  // Therefore, do an extra check now for unresolved symbols.
3869  ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog);
3870  if (ZeResult != ZE_RESULT_SUCCESS) {
3871  Program->State = _pi_program::Invalid;
3872  Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE)
3873  ? PI_ERROR_BUILD_PROGRAM_FAILURE
3874  : mapError(ZeResult);
3875  if (ZeModule) {
3876  ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule));
3877  ZeModule = nullptr;
3878  }
3879  }
3880  }
3881 
3882  // We no longer need the IL / native code.
3883  Program->Code.reset();
3884  Program->ZeModule = ZeModule;
3885  return Result;
3886 }
3887 
3889  pi_program_build_info ParamName,
3890  size_t ParamValueSize, void *ParamValue,
3891  size_t *ParamValueSizeRet) {
3892  (void)Device;
3893 
3894  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3895  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3896  if (ParamName == PI_PROGRAM_BUILD_INFO_BINARY_TYPE) {
3898  if (Program->State == _pi_program::Object) {
3900  } else if (Program->State == _pi_program::Exe) {
3902  }
3903  return ReturnValue(pi_program_binary_type{Type});
3904  }
3905  if (ParamName == PI_PROGRAM_BUILD_INFO_OPTIONS) {
3906  // TODO: how to get module build options out of Level Zero?
3907  // For the programs that we compiled we can remember the options
3908  // passed with piProgramCompile/piProgramBuild, but what can we
3909  // return for programs that were built outside and registered
3910  // with piProgramRegister?
3911  return ReturnValue("");
3912  } else if (ParamName == PI_PROGRAM_BUILD_INFO_LOG) {
3913  // Check first to see if the plugin code recorded an error message.
3914  if (!Program->ErrorMessage.empty()) {
3915  return ReturnValue(Program->ErrorMessage.c_str());
3916  }
3917 
3918  // Next check if there is a Level Zero build log.
3919  if (Program->ZeBuildLog) {
3920  size_t LogSize = ParamValueSize;
3921  ZE_CALL(zeModuleBuildLogGetString,
3922  (Program->ZeBuildLog, &LogSize, pi_cast<char *>(ParamValue)));
3923  if (ParamValueSizeRet) {
3924  *ParamValueSizeRet = LogSize;
3925  }
3926  return PI_SUCCESS;
3927  }
3928 
3929  // Otherwise, there is no error. The OpenCL spec says to return an empty
3930  // string if there ws no previous attempt to compile, build, or link the
3931  // program.
3932  return ReturnValue("");
3933  } else {
3934  zePrint("piProgramGetBuildInfo: unsupported ParamName\n");
3935  return PI_ERROR_INVALID_VALUE;
3936  }
3937  return PI_SUCCESS;
3938 }
3939 
3941  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3942  Program->RefCount.increment();
3943  return PI_SUCCESS;
3944 }
3945 
3947  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3948 
3949  if (!Program->RefCount.decrementAndTest())
3950  return PI_SUCCESS;
3951 
3952  delete Program;
3953 
3954  return PI_SUCCESS;
3955 }
3956 
3958  pi_native_handle *NativeHandle) {
3959  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3960  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3961 
3962  auto ZeModule = pi_cast<ze_module_handle_t *>(NativeHandle);
3963 
3964  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3965  switch (Program->State) {
3966  case _pi_program::Exe: {
3967  *ZeModule = Program->ZeModule;
3968  break;
3969  }
3970 
3971  default:
3972  return PI_ERROR_INVALID_OPERATION;
3973  }
3974 
3975  return PI_SUCCESS;
3976 }
3977 
3980  bool ownNativeHandle,
3981  pi_program *Program) {
3982  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3983  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3984  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3985 
3986  auto ZeModule = pi_cast<ze_module_handle_t>(NativeHandle);
3987 
3988  // We assume here that programs created from a native handle always
3989  // represent a fully linked executable (state Exe) and not an unlinked
3990  // executable (state Object).
3991 
3992  try {
3993  *Program =
3994  new _pi_program(_pi_program::Exe, Context, ZeModule, ownNativeHandle);
3995  } catch (const std::bad_alloc &) {
3996  return PI_ERROR_OUT_OF_HOST_MEMORY;
3997  } catch (...) {
3998  return PI_ERROR_UNKNOWN;
3999  }
4000  return PI_SUCCESS;
4001 }
4002 
4004  // According to Level Zero Specification, all kernels and build logs
4005  // must be destroyed before the Module can be destroyed. So, be sure
4006  // to destroy build log before destroying the module.
4007  if (ZeBuildLog) {
4008  ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog));
4009  }
4010 
4011  if (ZeModule && OwnZeModule) {
4012  ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule));
4013  }
4014 }
4015 
4016 // Check to see if a Level Zero module has any unresolved symbols.
4017 //
4018 // @param ZeModule The module handle to check.
4019 // @param ZeBuildLog If there are unresolved symbols, this build log handle is
4020 // modified to receive information telling which symbols
4021 // are unresolved.
4022 //
4023 // @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved
4024 // symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other
4025 // value indicates there was an error and we cannot tell if symbols are
4026 // resolved.
4027 static ze_result_t
4029  ze_module_build_log_handle_t *ZeBuildLog) {
4030 
4031  // First check to see if the module has any imported symbols. If there are
4032  // no imported symbols, it's not possible to have any unresolved symbols. We
4033  // do this check first because we assume it's faster than the call to
4034  // zeModuleDynamicLink below.
4035  ZeStruct<ze_module_properties_t> ZeModuleProps;
4036  ze_result_t ZeResult =
4037  ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps));
4038  if (ZeResult != ZE_RESULT_SUCCESS)
4039  return ZeResult;
4040 
4041  // If there are imported symbols, attempt to "link" the module with itself.
4042  // As a side effect, this will return the error
4043  // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols.
4044  if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) {
4045  return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog));
4046  }
4047  return ZE_RESULT_SUCCESS;
4048 }
4049 
4050 pi_result piKernelCreate(pi_program Program, const char *KernelName,
4051  pi_kernel *RetKernel) {
4052 
4053  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4054  PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE);
4055  PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE);
4056 
4057  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4058  if (Program->State != _pi_program::Exe) {
4059  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4060  }
4061 
4062  ZeStruct<ze_kernel_desc_t> ZeKernelDesc;
4063  ZeKernelDesc.flags = 0;
4064  ZeKernelDesc.pKernelName = KernelName;
4065 
4066  ze_kernel_handle_t ZeKernel;
4067  ZE_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel));
4068 
4069  try {
4070  *RetKernel = new _pi_kernel(ZeKernel, true, Program);
4071  } catch (const std::bad_alloc &) {
4072  return PI_ERROR_OUT_OF_HOST_MEMORY;
4073  } catch (...) {
4074  return PI_ERROR_UNKNOWN;
4075  }
4076 
4077  PI_CALL((*RetKernel)->initialize());
4078  return PI_SUCCESS;
4079 }
4080 
4082  // Retain the program and context to show it's used by this kernel.
4083  PI_CALL(piProgramRetain(Program));
4084  if (IndirectAccessTrackingEnabled)
4085  // TODO: do piContextRetain without the guard
4086  PI_CALL(piContextRetain(Program->Context));
4087 
4088  // Set up how to obtain kernel properties when needed.
4089  ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) {
4090  ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties));
4091  };
4092 
4093  // Cache kernel name.
4094  ZeKernelName.Compute = [this](std::string &Name) {
4095  size_t Size = 0;
4096  ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr));
4097  char *KernelName = new char[Size];
4098  ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName));
4099  Name = KernelName;
4100  delete[] KernelName;
4101  };
4102 
4103  return PI_SUCCESS;
4104 }
4105 
4106 pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize,
4107  const void *ArgValue) {
4108 
4109  // OpenCL: "the arg_value pointer can be NULL or point to a NULL value
4110  // in which case a NULL value will be used as the value for the argument
4111  // declared as a pointer to global or constant memory in the kernel"
4112  //
4113  // We don't know the type of the argument but it seems that the only time
4114  // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument
4115  // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL.
4116  if (ArgSize == sizeof(void *) && ArgValue &&
4117  *(void **)(const_cast<void *>(ArgValue)) == nullptr) {
4118  ArgValue = nullptr;
4119  }
4120 
4121  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4122 
4123  std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4124  ZE_CALL(zeKernelSetArgumentValue,
4125  (pi_cast<ze_kernel_handle_t>(Kernel->ZeKernel),
4126  pi_cast<uint32_t>(ArgIndex), pi_cast<size_t>(ArgSize),
4127  pi_cast<const void *>(ArgValue)));
4128 
4129  return PI_SUCCESS;
4130 }
4131 
4132 // Special version of piKernelSetArg to accept pi_mem.
4134  const pi_mem *ArgValue) {
4135  // TODO: the better way would probably be to add a new PI API for
4136  // extracting native PI object from PI handle, and have SYCL
4137  // RT pass that directly to the regular piKernelSetArg (and
4138  // then remove this piextKernelSetArgMemObj).
4139 
4140  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4141 
4142  // We don't yet know the device where this kernel will next be run on.
4143  // Thus we can't know the actual memory allocation that needs to be used.
4144  // Remember the memory object being used as an argument for this kernel
4145  // to process it later when the device is known (at the kernel enqueue).
4146  //
4147  // TODO: for now we have to conservatively assume the access as read-write.
4148  // Improve that by passing SYCL buffer accessor type into
4149  // piextKernelSetArgMemObj.
4150  //
4151  std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4152  // The ArgValue may be a NULL pointer in which case a NULL value is used for
4153  // the kernel argument declared as a pointer to global or constant memory.
4154  auto Arg = ArgValue ? *ArgValue : nullptr;
4155  Kernel->PendingArguments.push_back(
4156  {ArgIndex, sizeof(void *), Arg, _pi_mem::read_write});
4157 
4158  return PI_SUCCESS;
4159 }
4160 
4161 // Special version of piKernelSetArg to accept pi_sampler.
4163  const pi_sampler *ArgValue) {
4164  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4165 
4166  std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4167  ZE_CALL(zeKernelSetArgumentValue,
4168  (pi_cast<ze_kernel_handle_t>(Kernel->ZeKernel),
4169  pi_cast<uint32_t>(ArgIndex), sizeof(void *),
4170  &(*ArgValue)->ZeSampler));
4171 
4172  return PI_SUCCESS;
4173 }
4174 
4176  size_t ParamValueSize, void *ParamValue,
4177  size_t *ParamValueSizeRet) {
4178  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4179 
4180  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4181 
4182  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4183  switch (ParamName) {
4185  return ReturnValue(pi_context{Kernel->Program->Context});
4187  return ReturnValue(pi_program{Kernel->Program});
4189  try {
4190  std::string &KernelName = *Kernel->ZeKernelName.operator->();
4191  return ReturnValue(static_cast<const char *>(KernelName.c_str()));
4192  } catch (const std::bad_alloc &) {
4193  return PI_ERROR_OUT_OF_HOST_MEMORY;
4194  } catch (...) {
4195  return PI_ERROR_UNKNOWN;
4196  }
4198  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->numKernelArgs});
4200  return ReturnValue(pi_uint32{Kernel->RefCount.load()});
4202  try {
4203  uint32_t Size;
4204  ZE_CALL(zeKernelGetSourceAttributes, (Kernel->ZeKernel, &Size, nullptr));
4205  char *attributes = new char[Size];
4206  ZE_CALL(zeKernelGetSourceAttributes,
4207  (Kernel->ZeKernel, &Size, &attributes));
4208  auto Res = ReturnValue(attributes);
4209  delete[] attributes;
4210  return Res;
4211  } catch (const std::bad_alloc &) {
4212  return PI_ERROR_OUT_OF_HOST_MEMORY;
4213  } catch (...) {
4214  return PI_ERROR_UNKNOWN;
4215  }
4216  default:
4217  zePrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n",
4218  ParamName, ParamName);
4219  return PI_ERROR_INVALID_VALUE;
4220  }
4221 
4222  return PI_SUCCESS;
4223 }
4224 
4226  pi_kernel_group_info ParamName,
4227  size_t ParamValueSize, void *ParamValue,
4228  size_t *ParamValueSizeRet) {
4229  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4230  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
4231 
4232  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4233 
4234  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4235  switch (ParamName) {
4237  // TODO: To revisit after level_zero/issues/262 is resolved
4238  struct {
4239  size_t Arr[3];
4240  } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
4241  Device->ZeDeviceComputeProperties->maxGroupSizeY,
4242  Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
4243  return ReturnValue(WorkSize);
4244  }
4246  // As of right now, L0 is missing API to query kernel and device specific
4247  // max work group size.
4248  return ReturnValue(
4249  pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize});
4250  }
4252  struct {
4253  size_t Arr[3];
4254  } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX,
4255  Kernel->ZeKernelProperties->requiredGroupSizeY,
4256  Kernel->ZeKernelProperties->requiredGroupSizeZ}};
4257  return ReturnValue(WgSize);
4258  }
4260  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->localMemSize});
4262  return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth});
4263  }
4265  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->privateMemSize});
4267  die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not "
4268  "implemented\n");
4269  break;
4270  }
4271  default:
4272  zePrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n",
4273  ParamName, ParamName);
4274  return PI_ERROR_INVALID_VALUE;
4275  }
4276  return PI_SUCCESS;
4277 }
4278 
4280  pi_kernel_sub_group_info ParamName,
4281  size_t InputValueSize, const void *InputValue,
4282  size_t ParamValueSize, void *ParamValue,
4283  size_t *ParamValueSizeRet) {
4284  (void)Device;
4285  (void)InputValueSize;
4286  (void)InputValue;
4287 
4288  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4289 
4290  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4291  if (ParamName == PI_KERNEL_MAX_SUB_GROUP_SIZE) {
4292  ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize});
4293  } else if (ParamName == PI_KERNEL_MAX_NUM_SUB_GROUPS) {
4294  ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups});
4295  } else if (ParamName == PI_KERNEL_COMPILE_NUM_SUB_GROUPS) {
4296  ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups});
4297  } else if (ParamName == PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL) {
4298  ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize});
4299  } else {
4300  die("piKernelGetSubGroupInfo: parameter not implemented");
4301  return {};
4302  }
4303  return PI_SUCCESS;
4304 }
4305 
4307 
4308  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4309 
4310  Kernel->RefCount.increment();
4311  return PI_SUCCESS;
4312 }
4313 
4315  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4316 
4317  if (!Kernel->RefCount.decrementAndTest())
4318  return PI_SUCCESS;
4319 
4320  auto KernelProgram = Kernel->Program;
4321  if (Kernel->OwnZeKernel)
4322  ZE_CALL(zeKernelDestroy, (Kernel->ZeKernel));
4323  if (IndirectAccessTrackingEnabled) {
4324  PI_CALL(piContextRelease(KernelProgram->Context));
4325  }
4326  // do a release on the program this kernel was part of
4327  PI_CALL(piProgramRelease(KernelProgram));
4328  delete Kernel;
4329 
4330  return PI_SUCCESS;
4331 }
4332 
4333 pi_result
4335  const size_t *GlobalWorkOffset,
4336  const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
4337  pi_uint32 NumEventsInWaitList,
4338  const pi_event *EventWaitList, pi_event *OutEvent) {
4339  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4340  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
4341  PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);
4342 
4343  // Lock automatically releases when this goes out of scope.
4344  std::scoped_lock<pi_shared_mutex, pi_shared_mutex, pi_shared_mutex> Lock(
4345  Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
4346  if (GlobalWorkOffset != NULL) {
4347  if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) {
4348  zePrint("No global offset extension found on this driver\n");
4349  return PI_ERROR_INVALID_VALUE;
4350  }
4351 
4352  ZE_CALL(zeKernelSetGlobalOffsetExp,
4353  (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
4354  GlobalWorkOffset[2]));
4355  }
4356 
4357  // If there are any pending arguments set them now.
4358  for (auto &Arg : Kernel->PendingArguments) {
4359  // The ArgValue may be a NULL pointer in which case a NULL value is used for
4360  // the kernel argument declared as a pointer to global or constant memory.
4361  char **ZeHandlePtr = nullptr;
4362  if (Arg.Value) {
4363  PI_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
4364  Queue->Device));
4365  }
4366  ZE_CALL(zeKernelSetArgumentValue,
4367  (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
4368  }
4369  Kernel->PendingArguments.clear();
4370 
4371  ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
4372  uint32_t WG[3];
4373 
4374  // global_work_size of unused dimensions must be set to 1
4375  PI_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, PI_ERROR_INVALID_VALUE);
4376  PI_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, PI_ERROR_INVALID_VALUE);
4377 
4378  if (LocalWorkSize) {
4379  WG[0] = pi_cast<uint32_t>(LocalWorkSize[0]);
4380  WG[1] = pi_cast<uint32_t>(LocalWorkSize[1]);
4381  WG[2] = pi_cast<uint32_t>(LocalWorkSize[2]);
4382  } else {
4383  // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
4384  // values do not fit to 32-bit that the API only supports currently.
4385  bool SuggestGroupSize = true;
4386  for (int I : {0, 1, 2}) {
4387  if (GlobalWorkSize[I] > UINT32_MAX) {
4388  SuggestGroupSize = false;
4389  }
4390  }
4391  if (SuggestGroupSize) {
4392  ZE_CALL(zeKernelSuggestGroupSize,
4393  (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
4394  GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
4395  } else {
4396  for (int I : {0, 1, 2}) {
4397  // Try to find a I-dimension WG size that the GlobalWorkSize[I] is
4398  // fully divisable with. Start with the max possible size in
4399  // each dimension.
4400  uint32_t GroupSize[] = {
4401  Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
4402  Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
4403  Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
4404  GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]);
4405  while (GlobalWorkSize[I] % GroupSize[I]) {
4406  --GroupSize[I];
4407  }
4408  if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) {
4409  zePrint("piEnqueueKernelLaunch: can't find a WG size "
4410  "suitable for global work size > UINT32_MAX\n");
4411  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4412  }
4413  WG[I] = GroupSize[I];
4414  }
4415  zePrint("piEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n",
4416  WG[0], WG[1], WG[2]);
4417  }
4418  }
4419 
4420  // TODO: assert if sizes do not fit into 32-bit?
4421  switch (WorkDim) {
4422  case 3:
4423  ZeThreadGroupDimensions.groupCountX =
4424  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
4425  ZeThreadGroupDimensions.groupCountY =
4426  pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
4427  ZeThreadGroupDimensions.groupCountZ =
4428  pi_cast<uint32_t>(GlobalWorkSize[2] / WG[2]);
4429  break;
4430  case 2:
4431  ZeThreadGroupDimensions.groupCountX =
4432  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
4433  ZeThreadGroupDimensions.groupCountY =
4434  pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
4435  WG[2] = 1;
4436  break;
4437  case 1:
4438  ZeThreadGroupDimensions.groupCountX =
4439  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
4440  WG[1] = WG[2] = 1;
4441  break;
4442 
4443  default:
4444  zePrint("piEnqueueKernelLaunch: unsupported work_dim\n");
4445  return PI_ERROR_INVALID_VALUE;
4446  }
4447 
4448  // Error handling for non-uniform group size case
4449  if (GlobalWorkSize[0] !=
4450  size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
4451  zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
4452  "multiple of the group size in the 1st dimension\n");
4453  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4454  }
4455  if (GlobalWorkSize[1] !=
4456  size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
4457  zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
4458  "multiple of the group size in the 2nd dimension\n");
4459  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4460  }
4461  if (GlobalWorkSize[2] !=
4462  size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
4463  zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
4464  "multiple of the group size in the 3rd dimension\n");
4465  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4466  }
4467 
4468  ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
4469 
4470  bool UseCopyEngine = false;
4471  _pi_ze_event_list_t TmpWaitList;
4472  if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
4473  NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
4474  return Res;
4475 
4476  // Get a new command list to be used on this call
4477  pi_command_list_ptr_t CommandList{};
4478  if (auto Res = Queue->Context->getAvailableCommandList(
4479  Queue, CommandList, UseCopyEngine, true /* AllowBatching */))
4480  return Res;
4481 
4482  ze_event_handle_t ZeEvent = nullptr;
4483  pi_event InternalEvent;
4484  bool IsInternal = OutEvent == nullptr;
4485  pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
4487  Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal);
4488  if (Res != PI_SUCCESS)
4489  return Res;
4490  ZeEvent = (*Event)->ZeEvent;
4491  (*Event)->WaitList = TmpWaitList;
4492 
4493  // Save the kernel in the event, so that when the event is signalled
4494  // the code can do a piKernelRelease on this kernel.
4495  (*Event)->CommandData = (void *)Kernel;
4496 
4497  // Increment the reference count of the Kernel and indicate that the Kernel is
4498  // in use. Once the event has been signalled, the code in
4499  // CleanupCompletedEvent(Event) will do a piReleaseKernel to update the
4500  // reference count on the kernel, using the kernel saved in CommandData.
4501  PI_CALL(piKernelRetain(Kernel));
4502 
4503  // Add to list of kernels to be submitted
4504  if (IndirectAccessTrackingEnabled)
4505  Queue->KernelsToBeSubmitted.push_back(Kernel);
4506 
4507  if (Queue->Device->ImmCommandListUsed && IndirectAccessTrackingEnabled) {
4508  // If using immediate commandlists then gathering of indirect
4509  // references and appending to the queue (which means submission)
4510  // must be done together.
4511  std::unique_lock<pi_shared_mutex> ContextsLock(
4512  Queue->Device->Platform->ContextsMutex, std::defer_lock);
4513  // We are going to submit kernels for execution. If indirect access flag is
4514  // set for a kernel then we need to make a snapshot of existing memory
4515  // allocations in all contexts in the platform. We need to lock the mutex
4516  // guarding the list of contexts in the platform to prevent creation of new
4517  // memory alocations in any context before we submit the kernel for
4518  // execution.
4519  ContextsLock.lock();
4520  Queue->CaptureIndirectAccesses();
4521  // Add the command to the command list, which implies submission.
4522  ZE_CALL(zeCommandListAppendLaunchKernel,
4523  (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
4524  ZeEvent, (*Event)->WaitList.Length,
4525  (*Event)->WaitList.ZeEventList));
4526  } else {
4527  // Add the command to the command list for later submission.
4528  // No lock is needed here, unlike the immediate commandlist case above,
4529  // because the kernels are not actually submitted yet. Kernels will be
4530  // submitted only when the comamndlist is closed. Then, a lock is held.
4531  ZE_CALL(zeCommandListAppendLaunchKernel,
4532  (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
4533  ZeEvent, (*Event)->WaitList.Length,
4534  (*Event)->WaitList.ZeEventList));
4535  }
4536 
4537  zePrint("calling zeCommandListAppendLaunchKernel() with"
4538  " ZeEvent %#llx\n",
4539  pi_cast<std::uintptr_t>(ZeEvent));
4540  printZeEventList((*Event)->WaitList);
4541 
4542  // Execute command list asynchronously, as the event will be used
4543  // to track down its completion.
4544  if (auto Res = Queue->executeCommandList(CommandList, false, true))
4545  return Res;
4546 
4547  return PI_SUCCESS;
4548 }
4549 
4552  pi_program Program,
4553  bool OwnNativeHandle,
4554  pi_kernel *Kernel) {
4555  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4556  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4557  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4558  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4559 
4560  auto ZeKernel = pi_cast<ze_kernel_handle_t>(NativeHandle);
4561  *Kernel = new _pi_kernel(ZeKernel, OwnNativeHandle, Program);
4562  PI_CALL((*Kernel)->initialize());
4563  return PI_SUCCESS;
4564 }
4565 
4567  pi_native_handle *NativeHandle) {
4568  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4569  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4570 
4571  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4572  auto *ZeKernel = pi_cast<ze_kernel_handle_t *>(NativeHandle);
4573  *ZeKernel = Kernel->ZeKernel;
4574  return PI_SUCCESS;
4575 }
4576 
4577 //
4578 // Events
4579 //
4580 pi_result
4582  PI_ASSERT(Queue, PI_ERROR_INVALID_EVENT);
4583 
4584  std::scoped_lock<pi_shared_mutex, pi_shared_mutex> Lock(Queue->Mutex,
4585  this->Mutex);
4586 
4587  if (!HostVisibleEvent) {
4588  if (Queue->Device->ZeEventsScope != OnDemandHostVisibleProxy)
4589  die("getOrCreateHostVisibleEvent: missing host-visible event");
4590 
4591  // Submit the command(s) signalling the proxy event to the queue.
4592  // We have to first submit a wait for the device-only event for which this
4593  // proxy is created.
4594  //
4595  // Get a new command list to be used on this call
4596 
4597  // We want to batch these commands to avoid extra submissions (costly)
4598  bool OkToBatch = true;
4599 
4600  pi_command_list_ptr_t CommandList{};
4601  if (auto Res = Queue->Context->getAvailableCommandList(
4602  Queue, CommandList, false /* UseCopyEngine */, OkToBatch))
4603  return Res;
4604 
4605  // Create a "proxy" host-visible event.
4606  auto Res = createEventAndAssociateQueue(
4607  Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList,
4608  /* IsInternal */ false, /* ForceHostVisible */ true);
4609  if (Res != PI_SUCCESS)
4610  return Res;
4611 
4612  ZE_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent));
4613  ZE_CALL(zeCommandListAppendSignalEvent,
4614  (CommandList->first, HostVisibleEvent->ZeEvent));
4615 
4616  if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch))
4617  return Res;
4618  }
4619 
4620  ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
4621  return PI_SUCCESS;
4622 }
4623 
4625  Queue = nullptr;
4626  CleanedUp = false;
4627  Completed = false;
4628  CommandData = nullptr;
4629  CommandType = PI_COMMAND_TYPE_USER;
4630  WaitList = {};
4631  RefCountExternal = 0;
4632  RefCount.reset();
4633  CommandList = std::nullopt;
4634 
4635  if (!isHostVisible())
4636  HostVisibleEvent = nullptr;
4637 
4638  ZE_CALL(zeEventHostReset, (ZeEvent));
4639  return PI_SUCCESS;
4640 }
4641 
4643  bool WithProfiling) {
4644  std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
4645  auto Cache = getEventCache(HostVisible, WithProfiling);
4646  if (Cache->empty())
4647  return nullptr;
4648 
4649  auto It = Cache->begin();
4650  pi_event Event = *It;
4651  Cache->erase(It);
4652  // We have to reset event before using it.
4653  Event->reset();
4654  return Event;
4655 }
4656 
4658  std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
4659  auto Cache =
4660  getEventCache(Event->isHostVisible(), Event->isProfilingEnabled());
4661  Cache->emplace_back(Event);
4662 }
4663 
4664 // Helper function for creating a PI event.
4665 // The "Queue" argument specifies the PI queue where a command is submitted.
4666 // The "HostVisible" argument specifies if event needs to be allocated from
4667 // a host-visible pool.
4668 //
4670  bool HostVisible, pi_event *RetEvent) {
4671  bool ProfilingEnabled =
4672  !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0;
4673 
4674  if (auto CachedEvent =
4675  Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) {
4676  *RetEvent = CachedEvent;
4677  return PI_SUCCESS;
4678  }
4679 
4680  ze_event_handle_t ZeEvent;
4681  ze_event_pool_handle_t ZeEventPool = {};
4682 
4683  size_t Index = 0;
4684 
4685  if (auto Res = Context->getFreeSlotInExistingOrNewPool(
4686  ZeEventPool, Index, HostVisible, ProfilingEnabled))
4687  return Res;
4688 
4689  ZeStruct<ze_event_desc_t> ZeEventDesc;
4690  ZeEventDesc.index = Index;
4691  ZeEventDesc.wait = 0;
4692 
4693  if (HostVisible) {
4694  ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4695  } else {
4696  //
4697  // Set the scope to "device" for every event. This is sufficient for
4698  // global device access and peer device access. If needed to be seen on
4699  // the host we are doing special handling, see EventsScope options.
4700  //
4701  // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
4702  // used in some circumstances.
4703  //
4704  ZeEventDesc.signal = 0;
4705  }
4706 
4707  ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
4708 
4709  try {
4710  PI_ASSERT(RetEvent, PI_ERROR_INVALID_VALUE);
4711 
4712  *RetEvent = new _pi_event(ZeEvent, ZeEventPool, Context,
4713  PI_COMMAND_TYPE_USER, true);
4714  } catch (const std::bad_alloc &) {
4715  return PI_ERROR_OUT_OF_HOST_MEMORY;
4716  } catch (...) {
4717  return PI_ERROR_UNKNOWN;
4718  }
4719 
4720  if (HostVisible)
4721  (*RetEvent)->HostVisibleEvent = *RetEvent;
4722 
4723  return PI_SUCCESS;
4724 }
4725 
4726 // External PI API entry
4728  pi_result Result = EventCreate(Context, nullptr, true, RetEvent);
4729  (*RetEvent)->RefCountExternal++;
4730  if (Result != PI_SUCCESS)
4731  return Result;
4732  ZE_CALL(zeEventHostSignal, ((*RetEvent)->ZeEvent));
4733  return PI_SUCCESS;
4734 }
4735 
4737  size_t ParamValueSize, void *ParamValue,
4738  size_t *ParamValueSizeRet) {
4739 
4740  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
4741 
4742  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4743  switch (ParamName) {
4745  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4746  return ReturnValue(pi_queue{Event->Queue});
4747  }
4748  case PI_EVENT_INFO_CONTEXT: {
4749  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4750  return ReturnValue(pi_context{Event->Context});
4751  }
4753  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4754  return ReturnValue(pi_cast<pi_uint64>(Event->CommandType));
4755  }
4757  // Check to see if the event's Queue has an open command list due to
4758  // batching. If so, go ahead and close and submit it, because it is
4759  // possible that this is trying to query some event's status that
4760  // is part of the batch. This isn't strictly required, but it seems
4761  // like a reasonable thing to do.
4762  auto Queue = Event->Queue;
4763  if (Queue) {
4764  // Lock automatically releases when this goes out of scope.
4765  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
4766  const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
4767  if (OpenCommandList != Queue->CommandListMap.end()) {
4768  if (auto Res = Queue->executeOpenCommandList(
4769  OpenCommandList->second.isCopy(Queue)))
4770  return Res;
4771  }
4772  }
4773 
4774  // Level Zero has a much more explicit notion of command submission than
4775  // OpenCL. It doesn't happen unless the user submits a command list. We've
4776  // done it just above so the status is at least PI_EVENT_RUNNING.
4777  pi_int32 Result = PI_EVENT_RUNNING;
4778 
4779  // Make sure that we query a host-visible event only.
4780  // If one wasn't yet created then don't create it here as well, and
4781  // just conservatively return that event is not yet completed.
4782  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4783  auto HostVisibleEvent = Event->HostVisibleEvent;
4784  if (Event->Completed) {
4785  Result = PI_EVENT_COMPLETE;
4786  } else if (HostVisibleEvent) {
4787  ze_result_t ZeResult;
4788  ZeResult =
4789  ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent));
4790  if (ZeResult == ZE_RESULT_SUCCESS) {
4791  Result = PI_EVENT_COMPLETE;
4792  }
4793  }
4794  return ReturnValue(pi_cast<pi_int32>(Result));
4795  }
4797  return ReturnValue(pi_uint32{Event->RefCount.load()});
4798  default:
4799  zePrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n",
4800  ParamName, ParamName);
4801  return PI_ERROR_INVALID_VALUE;
4802  }
4803 
4804  return PI_SUCCESS;
4805 }
4806 
4808  size_t ParamValueSize, void *ParamValue,
4809  size_t *ParamValueSizeRet) {
4810 
4811  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
4812 
4813  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4814  if (Event->Queue &&
4815  (Event->Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
4816  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
4817  }
4818 
4819  pi_device Device =
4820  Event->Queue ? Event->Queue->Device : Event->Context->Devices[0];
4821 
4822  uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution;
4823  const uint64_t TimestampMaxValue =
4824  ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL);
4825 
4826  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4827 
4828  ze_kernel_timestamp_result_t tsResult;
4829 
4830  switch (ParamName) {
4832  ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
4833  uint64_t ContextStartTime =
4834  (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution;
4835  return ReturnValue(ContextStartTime);
4836  }
4838  ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
4839 
4840  uint64_t ContextStartTime =
4841  (tsResult.global.kernelStart & TimestampMaxValue);
4842  uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue);
4843 
4844  //
4845  // Handle a possible wrap-around (the underlying HW counter is < 64-bit).
4846  // Note, it will not report correct time if there were multiple wrap
4847  // arounds, and the longer term plan is to enlarge the capacity of the
4848  // HW timestamps.
4849  //
4850  if (ContextEndTime <= ContextStartTime) {
4851  ContextEndTime += TimestampMaxValue;
4852  }
4853  ContextEndTime *= ZeTimerResolution;
4854  return ReturnValue(ContextEndTime);
4855  }
4858  // Note: No users for this case
4859  // TODO: Implement commmand submission time when needed,
4860  // by recording device timestamp (using zeDeviceGetGlobalTimestamps)
4861  // before submitting command to device
4862  return ReturnValue(uint64_t{0});
4863  default:
4864  zePrint("piEventGetProfilingInfo: not supported ParamName\n");
4865  return PI_ERROR_INVALID_VALUE;
4866  }
4867 
4868  return PI_SUCCESS;
4869 }
4870 
4871 } // extern "C"
4872 
4873 // Perform any necessary cleanup after an event has been signalled.
4874 // This currently makes sure to release any kernel that may have been used by
4875 // the event, updates the last command event in the queue and cleans up all dep
4876 // events of the event.
4877 // If the caller locks queue mutex then it must pass 'true' to QueueLocked.
4878 static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) {
4879  pi_kernel AssociatedKernel = nullptr;
4880  // List of dependent events.
4881  std::list<pi_event> EventsToBeReleased;
4882  pi_queue AssociatedQueue = nullptr;
4883  {
4884  std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
4885  // Exit early of event was already cleanedup.
4886  if (Event->CleanedUp)
4887  return PI_SUCCESS;
4888 
4889  AssociatedQueue = Event->Queue;
4890 
4891  // Remember the kernel associated with this event if there is one. We are
4892  // going to release it later.
4894  Event->CommandData) {
4895  AssociatedKernel = pi_cast<pi_kernel>(Event->CommandData);
4896  Event->CommandData = nullptr;
4897  }
4898 
4899  // Make a list of all the dependent events that must have signalled
4900  // because this event was dependent on them.
4902  EventsToBeReleased);
4903 
4904  Event->CleanedUp = true;
4905  }
4906 
4907  auto ReleaseIndirectMem = [](pi_kernel Kernel) {
4908  if (IndirectAccessTrackingEnabled) {
4909  // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as
4910  // kernel execution has finished. This is the place where we need to
4911  // release memory allocations. If kernel is not in use (not submitted by
4912  // some other thread) then release referenced memory allocations. As a
4913  // result, memory can be deallocated and context can be removed from
4914  // container in the platform. That's why we need to lock a mutex here.
4915  pi_platform Plt = Kernel->Program->Context->getPlatform();
4916  std::scoped_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex);
4917 
4918  if (--Kernel->SubmissionsCount == 0) {
4919  // Kernel is not submitted for execution, release referenced memory
4920  // allocations.
4921  for (auto &MemAlloc : Kernel->MemAllocs) {
4922  // std::pair<void *const, MemAllocRecord> *, Hash
4923  USMFreeHelper(MemAlloc->second.Context, MemAlloc->first,
4924  MemAlloc->second.OwnZeMemHandle);
4925  }
4926  Kernel->MemAllocs.clear();
4927  }
4928  }
4929  };
4930 
4931  // We've reset event data members above, now cleanup resources.
4932  if (AssociatedKernel) {
4933  ReleaseIndirectMem(AssociatedKernel);
4934  PI_CALL(piKernelRelease(AssociatedKernel));
4935  }
4936 
4937  if (AssociatedQueue) {
4938  {
4939  // Lock automatically releases when this goes out of scope.
4940  std::unique_lock<pi_shared_mutex> QueueLock(AssociatedQueue->Mutex,
4941  std::defer_lock);
4942  if (!QueueLocked)
4943  QueueLock.lock();
4944 
4945  // If this event was the LastCommandEvent in the queue, being used
4946  // to make sure that commands were executed in-order, remove this.
4947  // If we don't do this, the event can get released and freed leaving
4948  // a dangling pointer to this event. It could also cause unneeded
4949  // already finished events to show up in the wait list.
4950  if (AssociatedQueue->LastCommandEvent == Event) {
4951  AssociatedQueue->LastCommandEvent = nullptr;
4952  }
4953  }
4954 
4955  // Release this event since we explicitly retained it on creation and
4956  // association with queue. Events which don't have associated queue doesn't
4957  // require this release because it means that they are not created using
4958  // createEventAndAssociateQueue, i.e. additional retain was not made.
4960  }
4961 
4962  // The list of dependent events will be appended to as we walk it so that this
4963  // algorithm doesn't go recursive due to dependent events themselves being
4964  // dependent on other events forming a potentially very deep tree, and deep
4965  // recursion. That turned out to be a significant problem with the recursive
4966  // code that preceded this implementation.
4967  while (!EventsToBeReleased.empty()) {
4968  pi_event DepEvent = EventsToBeReleased.front();
4969  DepEvent->Completed = true;
4970  EventsToBeReleased.pop_front();
4971 
4972  pi_kernel DepEventKernel = nullptr;
4973  {
4974  std::scoped_lock<pi_shared_mutex> DepEventLock(DepEvent->Mutex);
4976  EventsToBeReleased);
4977  if (IndirectAccessTrackingEnabled) {
4978  // DepEvent has finished, we can release the associated kernel if there
4979  // is one. This is the earliest place we can do this and it can't be
4980  // done twice, so it is safe. Lock automatically releases when this goes
4981  // out of scope.
4982  // TODO: this code needs to be moved out of the guard.
4983  if (DepEvent->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
4984  DepEvent->CommandData) {
4985  DepEventKernel = pi_cast<pi_kernel>(DepEvent->CommandData);
4986  DepEvent->CommandData = nullptr;
4987  }
4988  }
4989  }
4990  if (DepEventKernel) {
4991  ReleaseIndirectMem(DepEventKernel);
4992  PI_CALL(piKernelRelease(DepEventKernel));
4993  }
4994  PI_CALL(piEventReleaseInternal(DepEvent));
4995  }
4996 
4997  return PI_SUCCESS;
4998 }
4999 
5000 extern "C" {
5001 
5002 pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
5003 
5004  if (NumEvents && !EventList) {
5005  return PI_ERROR_INVALID_EVENT;
5006  }
5007  for (uint32_t I = 0; I < NumEvents; I++) {
5008  if (EventList[I]->Queue->Device->ZeEventsScope ==
5010  // Make sure to add all host-visible "proxy" event signals if needed.
5011  // This ensures that all signalling commands are submitted below and
5012  // thus proxy events can be waited without a deadlock.
5013  //
5014  if (!EventList[I]->hasExternalRefs())
5015  die("piEventsWait must not be called for an internal event");
5016 
5017  ze_event_handle_t ZeHostVisibleEvent;
5018  if (auto Res =
5019  EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
5020  return Res;
5021  }
5022  }
5023  // Submit dependent open command lists for execution, if any
5024  for (uint32_t I = 0; I < NumEvents; I++) {
5025  auto Queue = EventList[I]->Queue;
5026  if (Queue) {
5027  // Lock automatically releases when this goes out of scope.
5028  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5029 
5030  if (auto Res = Queue->executeAllOpenCommandLists())
5031  return Res;
5032  }
5033  }
5034  std::unordered_set<pi_queue> Queues;
5035  for (uint32_t I = 0; I < NumEvents; I++) {
5036  {
5037  {
5038  std::shared_lock<pi_shared_mutex> EventLock(EventList[I]->Mutex);
5039  if (!EventList[I]->hasExternalRefs())
5040  die("piEventsWait must not be called for an internal event");
5041 
5042  if (!EventList[I]->Completed) {
5043  auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
5044  if (!HostVisibleEvent)
5045  die("The host-visible proxy event missing");
5046 
5047  ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
5048  zePrint("ZeEvent = %#llx\n", pi_cast<std::uintptr_t>(ZeEvent));
5049  ZE_CALL(zeHostSynchronize, (ZeEvent));
5050  EventList[I]->Completed = true;
5051  }
5052  }
5053  if (auto Q = EventList[I]->Queue) {
5054  if (Q->Device->ImmCommandListUsed && Q->isInOrderQueue())
5055  // Use information about waited event to cleanup completed events in
5056  // the in-order queue.
5057  CleanupEventsInImmCmdLists(EventList[I]->Queue,
5058  /* QueueLocked */ false,
5059  /* QueueSynced */ false, EventList[I]);
5060  else {
5061  // NOTE: we are cleaning up after the event here to free resources
5062  // sooner in case run-time is not calling piEventRelease soon enough.
5063  CleanupCompletedEvent(EventList[I]);
5064  // For the case when we have out-of-order queue or regular command
5065  // lists its more efficient to check fences so put the queue in the
5066  // set to cleanup later.
5067  Queues.insert(Q);
5068  }
5069  }
5070  }
5071  }
5072 
5073  // We waited some events above, check queue for signaled command lists and
5074  // reset them.
5075  for (auto &Q : Queues)
5076  resetCommandLists(Q);
5077 
5078  return PI_SUCCESS;
5079 }
5080 
5081 pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType,
5082  void (*PFnNotify)(pi_event Event,
5083  pi_int32 EventCommandStatus,
5084  void *UserData),
5085  void *UserData) {
5086  (void)Event;
5087  (void)CommandExecCallbackType;
5088  (void)PFnNotify;
5089  (void)UserData;
5090  die("piEventSetCallback: deprecated, to be removed");
5091  return PI_SUCCESS;
5092 }
5093 
5094 pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) {
5095  (void)Event;
5096  (void)ExecutionStatus;
5097  die("piEventSetStatus: deprecated, to be removed");
5098  return PI_SUCCESS;
5099 }
5100 
5102  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5103  Event->RefCountExternal++;
5104  Event->RefCount.increment();
5105  return PI_SUCCESS;
5106 }
5107 
5109  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5110  Event->RefCountExternal--;
5112  return PI_SUCCESS;
5113 }
5114 
5116  Event->RefCount.increment();
5117  Events.push_back(Event);
5118 }
5119 
5121  for (const auto &Event : Events)
5123  Events.clear();
5124  return PI_SUCCESS;
5125 }
5126 
5128  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5129 
5130  if (!Event->RefCount.decrementAndTest())
5131  return PI_SUCCESS;
5132 
5134  Event->CommandData) {
5135  // Free the memory allocated in the piEnqueueMemBufferMap.
5136  if (auto Res = ZeMemFreeHelper(Event->Context, Event->CommandData))
5137  return Res;
5138  Event->CommandData = nullptr;
5139  }
5140  if (Event->OwnZeEvent) {
5141  if (DisableEventsCaching) {
5142  ZE_CALL(zeEventDestroy, (Event->ZeEvent));
5143  auto Context = Event->Context;
5144  if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
5145  return Res;
5146  }
5147  }
5148  // It is possible that host-visible event was never created.
5149  // In case it was check if that's different from this same event
5150  // and release a reference to it.
5151  if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) {
5152  // Decrement ref-count of the host-visible proxy event.
5154  }
5155 
5156  // Save pointer to the queue before deleting/resetting event.
5157  // When we add an event to the cache we need to check whether profiling is
5158  // enabled or not, so we access properties of the queue and that's why queue
5159  // must released later.
5160  auto Queue = Event->Queue;
5161  if (DisableEventsCaching || !Event->OwnZeEvent) {
5162  delete Event;
5163  } else {
5164  Event->Context->addEventToContextCache(Event);
5165  }
5166 
5167  // We intentionally incremented the reference counter when an event is
5168  // created so that we can avoid pi_queue is released before the associated
5169  // pi_event is released. Here we have to decrement it so pi_queue
5170  // can be released successfully.
5171  if (Queue) {
5173  }
5174 
5175  return PI_SUCCESS;
5176 }
5177 
5179  pi_native_handle *NativeHandle) {
5180  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5181  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5182 
5183  {
5184  std::shared_lock<pi_shared_mutex> Lock(Event->Mutex);
5185  auto *ZeEvent = pi_cast<ze_event_handle_t *>(NativeHandle);
5186  *ZeEvent = Event->ZeEvent;
5187  }
5188  // Event can potentially be in an open command-list, make sure that
5189  // it is submitted for execution to avoid potential deadlock if
5190  // interop app is going to wait for it.
5191  auto Queue = Event->Queue;
5192  if (Queue) {
5193  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5194  const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
5195  if (OpenCommandList != Queue->CommandListMap.end()) {
5196  if (auto Res = Queue->executeOpenCommandList(
5197  OpenCommandList->second.isCopy(Queue)))
5198  return Res;
5199  }
5200  }
5201  return PI_SUCCESS;
5202 }
5203 
5206  bool OwnNativeHandle,
5207  pi_event *Event) {
5208  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
5209  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5210  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5211 
5212  auto ZeEvent = pi_cast<ze_event_handle_t>(NativeHandle);
5213  *Event = new _pi_event(ZeEvent, nullptr /* ZeEventPool */, Context,
5214  PI_COMMAND_TYPE_USER, OwnNativeHandle);
5215 
5216  // Assume native event is host-visible, or otherwise we'd
5217  // need to create a host-visible proxy for it.
5218  (*Event)->HostVisibleEvent = *Event;
5219 
5220  // Unlike regular events managed by SYCL RT we don't have to wait for interop
5221  // events completion, and not need to do the their `cleanup()`. This in
5222  // particular guarantees that the extra `piEventRelease` is not called on
5223  // them. That release is needed to match the `piEventRetain` of regular events
5224  // made for waiting for event completion, but not this interop event.
5225  (*Event)->CleanedUp = true;
5226 
5227  return PI_SUCCESS;
5228 }
5229 
5230 //
5231 // Sampler
5232 //
5234  const pi_sampler_properties *SamplerProperties,
5235  pi_sampler *RetSampler) {
5236 
5237  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
5238  PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE);
5239 
5240  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
5241 
5242  // Have the "0" device in context to own the sampler. Rely on Level-Zero
5243  // drivers to perform migration as necessary for sharing it across multiple
5244  // devices in the context.
5245  //
5246  // TODO: figure out if we instead need explicit copying for acessing
5247  // the sampler from other devices in the context.
5248  //
5250 
5251  ze_sampler_handle_t ZeSampler;
5252  ZeStruct<ze_sampler_desc_t> ZeSamplerDesc;
5253 
5254  // Set the default values for the ZeSamplerDesc.
5255  ZeSamplerDesc.isNormalized = PI_TRUE;
5256  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP;
5257  ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
5258 
5259  // Update the values of the ZeSamplerDesc from the pi_sampler_properties list.
5260  // Default values will be used if any of the following is true:
5261  // a) SamplerProperties list is NULL
5262  // b) SamplerProperties list is missing any properties
5263 
5264  if (SamplerProperties) {
5265  const pi_sampler_properties *CurProperty = SamplerProperties;
5266 
5267  while (*CurProperty != 0) {
5268  switch (*CurProperty) {
5270  pi_bool CurValueBool = pi_cast<pi_bool>(*(++CurProperty));
5271 
5272  if (CurValueBool == PI_TRUE)
5273  ZeSamplerDesc.isNormalized = PI_TRUE;
5274  else if (CurValueBool == PI_FALSE)
5275  ZeSamplerDesc.isNormalized = PI_FALSE;
5276  else {
5277  zePrint("piSamplerCreate: unsupported "
5278  "PI_SAMPLER_NORMALIZED_COORDS value\n");
5279  return PI_ERROR_INVALID_VALUE;
5280  }
5281  } break;
5282 
5284  pi_sampler_addressing_mode CurValueAddressingMode =
5285  pi_cast<pi_sampler_addressing_mode>(
5286  pi_cast<pi_uint32>(*(++CurProperty)));
5287 
5288  // Level Zero runtime with API version 1.2 and lower has a bug:
5289  // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to
5290  // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to
5291  // border", i.e. logic is flipped. Starting from API version 1.3 this
5292  // problem is going to be fixed. That's why check for API version to set
5293  // an address mode.
5294  ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion;
5295  // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE
5296  switch (CurValueAddressingMode) {
5298  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE;
5299  break;
5301  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT;
5302  break;
5304  ZeSamplerDesc.addressMode =
5305  ZeApiVersion < ZE_MAKE_VERSION(1, 3)
5306  ? ZE_SAMPLER_ADDRESS_MODE_CLAMP
5307  : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER;
5308  break;
5310  ZeSamplerDesc.addressMode =
5311  ZeApiVersion < ZE_MAKE_VERSION(1, 3)
5312  ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER
5313  : ZE_SAMPLER_ADDRESS_MODE_CLAMP;
5314  break;
5316  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR;
5317  break;
5318  default:
5319  zePrint("piSamplerCreate: unsupported PI_SAMPLER_ADDRESSING_MODE "
5320  "value\n");
5321  zePrint("PI_SAMPLER_ADDRESSING_MODE=%d\n", CurValueAddressingMode);
5322  return PI_ERROR_INVALID_VALUE;
5323  }
5324  } break;
5325 
5327  pi_sampler_filter_mode CurValueFilterMode =
5328  pi_cast<pi_sampler_filter_mode>(
5329  pi_cast<pi_uint32>(*(++CurProperty)));
5330 
5331  if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST)
5332  ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
5333  else if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_LINEAR)
5334  ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR;
5335  else {
5336  zePrint("PI_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode);
5337  zePrint(
5338  "piSamplerCreate: unsupported PI_SAMPLER_FILTER_MODE value\n");
5339  return PI_ERROR_INVALID_VALUE;
5340  }
5341  } break;
5342 
5343  default:
5344  break;
5345  }
5346  CurProperty++;
5347  }
5348  }
5349 
5350  ZE_CALL(zeSamplerCreate, (Context->ZeContext, Device->ZeDevice,
5351  &ZeSamplerDesc, // TODO: translate properties
5352  &ZeSampler));
5353 
5354  try {
5355  *RetSampler = new _pi_sampler(ZeSampler);
5356  } catch (const std::bad_alloc &) {
5357  return PI_ERROR_OUT_OF_HOST_MEMORY;
5358  } catch (...) {
5359  return PI_ERROR_UNKNOWN;
5360  }
5361  return PI_SUCCESS;
5362 }
5363 
5365  size_t ParamValueSize, void *ParamValue,
5366  size_t *ParamValueSizeRet) {
5367  (void)Sampler;
5368  (void)ParamName;
5369  (void)ParamValueSize;
5370  (void)ParamValue;
5371  (void)ParamValueSizeRet;
5372 
5373  die("piSamplerGetInfo: not implemented");
5374  return {};
5375 }
5376 
5378  PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER);
5379 
5380  Sampler->RefCount.increment();
5381  return PI_SUCCESS;
5382 }
5383 
5385  PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER);
5386 
5387  if (!Sampler->RefCount.decrementAndTest())
5388  return PI_SUCCESS;
5389 
5390  ZE_CALL(zeSamplerDestroy, (Sampler->ZeSampler));
5391  delete Sampler;
5392 
5393  return PI_SUCCESS;
5394 }
5395 
5396 //
5397 // Queue Commands
5398 //
5400  const pi_event *EventWaitList,
5401  pi_event *OutEvent) {
5402 
5403  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5404 
5405  if (EventWaitList) {
5406  PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE);
5407 
5408  bool UseCopyEngine = false;
5409 
5410  // Lock automatically releases when this goes out of scope.
5411  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5412 
5413  _pi_ze_event_list_t TmpWaitList = {};
5414  if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
5415  NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
5416  return Res;
5417 
5418  // Get a new command list to be used on this call
5419  pi_command_list_ptr_t CommandList{};
5420  if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList,
5421  UseCopyEngine))
5422  return Res;
5423 
5424  ze_event_handle_t ZeEvent = nullptr;
5425  pi_event InternalEvent;
5426  bool IsInternal = OutEvent == nullptr;
5427  pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5428  auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER,
5429  CommandList, IsInternal);
5430  if (Res != PI_SUCCESS)
5431  return Res;
5432 
5433  ZeEvent = (*Event)->ZeEvent;
5434  (*Event)->WaitList = TmpWaitList;
5435 
5436  const auto &WaitList = (*Event)->WaitList;
5437  auto ZeCommandList = CommandList->first;
5438  ZE_CALL(zeCommandListAppendWaitOnEvents,
5439  (ZeCommandList, WaitList.Length, WaitList.ZeEventList));
5440 
5441  ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));
5442 
5443  // Execute command list asynchronously as the event will be used
5444  // to track down its completion.
5445  return Queue->executeCommandList(CommandList);
5446  }
5447 
5448  {
5449  // If wait-list is empty, then this particular command should wait until
5450  // all previous enqueued commands to the command-queue have completed.
5451  //
5452  // TODO: find a way to do that without blocking the host.
5453 
5454  // Lock automatically releases when this goes out of scope.
5455  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5456 
5457  if (OutEvent) {
5458  auto Res = createEventAndAssociateQueue(
5459  Queue, OutEvent, PI_COMMAND_TYPE_USER, Queue->CommandListMap.end());
5460  if (Res != PI_SUCCESS)
5461  return Res;
5462  }
5463 
5464  Queue->synchronize();
5465 
5466  if (OutEvent) {
5467  Queue->LastCommandEvent = *OutEvent;
5468 
5469  ZE_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent));
5470  (*OutEvent)->Completed = true;
5471  }
5472  }
5473 
5474  if (!Queue->Device->ImmCommandListUsed)
5475  resetCommandLists(Queue);
5476 
5477  return PI_SUCCESS;
5478 }
5479 
5481  pi_uint32 NumEventsInWaitList,
5482  const pi_event *EventWaitList,
5483  pi_event *OutEvent) {
5484  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5485 
5486  // Lock automatically releases when this goes out of scope.
5487  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5488 
5489  // Helper function for appending a barrier to a command list.
5490  auto insertBarrierIntoCmdList =
5491  [&Queue](pi_command_list_ptr_t CmdList,
5492  const _pi_ze_event_list_t &EventWaitList, pi_event &Event,
5493  bool IsInternal) {
5494  if (auto Res = createEventAndAssociateQueue(
5495  Queue, &Event, PI_COMMAND_TYPE_USER, CmdList, IsInternal))
5496  return Res;
5497 
5498  Event->WaitList = EventWaitList;
5499  ZE_CALL(zeCommandListAppendBarrier,
5500  (CmdList->first, Event->ZeEvent, EventWaitList.Length,
5501  EventWaitList.ZeEventList));
5502  return PI_SUCCESS;
5503  };
5504 
5505  pi_event InternalEvent;
5506  bool IsInternal = OutEvent == nullptr;
5507  pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5508 
5509  // Indicator for whether batching is allowed. This may be changed later in
5510  // this function, but allow it by default.
5511  bool OkToBatch = true;
5512 
5513  // If we have a list of events to make the barrier from, then we can create a
5514  // barrier on these and use the resulting event as our future barrier.
5515  // We use the same approach if
5516  // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
5517  // positive value.
5518  // We use the same approach if we have in-order queue because every command
5519  // depends on previous one, so we don't need to insert barrier to multiple
5520  // command lists.
5521  if (NumEventsInWaitList || !UseMultipleCmdlistBarriers ||
5522  Queue->isInOrderQueue()) {
5523  // Retain the events as they will be owned by the result event.
5524  _pi_ze_event_list_t TmpWaitList;
5525  if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
5526  NumEventsInWaitList, EventWaitList, Queue,
5527  /*UseCopyEngine=*/false))
5528  return Res;
5529 
5530  // Get an arbitrary command-list in the queue.
5531  pi_command_list_ptr_t CmdList;
5532  if (auto Res = Queue->Context->getAvailableCommandList(
5533  Queue, CmdList,
5534  /*UseCopyEngine=*/false, OkToBatch))
5535  return Res;
5536 
5537  // Insert the barrier into the command-list and execute.
5538  if (auto Res =
5539  insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal))
5540  return Res;
5541 
5542  if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch))
5543  return Res;
5544 
5545  // Because of the dependency between commands in the in-order queue we don't
5546  // need to keep track of any active barriers if we have in-order queue.
5547  if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) {
5548  Queue->ActiveBarriers.add(*Event);
5549  }
5550  return PI_SUCCESS;
5551  }
5552 
5553  // Since there are no events to explicitly create a barrier for, we are
5554  // inserting a queue-wide barrier.
5555 
5556  // Command list(s) for putting barriers.
5557  std::vector<pi_command_list_ptr_t> CmdLists;
5558 
5559  // There must be at least one L0 queue.
5560  auto &InitialComputeGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
5561  auto &InitialCopyGroup = Queue->CopyQueueGroupsByTID.begin()->second;
5562  PI_ASSERT(!InitialComputeGroup.ZeQueues.empty() ||
5563  !InitialCopyGroup.ZeQueues.empty(),
5564  PI_ERROR_INVALID_QUEUE);
5565 
5566  size_t NumQueues = 0;
5567  for (auto &QueueMap :
5569  for (auto &QueueGroup : QueueMap)
5570  NumQueues += QueueGroup.second.ZeQueues.size();
5571 
5572  OkToBatch = true;
5573  // Get an available command list tied to each command queue. We need
5574  // these so a queue-wide barrier can be inserted into each command
5575  // queue.
5576  CmdLists.reserve(NumQueues);
5577  for (auto &QueueMap :
5579  for (auto &QueueGroup : QueueMap) {
5580  bool UseCopyEngine =
5581  QueueGroup.second.Type != _pi_queue::queue_type::Compute;
5582  if (Queue->Device->ImmCommandListUsed) {
5583  // If immediate command lists are being used, each will act as their own
5584  // queue, so we must insert a barrier into each.
5585  for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists)
5586  if (ImmCmdList != Queue->CommandListMap.end())
5587  CmdLists.push_back(ImmCmdList);
5588  } else {
5589  for (auto ZeQueue : QueueGroup.second.ZeQueues) {
5590  if (ZeQueue) {
5591  pi_command_list_ptr_t CmdList;
5592  if (auto Res = Queue->Context->getAvailableCommandList(
5593  Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue))
5594  return Res;
5595  CmdLists.push_back(CmdList);
5596  }
5597  }
5598  }
5599  }
5600 
5601  // If no activity has occurred on the queue then there will be no cmdlists.
5602  // We need one for generating an Event, so create one.
5603  if (CmdLists.size() == 0) {
5604  // Get any available command list.
5605  pi_command_list_ptr_t CmdList;
5606  if (auto Res = Queue->Context->getAvailableCommandList(
5607  Queue, CmdList,
5608  /*UseCopyEngine=*/false, OkToBatch))
5609  return Res;
5610  CmdLists.push_back(CmdList);
5611  }
5612 
5613  if (CmdLists.size() > 1) {
5614  // Insert a barrier into each unique command queue using the available
5615  // command-lists.
5616  std::vector<pi_event> EventWaitVector(CmdLists.size());
5617  for (size_t I = 0; I < CmdLists.size(); ++I) {
5618  if (auto Res =
5619  insertBarrierIntoCmdList(CmdLists[I], _pi_ze_event_list_t{},
5620  EventWaitVector[I], /*IsInternal*/ true))
5621  return Res;
5622  }
5623  // If there were multiple queues we need to create a "convergence" event to
5624  // be our active barrier. This convergence event is signalled by a barrier
5625  // on all the events from the barriers we have inserted into each queue.
5626  // Use the first command list as our convergence command list.
5627  pi_command_list_ptr_t &ConvergenceCmdList = CmdLists[0];
5628 
5629  // Create an event list. It will take ownership over all relevant events so
5630  // we relinquish ownership and let it keep all events it needs.
5631  _pi_ze_event_list_t BaseWaitList;
5632  if (auto Res = BaseWaitList.createAndRetainPiZeEventList(
5633  EventWaitVector.size(), EventWaitVector.data(), Queue,
5634  ConvergenceCmdList->second.isCopy(Queue)))
5635  return Res;
5636 
5637  // Insert a barrier with the events from each command-queue into the
5638  // convergence command list. The resulting event signals the convergence of
5639  // all barriers.
5640  if (auto Res = insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList,
5641  *Event, IsInternal))
5642  return Res;
5643  } else {
5644  // If there is only a single queue then insert a barrier and the single
5645  // result event can be used as our active barrier and used as the return
5646  // event. Take into account whether output event is discarded or not.
5647  if (auto Res = insertBarrierIntoCmdList(CmdLists[0], _pi_ze_event_list_t{},
5648  *Event, IsInternal))
5649  return Res;
5650  }
5651 
5652  // Execute each command list so the barriers can be encountered.
5653  for (pi_command_list_ptr_t &CmdList : CmdLists)
5654  if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch))
5655  return Res;
5656 
5657  if (auto Res = Queue->ActiveBarriers.clear())
5658  return Res;
5659  Queue->ActiveBarriers.add(*Event);
5660  return PI_SUCCESS;
5661 }
5662 
5664  pi_bool BlockingRead, size_t Offset,
5665  size_t Size, void *Dst,
5666  pi_uint32 NumEventsInWaitList,
5667  const pi_event *EventWaitList,
5668  pi_event *Event) {
5669  PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT);
5670  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5671 
5672  std::shared_lock<pi_shared_mutex> SrcLock(Src->Mutex, std::defer_lock);
5673  std::scoped_lock<std::shared_lock<pi_shared_mutex>, pi_shared_mutex> LockAll(
5674  SrcLock, Queue->Mutex);
5675 
5676  char *ZeHandleSrc;
5677  PI_CALL(Src->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device));
5679  BlockingRead, Size, ZeHandleSrc + Offset,
5680  NumEventsInWaitList, EventWaitList, Event,
5681  /* PreferCopyEngine */ true);
5682 }
5683 
5685  pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead,
5686  pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset,
5687  pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch,
5688  size_t HostRowPitch, size_t HostSlicePitch, void *Ptr,
5689  pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
5690  pi_event *Event) {
5691 
5692  PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT);
5693  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5694 
5695  std::shared_lock<pi_shared_mutex> SrcLock(Buffer->Mutex, std::defer_lock);
5696  std::scoped_lock<std::shared_lock<pi_shared_mutex>, pi_shared_mutex> LockAll(
5697  SrcLock, Queue->Mutex);
5698 
5699  char *ZeHandleSrc;
5700  PI_CALL(Buffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device));
5701  return enqueueMemCopyRectHelper(
5702  PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc,
5703  static_cast<char *>(Ptr), BufferOffset, HostOffset, Region,
5704  BufferRowPitch, HostRowPitch, BufferSlicePitch, HostSlicePitch,
5705  BlockingRead, NumEventsInWaitList, EventWaitList, Event);
5706 }
5707 
5708 } // extern "C"
5709 
5711  auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second;
5712  return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 &&
5713  (!isInOrderQueue() || UseCopyEngineForInOrderQueue);
5714 }
5715 
5716 // Wait on all operations in flight on this Queue.
5717 // The caller is expected to hold a lock on the Queue.
5718 // For standard commandlists sync the L0 queues directly.
5719 // For immediate commandlists add barriers to all commandlists associated
5720 // with the Queue. An alternative approach would be to wait on all Events
5721 // associated with the in-flight operations.
5722 // TODO: Event release in immediate commandlist mode is driven by the SYCL
5723 // runtime. Need to investigate whether relase can be done earlier, at sync
5724 // points such as this, to reduce total number of active Events.
5726  if (!Healthy)
5727  return PI_SUCCESS;
5728 
5729  auto syncImmCmdList = [](_pi_queue *Queue, pi_command_list_ptr_t ImmCmdList) {
5730  if (ImmCmdList == Queue->CommandListMap.end())
5731  return PI_SUCCESS;
5732 
5733  pi_event Event;
5735  Queue, &Event, PI_COMMAND_TYPE_USER, ImmCmdList, false);
5736  if (Res != PI_SUCCESS)
5737  return Res;
5738  auto zeEvent = Event->ZeEvent;
5739  ZE_CALL(zeCommandListAppendBarrier,
5740  (ImmCmdList->first, zeEvent, 0, nullptr));
5741  ZE_CALL(zeHostSynchronize, (zeEvent));
5742  Event->Completed = true;
5743  PI_CALL(piEventRelease(Event));
5744 
5745  // Cleanup all events from the synced command list.
5746  auto EventListToCleanup = std::move(ImmCmdList->second.EventList);
5747  ImmCmdList->second.EventList.clear();
5748  CleanupEventListFromResetCmdList(EventListToCleanup, true);
5749  return PI_SUCCESS;
5750  };
5751 
5752  // Do nothing if the queue is empty
5753  if (!LastCommandEvent)
5754  return PI_SUCCESS;
5755 
5756  // For in-order queue just wait for the last command.
5757  if (isInOrderQueue()) {
5758  ZE_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
5759  } else {
5760  // Otherwise sync all L0 queues/immediate command-lists.
5761  for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) {
5762  for (auto &QueueGroup : QueueMap) {
5763  if (Device->ImmCommandListUsed) {
5764  for (auto ImmCmdList : QueueGroup.second.ImmCmdLists)
5765  syncImmCmdList(this, ImmCmdList);
5766  } else {
5767  for (auto &ZeQueue : QueueGroup.second.ZeQueues)
5768  if (ZeQueue)
5769  ZE_CALL(zeHostSynchronize, (ZeQueue));
5770  }
5771  }
5772  }
5773  }
5774  LastCommandEvent = nullptr;
5775 
5776  // With the entire queue synchronized, the active barriers must be done so we
5777  // can remove them.
5778  if (auto Res = ActiveBarriers.clear())
5779  return Res;
5780 
5781  return PI_SUCCESS;
5782 }
5783 
5784 // Shared by all memory read/write/copy PI interfaces.
5785 // PI interfaces must have queue's and destination buffer's mutexes locked for
5786 // exclusive use and source buffer's mutex locked for shared use on entry.
5787 static pi_result
5788 enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst,
5789  pi_bool BlockingWrite, size_t Size, const void *Src,
5790  pi_uint32 NumEventsInWaitList,
5791  const pi_event *EventWaitList, pi_event *OutEvent,
5792  bool PreferCopyEngine) {
5793  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5794 
5795  bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine);
5796 
5797  _pi_ze_event_list_t TmpWaitList;
5798  if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
5799  NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
5800  return Res;
5801 
5802  // We want to batch these commands to avoid extra submissions (costly)
5803  bool OkToBatch = true;
5804 
5805  // Get a new command list to be used on this call
5806  pi_command_list_ptr_t CommandList{};
5807  if (auto Res = Queue->Context->getAvailableCommandList(
5808  Queue, CommandList, UseCopyEngine, OkToBatch))
5809  return Res;
5810 
5811  ze_event_handle_t ZeEvent = nullptr;
5812  pi_event InternalEvent;
5813  bool IsInternal = OutEvent == nullptr;
5814  pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5815  auto Res = createEventAndAssociateQueue(Queue, Event, CommandType,
5816  CommandList, IsInternal);
5817  if (Res != PI_SUCCESS)
5818  return Res;
5819  ZeEvent = (*Event)->ZeEvent;
5820  (*Event)->WaitList = TmpWaitList;
5821 
5822  const auto &ZeCommandList = CommandList->first;
5823  const auto &WaitList = (*Event)->WaitList;
5824 
5825  zePrint("calling zeCommandListAppendMemoryCopy() with\n"
5826  " ZeEvent %#llx\n",
5827  pi_cast<std::uintptr_t>(ZeEvent));
5828  printZeEventList(WaitList);
5829 
5830  ZE_CALL(zeCommandListAppendMemoryCopy,
5831  (ZeCommandList, Dst, Src, Size, ZeEvent, WaitList.Length,
5832  WaitList.ZeEventList));
5833 
5834  if (auto Res =
5835  Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch))
5836  return Res;
5837 
5838  return PI_SUCCESS;
5839 }
5840 
5841 // Shared by all memory read/write/copy rect PI interfaces.
5842 // PI interfaces must have queue's and destination buffer's mutexes locked for
5843 //