DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_level_zero.cpp
//===-------- pi_level_zero.cpp - Level Zero Plugin --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===------------------------------------------------------------------===//

#include "pi_level_zero.hpp"
#include <algorithm>
#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <sycl/detail/spinlock.hpp>
#include <thread>
#include <utility>

#include <level_zero/zet_api.h>

#include "usm_allocator.hpp"

extern "C" {
// Forward declarations.
static pi_result EventCreate(pi_context Context, pi_queue Queue,
                             bool HostVisible, pi_event *RetEvent);
}

// Defined in tracing.cpp
void enableZeTracing();
void disableZeTracing();

namespace {

// Controls serialization of Level Zero calls to work around the Level Zero
// driver not being multi-thread ready. Recognized values (can be used as a
// bit mask):
enum {
  ZeSerializeNone =
      0, // no locking or blocking (except when SYCL RT requested blocking)
  ZeSerializeLock = 1, // locking around each ZE_CALL
  ZeSerializeBlock =
      2, // blocking ZE calls, where supported (usually in enqueue commands)
};
static const pi_uint32 ZeSerialize = [] {
  const char *SerializeMode = std::getenv("ZE_SERIALIZE");
  const pi_uint32 SerializeModeValue =
      SerializeMode ? std::atoi(SerializeMode) : 0;
  return SerializeModeValue;
}();
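
// For example, ZE_SERIALIZE=1 takes the global lock around each ZE_CALL,
// ZE_SERIALIZE=2 requests blocking (synchronous) ZE calls where supported,
// and ZE_SERIALIZE=3 combines both bits.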

// This is an experimental option to test performance of device to device copy
// operations on copy engines (versus the compute engine).
static const bool UseCopyEngineForD2DCopy = [] {
  const char *CopyEngineForD2DCopy =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY");
  return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0));
}();

// This is an experimental option that allows the use of a copy engine, if
// available in the device, in the Level Zero plugin for copy operations
// submitted to an in-order queue. The default is 1.
static const bool UseCopyEngineForInOrderQueue = [] {
  const char *CopyEngineForInOrderQueue =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
  return (!CopyEngineForInOrderQueue ||
          (std::stoi(CopyEngineForInOrderQueue) != 0));
}();

// This is an experimental option that allows the use of multiple command lists
// when submitting barriers. The default is 1.
static const bool UseMultipleCmdlistBarriers = [] {
  const char *UseMultipleCmdlistBarriersFlag =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS");
  if (!UseMultipleCmdlistBarriersFlag)
    return true;
  return std::stoi(UseMultipleCmdlistBarriersFlag) > 0;
}();

// This is an experimental option that allows disabling caching of events in
// the context.
static const bool DisableEventsCaching = [] {
  const char *DisableEventsCachingFlag =
      std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
  if (!DisableEventsCachingFlag)
    return false;
  return std::stoi(DisableEventsCachingFlag) != 0;
}();

// This class encapsulates actions taken along with a call to Level Zero API.
class ZeCall {
private:
  // The global mutex that is used for total serialization of Level Zero calls.
  static std::mutex GlobalLock;

public:
  ZeCall() {
    if ((ZeSerialize & ZeSerializeLock) != 0) {
      GlobalLock.lock();
    }
  }
  ~ZeCall() {
    if ((ZeSerialize & ZeSerializeLock) != 0) {
      GlobalLock.unlock();
    }
  }

  // The non-static version just calls the static one.
  ze_result_t doCall(ze_result_t ZeResult, const char *ZeName,
                     const char *ZeArgs, bool TraceError = true);
};
std::mutex ZeCall::GlobalLock;

// Controls PI level tracing prints.
static bool PrintPiTrace = false;

// Controls support of the indirect access kernels and deferred memory release.
static const bool IndirectAccessTrackingEnabled = [] {
  return std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY") !=
         nullptr;
}();

// Map Level Zero runtime error codes to PI error codes.
static pi_result mapError(ze_result_t ZeResult) {
  // TODO: this mapping needs to be clarified and synced with the PI API
  // return values, which are TBD.
  static std::unordered_map<ze_result_t, pi_result> ErrorMapping = {
      {ZE_RESULT_SUCCESS, PI_SUCCESS},
      {ZE_RESULT_ERROR_DEVICE_LOST, PI_ERROR_DEVICE_NOT_FOUND},
      {ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, PI_ERROR_INVALID_OPERATION},
      {ZE_RESULT_ERROR_NOT_AVAILABLE, PI_ERROR_INVALID_OPERATION},
      {ZE_RESULT_ERROR_UNINITIALIZED, PI_ERROR_INVALID_PLATFORM},
      {ZE_RESULT_ERROR_INVALID_ARGUMENT, PI_ERROR_INVALID_ARG_VALUE},
      {ZE_RESULT_ERROR_INVALID_NULL_POINTER, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_INVALID_SIZE, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, PI_ERROR_INVALID_EVENT},
      {ZE_RESULT_ERROR_INVALID_ENUMERATION, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, PI_ERROR_INVALID_BINARY},
      {ZE_RESULT_ERROR_INVALID_KERNEL_NAME, PI_ERROR_INVALID_KERNEL_NAME},
      {ZE_RESULT_ERROR_INVALID_FUNCTION_NAME, PI_ERROR_BUILD_PROGRAM_FAILURE},
      {ZE_RESULT_ERROR_OVERLAPPING_REGIONS, PI_ERROR_INVALID_OPERATION},
      {ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION,
       PI_ERROR_INVALID_WORK_GROUP_SIZE},
      {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, PI_ERROR_BUILD_PROGRAM_FAILURE},
      {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, PI_ERROR_OUT_OF_RESOURCES},
      {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, PI_ERROR_OUT_OF_HOST_MEMORY}};

  auto It = ErrorMapping.find(ZeResult);
  if (It == ErrorMapping.end()) {
    return PI_ERROR_UNKNOWN;
  }
  return It->second;
}

// This will count the calls to Level-Zero.
static std::map<const char *, int> *ZeCallCount = nullptr;

// Trace a call to Level-Zero RT.
#define ZE_CALL(ZeName, ZeArgs)                                                \
  {                                                                            \
    ze_result_t ZeResult = ZeName ZeArgs;                                      \
    if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true))       \
      return mapError(Result);                                                 \
  }

#define ZE_CALL_NOCHECK(ZeName, ZeArgs)                                        \
  ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false)

// Trace an internal PI call; returns in case of an error.
#define PI_CALL(Call)                                                          \
  {                                                                            \
    if (PrintPiTrace)                                                          \
      fprintf(stderr, "PI ---> %s\n", #Call);                                  \
    pi_result Result = (Call);                                                 \
    if (Result != PI_SUCCESS)                                                  \
      return Result;                                                           \
  }
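
// An illustrative sketch (hypothetical helper, not part of the plugin) of how
// these macros are meant to be used: both ZE_CALL and PI_CALL early-return
// from an enclosing function that itself returns pi_result.
[[maybe_unused]] static pi_result
exampleSynchronizeQueue(ze_command_queue_handle_t ZeQueue) {
  // On failure this expands to "return mapError(<ze_result>);".
  ZE_CALL(zeCommandQueueSynchronize, (ZeQueue, UINT64_MAX));
  return PI_SUCCESS;
}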

enum DebugLevel {
  ZE_DEBUG_NONE = 0x0,
  ZE_DEBUG_BASIC = 0x1,
  ZE_DEBUG_VALIDATION = 0x2,
  ZE_DEBUG_CALL_COUNT = 0x4,
  ZE_DEBUG_ALL = -1
};

// Controls Level Zero calls tracing.
static const int ZeDebug = [] {
  const char *DebugMode = std::getenv("ZE_DEBUG");
  return DebugMode ? std::atoi(DebugMode) : ZE_DEBUG_NONE;
}();

static void zePrint(const char *Format, ...) {
  if (ZeDebug & ZE_DEBUG_BASIC) {
    va_list Args;
    va_start(Args, Format);
    vfprintf(stderr, Format, Args);
    va_end(Args);
  }
}
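
// The DebugLevel values form a bit mask: for example, ZE_DEBUG=5 enables both
// basic tracing (0x1) and Level Zero call counting (0x4), while ZE_DEBUG=-1
// turns everything on.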

// Controls whether we choose to do eager initialization
// so that it happens on warmup paths and the reportable
// paths are less likely affected.
//
static bool doEagerInit = [] {
  const char *EagerInit = std::getenv("SYCL_EAGER_INIT");
  return EagerInit ? std::atoi(EagerInit) != 0 : false;
}();

// Maximum number of events that can be present in an event ZePool is captured
// here. Setting it to 256 gave the best possible performance for several
// benchmarks.
static const pi_uint32 MaxNumEventsPerPool = [] {
  const auto MaxNumEventsPerPoolEnv =
      std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
  pi_uint32 Result =
      MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256;
  if (Result <= 0)
    Result = 256;
  return Result;
}();

// Helper function to implement zeHostSynchronize.
// The behavior is to avoid infinite wait during host sync under ZE_DEBUG.
// This allows for a much more responsive debugging of hangs.
//
template <typename T, typename Func>
ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) {
  if (!ZeDebug) {
    return Api(Handle, UINT64_MAX);
  }

  ze_result_t R;
  while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY)
    ;
  return R;
}

// Template function to do various types of host synchronizations.
// This is intended to be used instead of direct calls to specific
// Level-Zero synchronization APIs.
//
template <typename T> ze_result_t zeHostSynchronize(T Handle);
template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) {
  return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle);
}
template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) {
  return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle);
}
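
// The same pattern extends to any Level Zero handle that has a
// host-synchronize entry point. For instance, a fence variant would look like
// the sketch below (this specialization is illustrative only and is not used
// elsewhere in this file).
template <> ze_result_t zeHostSynchronize(ze_fence_handle_t Handle) {
  return zeHostSynchronizeImpl(zeFenceHostSynchronize, Handle);
}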

template <typename T, typename Assign>
pi_result getInfoImpl(size_t param_value_size, void *param_value,
                      size_t *param_value_size_ret, T value, size_t value_size,
                      Assign &&assign_func) {

  if (param_value != nullptr) {

    if (param_value_size < value_size) {
      return PI_ERROR_INVALID_VALUE;
    }

    assign_func(param_value, value, value_size);
  }

  if (param_value_size_ret != nullptr) {
    *param_value_size_ret = value_size;
  }

  return PI_SUCCESS;
}

template <typename T>
pi_result getInfo(size_t param_value_size, void *param_value,
                  size_t *param_value_size_ret, T value) {

  auto assignment = [](void *param_value, T value, size_t value_size) {
    (void)value_size;
    *static_cast<T *>(param_value) = value;
  };

  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, sizeof(T), assignment);
}

template <typename T>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, array_length * sizeof(T), memcpy);
}

template <typename T, typename RetType>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  if (param_value) {
    memset(param_value, 0, param_value_size);
    for (uint32_t I = 0; I < array_length; I++)
      ((RetType *)param_value)[I] = (RetType)value[I];
  }
  if (param_value_size_ret)
    *param_value_size_ret = array_length * sizeof(RetType);
  return PI_SUCCESS;
}

template <>
pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret,
                                const char *value) {
  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
                      param_value_size_ret, value);
}

class ReturnHelper {
public:
  ReturnHelper(size_t param_value_size, void *param_value,
               size_t *param_value_size_ret)
      : param_value_size(param_value_size), param_value(param_value),
        param_value_size_ret(param_value_size_ret) {}

  template <class T> pi_result operator()(const T &t) {
    return getInfo(param_value_size, param_value, param_value_size_ret, t);
  }

private:
  size_t param_value_size;
  void *param_value;
  size_t *param_value_size_ret;
};
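
// A sketch (hypothetical, not called by the plugin) of how the helpers above
// implement the OpenCL-style three-parameter query protocol: the caller may
// pass any combination of a destination buffer with its size and an out
// parameter receiving the required size.
[[maybe_unused]] static pi_result
exampleStringQuery(size_t ParamValueSize, void *ParamValue,
                   size_t *ParamValueSizeRet) {
  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
  // Routes through getInfo<const char *>, which copies strlen+1 bytes.
  return ReturnValue("example");
}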

} // anonymous namespace

// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
// which case all compute commands will be submitted to the command-queue
// with the given index in the compute command group. If it is instead set
// to a negative value then all available compute engines may be used.
//
// The default value is "0".
//
static const std::pair<int, int> getRangeOfAllowedComputeEngines() {
  static const char *EnvVar =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
  // If the environment variable is not set, only use the "0" CCS for now.
  // TODO: allow all CCSs when HW support is complete.
  if (!EnvVar)
    return std::pair<int, int>(0, 0);

  auto EnvVarValue = std::atoi(EnvVar);
  if (EnvVarValue >= 0) {
    return std::pair<int, int>(EnvVarValue, EnvVarValue);
  }

  return std::pair<int, int>(0, INT_MAX);
}

// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or
// a pair of integer values of the form "lower_index:upper_index".
// Here, the indices point to copy engines in a list of all available copy
// engines.
// This function returns that pair of indices.
// If the user specifies only a single integer, a value of 0 indicates that
// the copy engines will not be used at all. A value of 1 indicates that all
// available copy engines can be used.
static const std::pair<int, int>
getRangeOfAllowedCopyEngines(pi_device Device) {
  static const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
  // If the environment variable is not set, no copy engines are used when
  // immediate commandlists are being used. For standard commandlists all are
  // used.
  if (!EnvVar) {
    if (Device->useImmediateCommandLists())
      return std::pair<int, int>(-1, -1); // No copy engines can be used.
    return std::pair<int, int>(0, INT_MAX); // All copy engines will be used.
  }
  std::string CopyEngineRange = EnvVar;
  // The environment variable can be a single integer or a pair of integers
  // separated by ":".
  auto pos = CopyEngineRange.find(":");
  if (pos == std::string::npos) {
    bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0);
    if (UseCopyEngine)
      return std::pair<int, int>(0, INT_MAX); // All copy engines can be used.
    return std::pair<int, int>(-1, -1); // No copy engines will be used.
  }
  int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos));
  int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1));
  if ((LowerCopyEngineIndex > UpperCopyEngineIndex) ||
      (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) {
    zePrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, "
            "default set.\n");
    LowerCopyEngineIndex = 0;
    UpperCopyEngineIndex = INT_MAX;
  }
  return std::pair<int, int>(LowerCopyEngineIndex, UpperCopyEngineIndex);
}
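
// Examples of how the variable is interpreted by the logic above:
//   SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0    -> (-1, -1)      no copy engines
//   SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1    -> (0, INT_MAX)  all copy engines
//   SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=2:4  -> (2, 4)        engines 2 to 4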

static bool CopyEngineRequested(pi_device Device) {
  int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).first;
  int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).second;
  return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1));
}

// Global variables used in PI_Level_Zero.
// Note that we only create simple pointer variables so that the C++ runtime
// won't deallocate them automatically at the end of the main program.
// The heap memory allocated for these global variables is reclaimed only when
// the SYCL RT calls piTearDown().
static std::vector<pi_platform> *PiPlatformsCache =
    new std::vector<pi_platform>;
static sycl::detail::SpinLock *PiPlatformsCacheMutex =
    new sycl::detail::SpinLock;
static bool PiPlatformCachePopulated = false;

// Flags which tell whether various Level Zero extensions are available.

pi_result
_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
                                            size_t &Index, bool HostVisible,
                                            bool ProfilingEnabled) {
  // Lock while updating event pool machinery.
  std::scoped_lock<pi_mutex> Lock(ZeEventPoolCacheMutex);

  std::list<ze_event_pool_handle_t> *ZePoolCache =
      getZeEventPoolCache(HostVisible, ProfilingEnabled);

  if (!ZePoolCache->empty()) {
    if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
      if (DisableEventsCaching) {
        // Remove full pool from the cache if events caching is disabled.
        ZePoolCache->erase(ZePoolCache->begin());
      } else {
        // If event caching is enabled then we don't destroy events so there is
        // no need to remove the pool from the cache and add it back when it
        // has available slots. Just keep it in the tail of the cache so that
        // all pools can be destroyed during context destruction.
        ZePoolCache->push_front(nullptr);
      }
    }
  }
  if (ZePoolCache->empty()) {
    ZePoolCache->push_back(nullptr);
  }

  // We shall be adding an event to the front pool.
  ze_event_pool_handle_t *ZePool = &ZePoolCache->front();
  Index = 0;
  // Create one event ZePool per MaxNumEventsPerPool events.
  if (*ZePool == nullptr) {
    ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
    ZeEventPoolDesc.count = MaxNumEventsPerPool;
    ZeEventPoolDesc.flags = 0;
    if (HostVisible)
      ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    if (ProfilingEnabled)
      ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    zePrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);

    std::vector<ze_device_handle_t> ZeDevices;
    std::for_each(Devices.begin(), Devices.end(), [&](const pi_device &D) {
      ZeDevices.push_back(D->ZeDevice);
    });

    ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(),
                                &ZeDevices[0], ZePool));
    NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1;
    NumEventsUnreleasedInEventPool[*ZePool] = 1;
  } else {
    Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool];
    --NumEventsAvailableInEventPool[*ZePool];
    ++NumEventsUnreleasedInEventPool[*ZePool];
  }
  Pool = *ZePool;
  return PI_SUCCESS;
}
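
// For example, with the default MaxNumEventsPerPool of 256: the first call
// creates a fresh pool and hands out slot 0 (255 slots remain available);
// subsequent calls hand out slots 1..255 from that front pool; once it is
// exhausted, the next call creates another pool. The unreleased-events
// accounting is what later allows a fully released pool to be recycled
// through the cache.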

pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex, std::defer_lock);
  std::scoped_lock<pi_mutex, std::shared_lock<pi_shared_mutex>> LockAll(
      ZeEventPoolCacheMutex, EventLock);
  if (!Event->ZeEventPool) {
    // This must be an interop event created on a user's pool.
    // Do nothing.
    return PI_SUCCESS;
  }

  std::list<ze_event_pool_handle_t> *ZePoolCache =
      getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled());

  // Put the empty pool to the cache of the pools.
  if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
    die("Invalid event release: event pool doesn't have unreleased events");
  if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) {
    if (ZePoolCache->front() != Event->ZeEventPool) {
      ZePoolCache->push_back(Event->ZeEventPool);
    }
    NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
  }

  return PI_SUCCESS;
}

// Some opencl extensions we know are supported by all Level Zero devices.
constexpr char ZE_SUPPORTED_EXTENSIONS[] =
    "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups "
    "cl_intel_subgroups_short cl_intel_required_subgroup_size ";

// Forward declarations
static pi_result
enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst,
                     pi_bool BlockingWrite, size_t Size, const void *Src,
                     pi_uint32 NumEventsInWaitList,
                     const pi_event *EventWaitList, pi_event *Event,
                     bool PreferCopyEngine = false);

static pi_result enqueueMemCopyRectHelper(
    pi_command_type CommandType, pi_queue Queue, void *SrcBuffer,
    void *DstBuffer, pi_buff_rect_offset SrcOrigin,
    pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region,
    size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch,
    size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList,
    const pi_event *EventWaitList, pi_event *Event,
    bool PreferCopyEngine = false);

inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) {
  switch (ZeError) {
#define ZE_ERRCASE(ERR)                                                        \
  case ERR:                                                                    \
    ErrorString = "" #ERR;                                                     \
    break;

    ZE_ERRCASE(ZE_RESULT_SUCCESS)
    ZE_ERRCASE(ZE_RESULT_NOT_READY)
    ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST)
    ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY)
    ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)
    ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS)
    ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE)
    ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE)
    ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN)

#undef ZE_ERRCASE
  default:
    assert(false && "Unexpected Error code");
  } // switch
}

// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR
constexpr size_t MaxMessageSize = 256;
thread_local pi_result ErrorMessageCode = PI_SUCCESS;
thread_local char ErrorMessage[MaxMessageSize];

// Utility function for setting a message and warning
[[maybe_unused]] static void setErrorMessage(const char *message,
                                             pi_result error_code) {
  // Must be strictly less than MaxMessageSize to leave room for the null
  // terminator that strcpy appends.
  assert(strlen(message) < MaxMessageSize);
  strcpy(ErrorMessage, message);
  ErrorMessageCode = error_code;
}

// Returns plugin specific error and warning messages
pi_result piPluginGetLastError(char **message) {
  *message = &ErrorMessage[0];
  return ErrorMessageCode;
}

ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName,
                           const char *ZeArgs, bool TraceError) {
  zePrint("ZE ---> %s%s\n", ZeName, ZeArgs);

  if (ZeDebug & ZE_DEBUG_CALL_COUNT) {
    ++(*ZeCallCount)[ZeName];
  }

  if (ZeResult && TraceError) {
    const char *ErrorString = "Unknown";
    zeParseError(ZeResult, ErrorString);
    zePrint("Error (%s) in %s\n", ErrorString, ZeName);
  }
  return ZeResult;
}

#define PI_ASSERT(condition, error)                                            \
  if (!(condition))                                                            \
    return error;

// This helper function creates a pi_event and associates it with a pi_queue.
// Note that the caller of this function must have acquired a lock on the
// Queue that is passed in.
// \param Queue pi_queue to associate with a new event.
// \param Event a pointer to hold the newly created pi_event
// \param CommandType various command type determined by the caller
// \param CommandList is the command list where the event is added
// \param IsInternal tells if the event is internal, i.e. visible in the L0
//        plugin only.
// \param ForceHostVisible tells if the event must be created in
//        the host-visible pool
static pi_result createEventAndAssociateQueue(
    pi_queue Queue, pi_event *Event, pi_command_type CommandType,
    pi_command_list_ptr_t CommandList, bool IsInternal = false,
    bool ForceHostVisible = false) {

  if (!ForceHostVisible)
    ForceHostVisible = Queue->Device->eventsScope() == AllHostVisible;
  PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event));

  (*Event)->Queue = Queue;
  (*Event)->CommandType = CommandType;

  // Append this Event to the CommandList, if any.
  if (CommandList != Queue->CommandListMap.end()) {
    CommandList->second.append(*Event);
    (*Event)->RefCount.increment();
  }

  // We need to increment the reference counter here to avoid the pi_queue
  // being released before the associated pi_event is released, because
  // piEventRelease requires access to the associated pi_queue.
  // In piEventRelease, the reference counter of the Queue is decremented
  // to release it.
  Queue->RefCount.increment();

  // SYCL RT does not track completion of the events, so it could
  // release a PI event as soon as that's not being waited in the app.
  // But we have to ensure that the event is not destroyed before
  // it is really signalled, so retain it explicitly here and
  // release in CleanupCompletedEvent(Event).
  // If the event is internal then don't increment the reference count as this
  // event will not be waited/released by SYCL RT, so it must be destroyed by
  // EventRelease in resetCommandList.
  if (!IsInternal)
    PI_CALL(piEventRetain(*Event));

  return PI_SUCCESS;
}
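
// A typical call within an enqueue path would look like the following sketch
// (names of the surrounding locals are illustrative):
//   createEventAndAssociateQueue(Queue, &ZeEvent, PI_COMMAND_TYPE_USER,
//                                CommandList, /*IsInternal=*/false);
// The returned event is then signalled by whatever command is appended to
// CommandList next.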

pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
                                 int SubSubDeviceIndex) {
  uint32_t numQueueGroups = 0;
  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
          (ZeDevice, &numQueueGroups, nullptr));
  if (numQueueGroups == 0) {
    return PI_ERROR_UNKNOWN;
  }
  zePrint("NOTE: Number of queue groups = %d\n", numQueueGroups);
  std::vector<ZeStruct<ze_command_queue_group_properties_t>>
      QueueGroupProperties(numQueueGroups);
  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
          (ZeDevice, &numQueueGroups, QueueGroupProperties.data()));

  // Initialize ordinal and compute queue group properties
  for (uint32_t i = 0; i < numQueueGroups; i++) {
    if (QueueGroupProperties[i].flags &
        ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
      QueueGroup[queue_group_info_t::Compute].ZeOrdinal = i;
      QueueGroup[queue_group_info_t::Compute].ZeProperties =
          QueueGroupProperties[i];
      break;
    }
  }

  // Reinitialize a sub-sub-device with its own ordinal, index.
  // Our sub-sub-device representation is currently [Level-Zero sub-device
  // handle + Level-Zero compute group/engine index]. Only the specified
  // index queue will be used to submit work to the sub-sub-device.
  if (SubSubDeviceOrdinal >= 0) {
    QueueGroup[queue_group_info_t::Compute].ZeOrdinal = SubSubDeviceOrdinal;
    QueueGroup[queue_group_info_t::Compute].ZeIndex = SubSubDeviceIndex;
  } else { // Proceed with initialization for root and sub-device
    // How is it possible that there are no "compute" capabilities?
    if (QueueGroup[queue_group_info_t::Compute].ZeOrdinal < 0) {
      return PI_ERROR_UNKNOWN;
    }

    if (CopyEngineRequested(this)) {
      for (uint32_t i = 0; i < numQueueGroups; i++) {
        if (((QueueGroupProperties[i].flags &
              ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) &&
            (QueueGroupProperties[i].flags &
             ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
          if (QueueGroupProperties[i].numQueues == 1) {
            QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i;
            QueueGroup[queue_group_info_t::MainCopy].ZeProperties =
                QueueGroupProperties[i];
          } else {
            QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i;
            QueueGroup[queue_group_info_t::LinkCopy].ZeProperties =
                QueueGroupProperties[i];
            break;
          }
        }
      }
      if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0)
        zePrint("NOTE: main blitter/copy engine is not available\n");
      else
        zePrint("NOTE: main blitter/copy engine is available\n");

      if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0)
        zePrint("NOTE: link blitter/copy engines are not available\n");
      else
        zePrint("NOTE: link blitter/copy engines are available\n");
    }
  }

  // Maintain various device properties cache.
  // Note that we just describe here how to compute the data.
  // The real initialization is upon first access.
  //
  auto ZeDevice = this->ZeDevice;
  ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) {
    ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties));
  };

  ZeDeviceComputeProperties.Compute =
      [ZeDevice](ze_device_compute_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties));
      };

  ZeDeviceImageProperties.Compute =
      [ZeDevice](ze_device_image_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties));
      };

  ZeDeviceModuleProperties.Compute =
      [ZeDevice](ze_device_module_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetModuleProperties, (ZeDevice, &Properties));
      };

  ZeDeviceMemoryProperties.Compute =
      [ZeDevice](
          std::vector<ZeStruct<ze_device_memory_properties_t>> &Properties) {
        uint32_t Count = 0;
        ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties,
                        (ZeDevice, &Count, nullptr));

        Properties.resize(Count);
        ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties,
                        (ZeDevice, &Count, Properties.data()));
      };

  ZeDeviceMemoryAccessProperties.Compute =
      [ZeDevice](ze_device_memory_access_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetMemoryAccessProperties,
                        (ZeDevice, &Properties));
      };

  ZeDeviceCacheProperties.Compute =
      [ZeDevice](ze_device_cache_properties_t &Properties) {
        // TODO: Since v1.0 there can be multiple cache properties.
        // For now remember the first one, if any.
        uint32_t Count = 0;
        ZE_CALL_NOCHECK(zeDeviceGetCacheProperties,
                        (ZeDevice, &Count, nullptr));
        if (Count > 0)
          Count = 1;
        ZE_CALL_NOCHECK(zeDeviceGetCacheProperties,
                        (ZeDevice, &Count, &Properties));
      };

  // Check device id for PVC.
  // TODO: change mechanism for detecting PVC once L0 provides an interface.
  // At present even PVC doesn't automatically use immediate commandlists.
  // Change this after more testing.
  ImmCommandListUsed =
      false; // (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0;

  return PI_SUCCESS;
}

// Get value of device scope events env var setting or -1 if unset
static const int DeviceEventsSetting = [] {
  const char *DeviceEventsSettingStr =
      std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");
  if (!DeviceEventsSettingStr)
    return -1;
  return std::stoi(DeviceEventsSettingStr);
}();

// Controls the scope of events.
// If immediate commandlists are being used then use compatible event scopes.
_pi_device::EventsScope _pi_device::eventsScope() {
  // Set default based on the type of commandlists being used.
  EventsScope Default =
      useImmediateCommandLists() ? OnDemandHostVisibleProxy : AllHostVisible;
  // Override the default if the user has explicitly chosen the events scope.
  switch (DeviceEventsSetting) {
  case 0:
    return AllHostVisible;
  case 1:
    return OnDemandHostVisibleProxy;
  case 2:
    return LastCommandInBatchHostVisible;
  }
  return Default;
}

// Get value of immediate commandlists env var setting or -1 if unset
static const int ImmediateCommandlistsSetting = [] {
  const char *ImmediateCommandlistsSettingStr =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
  if (!ImmediateCommandlistsSettingStr)
    return -1;
  return std::stoi(ImmediateCommandlistsSettingStr);
}();

// Whether immediate commandlists will be used for kernel launches and copies.
// The default is standard commandlists. Setting a value >=1 specifies use of
// immediate commandlists. Note: when immediate commandlists are used then
// device-only events must be either AllHostVisible or OnDemandHostVisibleProxy.
// (See env var SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS).
bool _pi_device::useImmediateCommandLists() {
  return ImmediateCommandlistsSetting == -1
             ? ImmCommandListUsed
             : ImmediateCommandlistsSetting >= 1;
}

pi_device _pi_context::getRootDevice() const {
  assert(Devices.size() > 0);

  if (Devices.size() == 1)
    return Devices[0];

  // Check if we have a context with subdevices of the same device (the
  // context may include the root device itself as well).
  pi_device ContextRootDevice =
      Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];

  // For a context with sub-subdevices, the ContextRootDevice might still
  // not be the root device.
  // Check whether the ContextRootDevice is the subdevice or root device.
  if (ContextRootDevice->isSubDevice()) {
    ContextRootDevice = ContextRootDevice->RootDevice;
  }

  for (auto &Device : Devices) {
    if ((!Device->RootDevice && Device != ContextRootDevice) ||
        (Device->RootDevice && Device->RootDevice != ContextRootDevice)) {
      ContextRootDevice = nullptr;
      break;
    }
  }
  return ContextRootDevice;
}

pi_result _pi_context::initialize() {

  // Helper lambda to create various USM allocators for a device.
  auto createUSMAllocators = [this](pi_device Device) {
    SharedMemAllocContexts.emplace(
        std::piecewise_construct, std::make_tuple(Device),
        std::make_tuple(std::unique_ptr<SystemMemory>(
            new USMSharedMemoryAlloc(this, Device))));
    SharedReadOnlyMemAllocContexts.emplace(
        std::piecewise_construct, std::make_tuple(Device),
        std::make_tuple(std::unique_ptr<SystemMemory>(
            new USMSharedReadOnlyMemoryAlloc(this, Device))));
    DeviceMemAllocContexts.emplace(
        std::piecewise_construct, std::make_tuple(Device),
        std::make_tuple(std::unique_ptr<SystemMemory>(
            new USMDeviceMemoryAlloc(this, Device))));
  };

  // Recursive helper to call createUSMAllocators for all sub-devices
  std::function<void(pi_device)> createUSMAllocatorsRecursive;
  createUSMAllocatorsRecursive =
      [createUSMAllocators,
       &createUSMAllocatorsRecursive](pi_device Device) -> void {
    createUSMAllocators(Device);
    for (auto &SubDevice : Device->SubDevices)
      createUSMAllocatorsRecursive(SubDevice);
  };

  // Create USM allocator context for each pair (device, context).
  //
  for (auto &Device : Devices) {
    createUSMAllocatorsRecursive(Device);
  }
  // Create USM allocator context for host. Device and Shared USM allocations
  // are device-specific. Host allocations are not device-dependent, therefore
  // we don't need a map with the device as key.
  HostMemAllocContext = std::make_unique<USMAllocContext>(
      std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));

  // We may allocate memory to this root device so create allocators.
  if (SingleRootDevice && DeviceMemAllocContexts.find(SingleRootDevice) ==
                              DeviceMemAllocContexts.end()) {
    createUSMAllocators(SingleRootDevice);
  }

  // Create the immediate command list to be used for initializations.
  // Created as synchronous so level-zero performs implicit synchronization and
  // there is no need to query for completion in the plugin.
  //
  // TODO: get rid of using Devices[0] for the context with multiple
  // root-devices. We should somehow make the data initialized on all devices.
  pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0];

  // NOTE: we always submit to the "0" index compute engine with immediate
  // command list since this is one for context.
  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
  ZeCommandQueueDesc.ordinal =
      Device->QueueGroup[_pi_device::queue_group_info_t::Compute].ZeOrdinal;
  ZeCommandQueueDesc.index = 0;
  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
  ZE_CALL(
      zeCommandListCreateImmediate,
      (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit));
  return PI_SUCCESS;
}

pi_result _pi_context::finalize() {
  // This function is called when pi_context is deallocated, piContextRelease.
  // There could be some memory that may have not been deallocated.
  // For example, event and event pool caches would be still alive.

  if (!DisableEventsCaching) {
    std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
    for (auto &EventCache : EventCaches) {
      for (auto Event : EventCache) {
        ZE_CALL(zeEventDestroy, (Event->ZeEvent));
        delete Event;
      }
      EventCache.clear();
    }
  }
  {
    std::scoped_lock<pi_mutex> Lock(ZeEventPoolCacheMutex);
    for (auto &ZePoolCache : ZeEventPoolCache) {
      for (auto &ZePool : ZePoolCache)
        ZE_CALL(zeEventPoolDestroy, (ZePool));
      ZePoolCache.clear();
    }
  }

  // Destroy the command list used for initializations.
  ZE_CALL(zeCommandListDestroy, (ZeCommandListInit));

  std::scoped_lock<pi_mutex> Lock(ZeCommandListCacheMutex);
  for (auto &List : ZeComputeCommandListCache) {
    for (ze_command_list_handle_t &ZeCommandList : List.second) {
      if (ZeCommandList)
        ZE_CALL(zeCommandListDestroy, (ZeCommandList));
    }
  }
  for (auto &List : ZeCopyCommandListCache) {
    for (ze_command_list_handle_t &ZeCommandList : List.second) {
      if (ZeCommandList)
        ZE_CALL(zeCommandListDestroy, (ZeCommandList));
    }
  }
  return PI_SUCCESS;
}

bool pi_command_list_info_t::isCopy(pi_queue Queue) const {
  return ZeQueueGroupOrdinal !=
         (uint32_t)Queue->Device
             ->QueueGroup[_pi_device::queue_group_info_t::type::Compute]
             .ZeOrdinal;
}

bool _pi_queue::isInOrderQueue() const {
  // If the out-of-order queue property is not set, then this is an in-order
  // queue.
  return ((this->Properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0);
}

bool _pi_queue::isDiscardEvents() const {
  return ((this->Properties & PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS) != 0);
}

pi_result
_pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
                            bool MakeAvailable,
                            std::vector<pi_event> &EventListToCleanup) {
  bool UseCopyEngine = CommandList->second.isCopy(this);

  // Immediate commandlists do not have an associated fence.
  if (CommandList->second.ZeFence != nullptr) {
    // Fence had been signalled meaning the associated command-list completed.
    // Reset the fence and put the command list into a cache for reuse in PI
    // calls.
    ZE_CALL(zeFenceReset, (CommandList->second.ZeFence));
    ZE_CALL(zeCommandListReset, (CommandList->first));
    CommandList->second.ZeFenceInUse = false;
  }

  auto &EventList = CommandList->second.EventList;
  // Remember all the events in this command list which need to be
  // released/cleaned up and clear the event list associated with the command
  // list.
  std::move(std::begin(EventList), std::end(EventList),
            std::back_inserter(EventListToCleanup));
  EventList.clear();

  // Standard commandlists move in and out of the cache as they are recycled.
  // Immediate commandlists are always available.
  if (CommandList->second.ZeFence != nullptr && MakeAvailable) {
    std::scoped_lock<pi_mutex> Lock(this->Context->ZeCommandListCacheMutex);
    auto &ZeCommandListCache =
        UseCopyEngine
            ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
            : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];
    ZeCommandListCache.push_back(CommandList->first);
  }

  return PI_SUCCESS;
}

// Configuration of the command-list batching.
typedef struct CommandListBatchConfig {
  // Default value of 0. This specifies to use dynamic batch size adjustment.
  // Other values will try to collect the specified amount of commands.
  pi_uint32 Size{0};

  // If doing dynamic batching, specifies the start batch size.
  pi_uint32 DynamicSizeStart{4};

  // The maximum size for dynamic batch.
  pi_uint32 DynamicSizeMax{100};

  // The step size for dynamic batch increases.
  pi_uint32 DynamicSizeStep{1};

  // Thresholds for when to increase the batch size (number of closed early is
  // small and number of closed full is high).
  pi_uint32 NumTimesClosedEarlyThreshold{3};
  pi_uint32 NumTimesClosedFullThreshold{8};

  // Tells the starting size of a batch.
  pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
  // Tells if we are doing dynamic batch size adjustment.
  bool dynamic() const { return Size == 0; }
} zeCommandListBatchConfig;

// Helper function to initialize static variables that hold batch config info
// for compute and copy command batching.
static zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
  zeCommandListBatchConfig Config{}; // default initialize

  // Default value of 0. This specifies to use dynamic batch size adjustment.
  const auto BatchSizeStr =
      (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
               : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
  if (BatchSizeStr) {
    pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
    // Level Zero may only support a limited number of commands per command
    // list. The actual upper limit is not specified by the Level Zero
    // Specification. For now we allow an arbitrary upper limit.
    if (BatchSizeStrVal > 0) {
      Config.Size = BatchSizeStrVal;
    } else if (BatchSizeStrVal == 0) {
      Config.Size = 0;
      // We are requested to do dynamic batching. Collect specifics, if any.
      // The extended format supported is ":" separated values.
      //
      // NOTE: these extra settings are experimental and are intended to
      // be used only for finding a better default heuristic.
      //
      std::string BatchConfig(BatchSizeStr);
      size_t Ord = 0;
      size_t Pos = 0;
      while (true) {
        if (++Ord > 5)
          break;

        Pos = BatchConfig.find(":", Pos);
        if (Pos == std::string::npos)
          break;
        ++Pos; // past the ":"

        pi_uint32 Val;
        try {
          Val = std::stoi(BatchConfig.substr(Pos));
        } catch (...) {
          if (IsCopy)
            zePrint(
                "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
          else
            zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
          break;
        }
        switch (Ord) {
        case 1:
          Config.DynamicSizeStart = Val;
          break;
        case 2:
          Config.DynamicSizeMax = Val;
          break;
        case 3:
          Config.DynamicSizeStep = Val;
          break;
        case 4:
          Config.NumTimesClosedEarlyThreshold = Val;
          break;
        case 5:
          Config.NumTimesClosedFullThreshold = Val;
          break;
        default:
          die("Unexpected batch config");
        }
        if (IsCopy)
          zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
                  "#%d: %d\n",
                  (int)Ord, (int)Val);
        else
          zePrint(
              "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
              (int)Ord, (int)Val);
      };

    } else {
      // Negative batch sizes are silently ignored.
      if (IsCopy)
        zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
      else
        zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
    }
  }
  return Config;
}
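
// For example, SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0:16:64:4:3:10 requests dynamic
// batching that starts at 16 commands, grows in steps of 4 up to a maximum of
// 64, with closed-early/closed-full thresholds of 3 and 10 respectively.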

// Static variable that holds batch config info for compute command batching.
static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] {
  using IsCopy = bool;
  return ZeCommandListBatchConfig(IsCopy{false});
}();

// Static variable that holds batch config info for copy command batching.
static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] {
  using IsCopy = bool;
  return ZeCommandListBatchConfig(IsCopy{true});
}();

_pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
                     std::vector<ze_command_queue_handle_t> &CopyQueues,
                     pi_context Context, pi_device Device,
                     bool OwnZeCommandQueue,
                     pi_queue_properties PiQueueProperties)
    : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
      Properties(PiQueueProperties) {

  // Compute group initialization.
  // First, see if the queue's device allows for round-robin or it is
  // fixed to one particular compute CCS (it is so for sub-sub-devices).
  auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute];
  ComputeQueueGroup.ZeQueues = ComputeQueues;
  if (ComputeQueueGroupInfo.ZeIndex >= 0) {
    ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex;
    ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex;
    ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex;
  } else {
    // Set up to round-robin across the allowed range of engines.
    uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines().first;
    uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines().second;
    FilterUpperIndex = std::min((size_t)FilterUpperIndex,
                                FilterLowerIndex + ComputeQueues.size() - 1);
    if (FilterLowerIndex <= FilterUpperIndex) {
      ComputeQueueGroup.LowerIndex = FilterLowerIndex;
      ComputeQueueGroup.UpperIndex = FilterUpperIndex;
      ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
      // Create space to hold immediate commandlists corresponding to the
      // ZeQueues.
      if (Device->useImmediateCommandLists()) {
        ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
            ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
      }
    } else {
      die("No compute queue available/allowed.");
    }
  }

  // Copy group initialization.
  if (getRangeOfAllowedCopyEngines(Device).first < 0 ||
      getRangeOfAllowedCopyEngines(Device).second < 0) {
    // We are asked not to use copy engines, just do nothing.
    // Leave CopyQueueGroup.ZeQueues empty, and it won't be used.
  } else {
    uint32_t FilterLowerIndex = getRangeOfAllowedCopyEngines(Device).first;
    uint32_t FilterUpperIndex = getRangeOfAllowedCopyEngines(Device).second;
    FilterUpperIndex = std::min((size_t)FilterUpperIndex,
                                FilterLowerIndex + CopyQueues.size() - 1);
    if (FilterLowerIndex <= FilterUpperIndex) {
      CopyQueueGroup.ZeQueues = CopyQueues;
      CopyQueueGroup.LowerIndex = FilterLowerIndex;
      CopyQueueGroup.UpperIndex = FilterUpperIndex;
      CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex;
      // Create space to hold immediate commandlists corresponding to the
      // ZeQueues.
      if (Device->useImmediateCommandLists()) {
        CopyQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
            CopyQueueGroup.ZeQueues.size(), CommandListMap.end());
      }
    }
  }

  // Initialize compute/copy command batches.
  ComputeCommandBatch.OpenCommandList = CommandListMap.end();
  CopyCommandBatch.OpenCommandList = CommandListMap.end();
  ComputeCommandBatch.QueueBatchSize =
      ZeCommandListBatchComputeConfig.startSize();
  CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize();
}

static pi_result CleanupCompletedEvent(pi_event Event,
                                       bool QueueLocked = false);

// Helper function to perform the necessary cleanup of the events from a reset
// cmd list.
static pi_result
CleanupEventListFromResetCmdList(std::vector<pi_event> &EventListToCleanup,
                                 bool QueueLocked = false) {
  for (auto Event : EventListToCleanup) {
    // We don't need to synchronize the events since the fence associated with
    // the command list was synchronized.
    {
      std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
      Event->Completed = true;
    }
    PI_CALL(CleanupCompletedEvent(Event, QueueLocked));
    // This event was removed from the command list, so decrement its ref
    // count (it was incremented when it was added to the command list).
    PI_CALL(piEventReleaseInternal(Event));
  }
  return PI_SUCCESS;
}

// Reset signalled command lists in the queue and put them to the cache of
// command lists. A caller must not lock the queue mutex.
pi_result resetCommandLists(pi_queue Queue) {
  // We need events to be cleaned up out of the scope where the queue is
  // locked, to avoid nested locks, because event cleanup requires the event
  // to be locked. Nested locks are hard to control and can cause deadlocks
  // if mutexes are locked in different order.
  std::vector<pi_event> EventListToCleanup;
  {
    // We check for command lists that have been already signalled, but have
    // not been added to the available list yet. Each command list has a fence
    // associated which tracks if a command list has completed dispatch of its
    // commands and is ready for reuse. If a command list is found to have
    // been signalled, then the command list & fence are reset and the command
    // list is returned to the command list cache. All events associated with
    // the command list are cleaned up if the command list was reset.
    std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
    for (auto &&it = Queue->CommandListMap.begin();
         it != Queue->CommandListMap.end(); ++it) {
      // Immediate commandlists don't use a fence but still need reset.
      if (it->second.ZeFence == nullptr) {
        PI_CALL(Queue->resetCommandList(it, true, EventListToCleanup));
      } else {
        // It is possible that the fence was already noted as signalled and
        // reset. In that case the ZeFenceInUse flag will be false.
        if (it->second.ZeFenceInUse) {
          ze_result_t ZeResult =
              ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
          if (ZeResult == ZE_RESULT_SUCCESS)
            PI_CALL(Queue->resetCommandList(it, true, EventListToCleanup));
        }
      }
    }
  }
  CleanupEventListFromResetCmdList(EventListToCleanup);
  return PI_SUCCESS;
}

// Retrieve an available command list to be used in a PI call.
pi_result _pi_context::getAvailableCommandList(
    pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
    bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {
  // Immediate commandlists have been pre-allocated and are always available.
  if (Queue->Device->useImmediateCommandLists()) {
    CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
    if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
      return Res;
    return PI_SUCCESS;
  }

  auto &CommandBatch =
      UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
  // Handle batching of commands.
  // First see if there is a command-list open for batching commands
  // for this queue.
  if (Queue->hasOpenCommandList(UseCopyEngine)) {
    if (AllowBatching &&
        (!ForcedCmdQueue ||
         *ForcedCmdQueue == CommandBatch.OpenCommandList->second.ZeQueue)) {
      CommandList = CommandBatch.OpenCommandList;
      return PI_SUCCESS;
    }
    // If this command isn't allowed to be batched or doesn't match the forced
    // command queue, then we need to go ahead and execute what is already in
    // the batched list, and then go on to process this. On exit from
    // executeOpenCommandList, OpenCommandList will be invalidated.
    if (auto Res = Queue->executeOpenCommandList(UseCopyEngine))
      return Res;
    // Note that active barriers do not need to be inserted here as they will
    // have been enqueued into the command-list when they were created.
  }

  // Create/Reuse the command list, because in Level Zero commands are added
  // to command lists, which are later submitted to the command queue.
  // Each command list is paired with an associated fence to track when the
  // command list is available for reuse.
  _pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES;

  // Initially, we need to check if a command list has already been created
  // on this device that is available for use. If so, then reuse that
  // Level-Zero Command List and Fence for this PI call.
  {
    // Make sure to acquire the lock before checking the size, or there
    // will be a race condition.
    std::scoped_lock<pi_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
    // Under the mutex since operator[] does insertion on the first usage for
    // every unique ZeDevice.
    auto &ZeCommandListCache =
        UseCopyEngine
            ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
            : Queue->Context
                  ->ZeComputeCommandListCache[Queue->Device->ZeDevice];

    for (auto ZeCommandListIt = ZeCommandListCache.begin();
         ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
      auto &ZeCommandList = *ZeCommandListIt;
      auto it = Queue->CommandListMap.find(ZeCommandList);
      if (it != Queue->CommandListMap.end()) {
        if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
          continue;
        CommandList = it;
        if (CommandList->second.ZeFence != nullptr)
          CommandList->second.ZeFenceInUse = true;
      } else {
        // If there is a command list available on this context, but it
        // wasn't yet used in this queue then create a new entry in this
        // queue's map to hold the fence and other associated command
        // list information.
        auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
        uint32_t QueueGroupOrdinal;
        auto &ZeCommandQueue = ForcedCmdQueue
                                   ? *ForcedCmdQueue
                                   : QGroup.getZeQueue(&QueueGroupOrdinal);
        if (ForcedCmdQueue)
          QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);

        ze_fence_handle_t ZeFence;
        ZeStruct<ze_fence_desc_t> ZeFenceDesc;
        ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
        CommandList =
            Queue->CommandListMap
                .emplace(ZeCommandList,
                         pi_command_list_info_t{ZeFence, true, ZeCommandQueue,
                                                QueueGroupOrdinal})
                .first;
      }
      ZeCommandListCache.erase(ZeCommandListIt);
      if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
        return Res;
      return PI_SUCCESS;
    }
  }

  // If there are no available command lists in the cache, then we check for
  // command lists that have already signalled, but have not been added to the
  // available list yet. Each command list has a fence associated which tracks
  // if a command list has completed dispatch of its commands and is ready for
  // reuse. If a command list is found to have been signalled, then the
  // command list & fence are reset and we return.
  for (auto it = Queue->CommandListMap.begin();
       it != Queue->CommandListMap.end(); ++it) {
    // Make sure this is the command list type needed.
    if (UseCopyEngine != it->second.isCopy(Queue))
      continue;

    ze_result_t ZeResult =
        ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
    if (ZeResult == ZE_RESULT_SUCCESS) {
      std::vector<pi_event> EventListToCleanup;
      Queue->resetCommandList(it, false, EventListToCleanup);
      CleanupEventListFromResetCmdList(EventListToCleanup,
                                       true /* QueueLocked */);
      CommandList = it;
      CommandList->second.ZeFenceInUse = true;
      return PI_SUCCESS;
    }
  }

  // If there are no available command lists nor signalled command lists,
  // then we must create another command list.
  pi_result = Queue->createCommandList(UseCopyEngine, CommandList);
  CommandList->second.ZeFenceInUse = true;
  return pi_result;
}

// Helper function to create a new command-list to this queue and associated
// fence tracking its completion. This command list & fence are added to the
// map of command lists in this queue with ZeFenceInUse = false.
// The caller must hold a lock of the queue already.
pi_result
_pi_queue::createCommandList(bool UseCopyEngine,
                             pi_command_list_ptr_t &CommandList,
                             ze_command_queue_handle_t *ForcedCmdQueue) {

  ze_fence_handle_t ZeFence;
  ZeStruct<ze_fence_desc_t> ZeFenceDesc;
  ze_command_list_handle_t ZeCommandList;

  uint32_t QueueGroupOrdinal;
  auto &QGroup = getQueueGroup(UseCopyEngine);
  auto &ZeCommandQueue =
      ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal);
  if (ForcedCmdQueue)
    QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);

  ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
  ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;

  ZE_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice,
                                &ZeCommandListDesc, &ZeCommandList));

  ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
  std::tie(CommandList, std::ignore) = CommandListMap.insert(
      std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
          ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal}));

  PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine));
  return PI_SUCCESS;
}

void _pi_queue::adjustBatchSizeForFullBatch(bool UseCopyEngine) {
  auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
  auto &ZeCommandListBatchConfig =
      UseCopyEngine ? ZeCommandListBatchCopyConfig
                    : ZeCommandListBatchComputeConfig;
  pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
  // QueueBatchSize of 0 means never allow batching.
  if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic())
    return;
  CommandBatch.NumTimesClosedFull += 1;

  // If the number of times the list has been closed early is low, and
  // the number of times it has been closed full is high, then raise
  // the batching size slowly. Don't raise it if it is already pretty
  // high.
  if (CommandBatch.NumTimesClosedEarly <=
          ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold &&
      CommandBatch.NumTimesClosedFull >
          ZeCommandListBatchConfig.NumTimesClosedFullThreshold) {
    if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) {
      QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep;
      zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
    }
    CommandBatch.NumTimesClosedEarly = 0;
    CommandBatch.NumTimesClosedFull = 0;
  }
}
1493 
1494 void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) {
1495   auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1496   auto &ZeCommandListBatchConfig =
1497       IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig;
1498   pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
1499   // QueueBatchSize of 0 means never allow batching.
1500   if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic())
1501     return;
1502   CommandBatch.NumTimesClosedEarly += 1;
1503 
1504   // If we are closing early more than about 3x the number of times
1505   // it is closing full, lower the batch size to the value of the
1506   // current open command list. This is trying to quickly get to a
1507   // batch size that will be able to be closed full at least once
1508   // in a while.
1509   if (CommandBatch.NumTimesClosedEarly >
1510       (CommandBatch.NumTimesClosedFull + 1) * 3) {
1511     QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1;
1512     if (QueueBatchSize < 1)
1513       QueueBatchSize = 1;
1514     zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
1515     CommandBatch.NumTimesClosedEarly = 0;
1516     CommandBatch.NumTimesClosedFull = 0;
1517   }
1518 }
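// Worked example of the lowering heuristic (numbers assumed for
// illustration): with NumTimesClosedEarly = 7 and NumTimesClosedFull = 1,
// 7 > (1 + 1) * 3 holds, so QueueBatchSize drops to the size of the
// currently open command list, clamped to at least 1.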
1519 
1520 void _pi_queue::CaptureIndirectAccesses() {
1521   for (auto &Kernel : KernelsToBeSubmitted) {
1522  if (!Kernel->hasIndirectAccess())
1523  continue;
1524 
1525  auto &Contexts = Device->Platform->Contexts;
1526  for (auto &Ctx : Contexts) {
1527  for (auto &Elem : Ctx->MemAllocs) {
1528  const auto &Pair = Kernel->MemAllocs.insert(&Elem);
1529  // Kernel is referencing this memory allocation from now.
1530   // If this memory allocation was already captured for this kernel, it
1531   // means that the kernel was submitted several times. Increase the
1532   // reference count only once because we release all allocations only
1533   // when SubmissionsCount drops to 0. We don't need to track how many
1534   // times the allocation was retained by each submission.
1535  if (Pair.second)
1536  Elem.second.RefCount.increment();
1537  }
1538  }
1539  Kernel->SubmissionsCount++;
1540  }
1541  KernelsToBeSubmitted.clear();
1542 }
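// Illustrative walkthrough (assumed numbers): if a kernel with indirect
// access is submitted twice before it completes, the second submission finds
// each allocation already present in Kernel->MemAllocs, so every RefCount is
// incremented exactly once while SubmissionsCount becomes 2.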
1543 
1544 pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
1545  bool IsBlocking,
1546  bool OKToBatchCommand) {
1547  bool UseCopyEngine = CommandList->second.isCopy(this);
1548 
1549  // If the current LastCommandEvent is the nullptr, then it means
1550  // either that no command has ever been issued to the queue
1551  // or it means that the LastCommandEvent has been signalled and
1552  // therefore that this Queue is idle.
1553  //
1554   // NOTE: this behavior adds some flakiness to the batching
1555  // since last command's event may or may not be completed by the
1556  // time we get here depending on timings and system/gpu load.
1557  // So, disable it for modes where we print PI traces. Printing
1558  // traces incurs much different timings than real execution
1559   // anyway, and many regression tests use it.
1560  //
1561  bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr;
1562 
1563  // The list can be empty if command-list only contains signals of proxy
1564  // events.
1565  if (!CommandList->second.EventList.empty())
1566  this->LastCommandEvent = CommandList->second.EventList.back();
1567 
1568  if (!Device->useImmediateCommandLists()) {
1569  // Batch if allowed to, but don't batch if we know there are no kernels
1570  // from this queue that are currently executing. This is intended to get
1571  // kernels started as soon as possible when there are no kernels from this
1572  // queue awaiting execution, while allowing batching to occur when there
1573  // are kernels already executing. Also, if we are using fixed size batching,
1574  // as indicated by !ZeCommandListBatch.dynamic(), then just ignore
1575  // CurrentlyEmpty as we want to strictly follow the batching the user
1576  // specified.
1577  auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
1578     auto &ZeCommandListBatchConfig = UseCopyEngine
1579         ? ZeCommandListBatchCopyConfig
1580         : ZeCommandListBatchComputeConfig;
1581  if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) &&
1582  (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) {
1583 
1584  if (hasOpenCommandList(UseCopyEngine) &&
1585  CommandBatch.OpenCommandList != CommandList)
1586       die("executeCommandList: OpenCommandList should be equal to "
1587           "null or CommandList");
1588 
1589  if (CommandList->second.size() < CommandBatch.QueueBatchSize) {
1590  CommandBatch.OpenCommandList = CommandList;
1591  return PI_SUCCESS;
1592  }
1593 
1594       adjustBatchSizeForFullBatch(UseCopyEngine);
1595       CommandBatch.OpenCommandList = CommandListMap.end();
1596     }
1597  }
1598 
1599  auto &ZeCommandQueue = CommandList->second.ZeQueue;
1600  // Scope of the lock must be till the end of the function, otherwise new mem
1601  // allocs can be created between the moment when we made a snapshot and the
1602  // moment when command list is closed and executed. But mutex is locked only
1603  // if indirect access tracking enabled, because std::defer_lock is used.
1604  // unique_lock destructor at the end of the function will unlock the mutex
1605  // if it was locked (which happens only if IndirectAccessTrackingEnabled is
1606  // true).
1607  std::unique_lock<pi_shared_mutex> ContextsLock(
1608  Device->Platform->ContextsMutex, std::defer_lock);
1609 
1610  if (IndirectAccessTrackingEnabled) {
1611  // We are going to submit kernels for execution. If indirect access flag is
1612  // set for a kernel then we need to make a snapshot of existing memory
1613  // allocations in all contexts in the platform. We need to lock the mutex
1614  // guarding the list of contexts in the platform to prevent creation of new
1615     // memory allocations in any context before we submit the kernel for
1616  // execution.
1617     ContextsLock.lock();
1618     CaptureIndirectAccesses();
1619   }
1620 
1621  if (!Device->useImmediateCommandLists()) {
1622  // In this mode all inner-batch events have device visibility only,
1623  // and we want the last command in the batch to signal a host-visible
1624  // event that anybody waiting for any event in the batch will
1625  // really be using.
1626  // We need to create a proxy host-visible event only if the list of events
1627  // in the command list is not empty, otherwise we are going to just create
1628  // and remove proxy event right away and dereference deleted object
1629  // afterwards.
1630     if (Device->eventsScope() == LastCommandInBatchHostVisible &&
1631         !CommandList->second.EventList.empty()) {
1632  // If there are only internal events in the command list then we don't
1633  // need to create host proxy event.
1634  auto Result =
1635  std::find_if(CommandList->second.EventList.begin(),
1636  CommandList->second.EventList.end(),
1637  [](pi_event E) { return E->hasExternalRefs(); });
1638  if (Result != CommandList->second.EventList.end()) {
1639  // Create a "proxy" host-visible event.
1640  //
1641  pi_event HostVisibleEvent;
1642  auto Res = createEventAndAssociateQueue(
1643  this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList,
1644  /* IsInternal */ false, /* ForceHostVisible */ true);
1645  if (Res)
1646  return Res;
1647 
1648  // Update each command's event in the command-list to "see" this
1649  // proxy event as a host-visible counterpart.
1650  for (auto &Event : CommandList->second.EventList) {
1651  std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
1652  // Internal event doesn't need host-visible proxy.
1653  if (!Event->hasExternalRefs())
1654  continue;
1655 
1656  if (!Event->HostVisibleEvent) {
1657  Event->HostVisibleEvent = HostVisibleEvent;
1658  HostVisibleEvent->RefCount.increment();
1659  }
1660  }
1661 
1662  // Decrement the reference count of the event such that all the
1663  // remaining references are from the other commands in this batch and
1664  // from the command-list itself. This host-visible event will not be
1665  // waited/released by SYCL RT, so it must be destroyed after all events
1666  // in the batch are gone. We know that refcount is more than 2 because
1667  // we check that EventList of the command list is not empty above, i.e.
1668  // after createEventAndAssociateQueue ref count is 2 and then +1 for
1669  // each event in the EventList.
1670  PI_CALL(piEventReleaseInternal(HostVisibleEvent));
1671  PI_CALL(piEventReleaseInternal(HostVisibleEvent));
1672 
1673  // Indicate no cleanup is needed for this PI event as it is special.
1674  HostVisibleEvent->CleanedUp = true;
1675 
1676  // Finally set to signal the host-visible event at the end of the
1677  // command-list.
1678  // TODO: see if we need a barrier here (or explicit wait for all events
1679  // in the batch).
1680  ZE_CALL(zeCommandListAppendSignalEvent,
1681  (CommandList->first, HostVisibleEvent->ZeEvent));
1682  }
1683  }
1684 
1685  // Close the command list and have it ready for dispatch.
1686  ZE_CALL(zeCommandListClose, (CommandList->first));
1687  // Offload command list to the GPU for asynchronous execution
1688  auto ZeCommandList = CommandList->first;
1689  auto ZeResult = ZE_CALL_NOCHECK(
1690  zeCommandQueueExecuteCommandLists,
1691  (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));
1692  if (ZeResult != ZE_RESULT_SUCCESS) {
1693  this->Healthy = false;
1694  if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1695  // Turn into a more informative end-user error.
1696  return PI_ERROR_COMMAND_EXECUTION_FAILURE;
1697  }
1698  return mapError(ZeResult);
1699  }
1700  }
1701 
1702  // Check global control to make every command blocking for debugging.
1703   if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
1704     if (Device->useImmediateCommandLists()) {
1705       synchronize();
1706  } else {
1707  // Wait until command lists attached to the command queue are executed.
1708  ZE_CALL(zeHostSynchronize, (ZeCommandQueue));
1709  }
1710  }
1711  return PI_SUCCESS;
1712 }
1713 
1714 bool _pi_queue::isBatchingAllowed(bool IsCopy) const {
1715   auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1716   return (CommandBatch.QueueBatchSize > 0 &&
1717  ((ZeSerialize & ZeSerializeBlock) == 0));
1718 }
1719 
1720 // Return the index of the next queue to use based on a
1721 // round robin strategy and the queue group ordinal.
1722 uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal,
1723  uint32_t *QueueIndex) {
1724 
1725  auto CurrentIndex = NextIndex;
1726  ++NextIndex;
1727   if (NextIndex > UpperIndex)
1728     NextIndex = LowerIndex;
1729 
1730  // Find out the right queue group ordinal (first queue might be "main" or
1731  // "link")
1732  auto QueueType = Type;
1733  if (QueueType != queue_type::Compute)
1734  QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine())
1735  ? queue_type::MainCopy
1736  : queue_type::LinkCopy;
1737 
1738  *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal;
1739  // Adjust the index to the L0 queue group since we represent "main" and
1740  // "link"
1741  // L0 groups with a single copy group ("main" would take "0" index).
1742  auto ZeCommandQueueIndex = CurrentIndex;
1743  if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) {
1744  ZeCommandQueueIndex -= 1;
1745  }
1746  *QueueIndex = ZeCommandQueueIndex;
1747 
1748  return CurrentIndex;
1749 }
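// Worked example (assumed configuration): a copy group with LowerIndex = 0,
// UpperIndex = 2 and a main copy engine hands out indices 0, 1, 2, 0, ...
// Index 0 maps to the MainCopy group with L0 queue index 0, while indices
// 1 and 2 map to the LinkCopy group with L0 queue indices 0 and 1, since
// "main" consumes index 0 of the plugin's combined range.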
1750 
1751 int32_t _pi_queue::pi_queue_group_t::getCmdQueueOrdinal(
1752     ze_command_queue_handle_t CmdQueue) {
1753  // Find out the right queue group ordinal (first queue might be "main" or
1754  // "link")
1755  auto QueueType = Type;
1756  if (QueueType != queue_type::Compute)
1757  QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine())
1758  ? queue_type::MainCopy
1759  : queue_type::LinkCopy;
1760  return Queue->Device->QueueGroup[QueueType].ZeOrdinal;
1761 }
1762 
1763 // This function will return one of possibly multiple available native
1764 // queues and the value of the queue group ordinal.
1765 ze_command_queue_handle_t &
1766 _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) {
1767 
1768  // QueueIndex is the proper L0 index.
1769   // Index is the plugin's concept of index, with main and link copy engines in
1770  // one range.
1771  uint32_t QueueIndex;
1772  auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex);
1773 
1774  ze_command_queue_handle_t &ZeQueue = ZeQueues[Index];
1775  if (ZeQueue)
1776  return ZeQueue;
1777 
1778  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
1779  ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal;
1780  ZeCommandQueueDesc.index = QueueIndex;
1781  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1782 
1783  // Evaluate performance of explicit usage for "0" index.
1784  if (QueueIndex != 0) {
1785  ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1786  }
1787 
1788  zePrint("[getZeQueue]: create queue ordinal = %d, index = %d "
1789  "(round robin in [%d, %d])\n",
1790  ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1791  UpperIndex);
1792 
1793  auto ZeResult = ZE_CALL_NOCHECK(
1794  zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1795  &ZeCommandQueueDesc, &ZeQueue));
1796  if (ZeResult) {
1797  die("[L0] getZeQueue: failed to create queue");
1798  }
1799 
1800  return ZeQueue;
1801 }
1802 
1803 // This function will return one of possibly multiple available
1804 // immediate commandlists associated with this Queue.
1805 pi_command_list_ptr_t _pi_queue::pi_queue_group_t::getImmCmdList() {
1806 
1807  uint32_t QueueIndex, QueueOrdinal;
1808  auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
1809 
1810  if (ImmCmdLists[Index] != Queue->CommandListMap.end())
1811  return ImmCmdLists[Index];
1812 
1813  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
1814  ZeCommandQueueDesc.ordinal = QueueOrdinal;
1815  ZeCommandQueueDesc.index = QueueIndex;
1816  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1817 
1818  // Evaluate performance of explicit usage for "0" index.
1819  if (QueueIndex != 0) {
1820  ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1821  }
1822 
1823   zePrint("[getImmCmdList]: create queue ordinal = %d, index = %d "
1824  "(round robin in [%d, %d])\n",
1825  ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1826  UpperIndex);
1827 
1828  ze_command_list_handle_t ZeCommandList;
1829  ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
1830  (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1831  &ZeCommandQueueDesc, &ZeCommandList));
1832  ImmCmdLists[Index] =
1833  Queue->CommandListMap
1834  .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
1835  ZeCommandList, {nullptr, true, nullptr, QueueOrdinal}})
1836  .first;
1837  // Add this commandlist to the cache so it can be destroyed as part of
1838  // piQueueReleaseInternal
1839  auto QueueType = Type;
1840  std::scoped_lock<pi_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
1841  auto &ZeCommandListCache =
1842  QueueType == queue_type::Compute
1843  ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
1844  : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
1845  ZeCommandListCache.push_back(ZeCommandList);
1846 
1847  return ImmCmdLists[Index];
1848 }
1849 
1850 pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) {
1851   using IsCopy = bool;
1852 
1853   if (Device->useImmediateCommandLists()) {
1854     // When using immediate commandlists there are no open command lists.
1855  return CommandListMap.end();
1856  }
1857 
1858  const auto &ComputeEventList =
1859  ComputeCommandBatch.OpenCommandList->second.EventList;
1860  if (hasOpenCommandList(IsCopy{false}) &&
1861  std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) !=
1862           ComputeEventList.end()) {
1863     return ComputeCommandBatch.OpenCommandList;
1864   }
1865  const auto &CopyEventList =
1866  CopyCommandBatch.OpenCommandList->second.EventList;
1867  if (hasOpenCommandList(IsCopy{true}) &&
1868  std::find(CopyEventList.begin(), CopyEventList.end(), Event) !=
1869           CopyEventList.end()) {
1870     return CopyCommandBatch.OpenCommandList;
1871   }
1872  return CommandListMap.end();
1873 }
1874 
1875 pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t CmdList,
1876                                           bool UseCopyEngine) {
1877  // Early exit if there are no active barriers.
1878  if (ActiveBarriers.empty())
1879  return PI_SUCCESS;
1880 
1881  // Create a wait-list and retain events. This will filter out finished events.
1882  _pi_ze_event_list_t ActiveBarriersWaitList;
1883  if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList(
1884  ActiveBarriers.size(), ActiveBarriers.data(), this, UseCopyEngine))
1885  return Res;
1886 
1887  // We can now release all the active barriers and replace them with the ones
1888  // in the wait list.
1889  for (pi_event &BarrierEvent : ActiveBarriers)
1890  PI_CALL(piEventReleaseInternal(BarrierEvent));
1891  ActiveBarriers.clear();
1892  ActiveBarriers.insert(
1893  ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList,
1894  ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length);
1895 
1896   // If any active barriers remain, insert a barrier into the command-list.
1897   // We do not need an event for finishing so we pass nullptr.
1898  if (!ActiveBarriers.empty())
1899  ZE_CALL(zeCommandListAppendBarrier,
1900  (CmdList->first, nullptr, ActiveBarriersWaitList.Length,
1901  ActiveBarriersWaitList.ZeEventList));
1902  return PI_SUCCESS;
1903 }
1904 
1905 pi_result _pi_queue::executeOpenCommandList(bool IsCopy) {
1906   auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1907   // If there are any commands still in the open command list for this
1908   // queue, then close and execute that command list now.
1909   if (hasOpenCommandList(IsCopy)) {
1910     adjustBatchSizeForPartialBatch(IsCopy);
1911     auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false);
1912     CommandBatch.OpenCommandList = CommandListMap.end();
1913     return Res;
1914   }
1915 
1916  return PI_SUCCESS;
1917 }
1918 
1919 static const bool FilterEventWaitList = [] {
1920  const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST");
1921  const bool RetVal = Ret ? std::stoi(Ret) : 1;
1922  return RetVal;
1923 }();
1924 
1925 pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
1926     pi_uint32 EventListLength, const pi_event *EventList, pi_queue CurQueue,
1927  bool UseCopyEngine) {
1928  this->Length = 0;
1929  this->ZeEventList = nullptr;
1930  this->PiEventList = nullptr;
1931 
1932  try {
1933  if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) {
1934  this->ZeEventList = new ze_event_handle_t[EventListLength + 1];
1935  this->PiEventList = new pi_event[EventListLength + 1];
1936  } else if (EventListLength > 0) {
1937  this->ZeEventList = new ze_event_handle_t[EventListLength];
1938  this->PiEventList = new pi_event[EventListLength];
1939  }
1940 
1941  pi_uint32 TmpListLength = 0;
1942 
1943  if (EventListLength > 0) {
1944  for (pi_uint32 I = 0; I < EventListLength; I++) {
1945  PI_ASSERT(EventList[I] != nullptr, PI_ERROR_INVALID_VALUE);
1946  {
1947  std::shared_lock<pi_shared_mutex> Lock(EventList[I]->Mutex);
1948  if (EventList[I]->Completed)
1949  continue;
1950 
1951         // Poll the host-visible event, if any.
1952  auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
1953  if (FilterEventWaitList && HostVisibleEvent) {
1954  auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus,
1955  (HostVisibleEvent->ZeEvent));
1956  if (Res == ZE_RESULT_SUCCESS) {
1957  // Event has already completed, don't put it into the list
1958  continue;
1959  }
1960  }
1961  }
1962 
1963  auto Queue = EventList[I]->Queue;
1964  if (Queue) {
1965  // The caller of createAndRetainPiZeEventList must already hold
1966  // a lock of the CurQueue. Additionally lock the Queue if it
1967  // is different from CurQueue.
1968  // TODO: rework this to avoid deadlock when another thread is
1969  // locking the same queues but in a different order.
1970  auto Lock = ((Queue == CurQueue)
1971  ? std::unique_lock<pi_shared_mutex>()
1972  : std::unique_lock<pi_shared_mutex>(Queue->Mutex));
1973 
1974  // If the event that is going to be waited is in an open batch
1975  // different from where this next command is going to be added,
1976  // then we have to force execute of that open command-list
1977  // to avoid deadlocks.
1978  //
1979  const auto &OpenCommandList =
1980  Queue->eventOpenCommandList(EventList[I]);
1981  if (OpenCommandList != Queue->CommandListMap.end()) {
1982 
1983  if (Queue == CurQueue &&
1984  OpenCommandList->second.isCopy(Queue) == UseCopyEngine) {
1985  // Don't force execute the batch yet since the new command
1986  // is going to the same open batch as the dependent event.
1987  } else {
1988  if (auto Res = Queue->executeOpenCommandList(
1989  OpenCommandList->second.isCopy(Queue)))
1990  return Res;
1991  }
1992  }
1993  } else {
1994  // There is a dependency on an interop-event.
1995         // Similarly to the above, to avoid deadlocks, ensure that
1996  // execution of all prior commands in the current command-
1997  // batch is visible to the host. This may not be the case
1998  // when we intended to have only last command in the batch
1999  // produce host-visible event, e.g.
2000  //
2001  // event0 = interop event
2002  // event1 = command1 (already in batch, no deps)
2003  // event2 = command2 (is being added, dep on event0)
2004  // event3 = signal host-visible event for the batch
2005  // event1.wait()
2006  // event0.signal()
2007  //
2008  // Make sure that event1.wait() will wait for a host-visible
2009  // event that is signalled before the command2 is enqueued.
2010  if (CurQueue->Device->eventsScope() != AllHostVisible) {
2011  CurQueue->executeAllOpenCommandLists();
2012  }
2013  }
2014 
2015  std::shared_lock<pi_shared_mutex> Lock(EventList[I]->Mutex);
2016  this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent;
2017  this->PiEventList[TmpListLength] = EventList[I];
2018  TmpListLength += 1;
2019  }
2020  }
2021 
2022  // For in-order queues, every command should be executed only after the
2023  // previous command has finished. The event associated with the last
2024  // enqueued command is added into the waitlist to ensure in-order semantics.
2025  if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) {
2026 
2027     // Ensure LastCommandEvent's batch is submitted if it is different
2028  // from the one this command is going to.
2029  const auto &OpenCommandList =
2030  CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent);
2031  if (OpenCommandList != CurQueue->CommandListMap.end() &&
2032  OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) {
2033 
2034  if (auto Res = CurQueue->executeOpenCommandList(
2035  OpenCommandList->second.isCopy(CurQueue)))
2036  return Res;
2037  }
2038  std::shared_lock<pi_shared_mutex> Lock(CurQueue->LastCommandEvent->Mutex);
2039  this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent;
2040  this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent;
2041  TmpListLength += 1;
2042  }
2043 
2044  this->Length = TmpListLength;
2045 
2046  } catch (...) {
2047  return PI_ERROR_OUT_OF_HOST_MEMORY;
2048  }
2049 
2050  for (pi_uint32 I = 0; I < this->Length; I++) {
2051  this->PiEventList[I]->RefCount.increment();
2052  }
2053 
2054  return PI_SUCCESS;
2055 }
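// Illustrative sizing example (assumed numbers): for an in-order queue with
// a pending LastCommandEvent and EventListLength = 2, the arrays are sized
// for 3 entries and the final wait list becomes {EventList[0], EventList[1],
// LastCommandEvent}, preserving in-order semantics across engines.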
2056 
2057 static void printZeEventList(const _pi_ze_event_list_t &PiZeEventList) {
2058  zePrint(" NumEventsInWaitList %d:", PiZeEventList.Length);
2059 
2060  for (pi_uint32 I = 0; I < PiZeEventList.Length; I++) {
2061  zePrint(" %#lx", pi_cast<std::uintptr_t>(PiZeEventList.ZeEventList[I]));
2062  }
2063 
2064  zePrint("\n");
2065 }
2066 
2067 pi_result _pi_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList(
2068     std::list<pi_event> &EventsToBeReleased) {
2069   // Acquire a lock before reading the length and list fields.
2070   // Copy the needed data locally and reset the fields, then release
2071   // the lock.
2072   // Only then do we do the actual actions to release and destroy,
2073   // holding the lock for the minimum time necessary.
2074  pi_uint32 LocLength = 0;
2075  ze_event_handle_t *LocZeEventList = nullptr;
2076  pi_event *LocPiEventList = nullptr;
2077 
2078  {
2079  // acquire the lock and copy fields locally
2080  // Lock automatically releases when this goes out of scope.
2081  std::scoped_lock<pi_mutex> lock(this->PiZeEventListMutex);
2082 
2083  LocLength = Length;
2084  LocZeEventList = ZeEventList;
2085  LocPiEventList = PiEventList;
2086 
2087  Length = 0;
2088  ZeEventList = nullptr;
2089  PiEventList = nullptr;
2090 
2091  // release lock by ending scope.
2092  }
2093 
2094  for (pi_uint32 I = 0; I < LocLength; I++) {
2095  // Add the event to be released to the list
2096  EventsToBeReleased.push_back(LocPiEventList[I]);
2097  }
2098 
2099  if (LocZeEventList != nullptr) {
2100  delete[] LocZeEventList;
2101  }
2102  if (LocPiEventList != nullptr) {
2103  delete[] LocPiEventList;
2104  }
2105 
2106  return PI_SUCCESS;
2107 }
2108 
2109 extern "C" {
2110 
2111 // Forward declarations
2113 
2114 static ze_result_t
2115 checkUnresolvedSymbols(ze_module_handle_t ZeModule,
2116                        ze_module_build_log_handle_t *ZeBuildLog);
2117 
2118 // This function will ensure compatibility with both Linux and Windows for
2119 // setting environment variables.
2120 static bool setEnvVar(const char *name, const char *value) {
2121 #ifdef _WIN32
2122  int Res = _putenv_s(name, value);
2123 #else
2124  int Res = setenv(name, value, 1);
2125 #endif
2126  if (Res != 0) {
2127  zePrint(
2128  "Level Zero plugin was unable to set the environment variable: %s\n",
2129  name);
2130  return false;
2131  }
2132  return true;
2133 }
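// Usage sketch (illustrative only): callers below use this helper to force
// loader/driver behavior before zeInit, e.g.:
//
//   if (!setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"))
//     zePrint("could not enable the validation layer\n");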
2134 
2135 static class ZeUSMImportExtension {
2136  // Pointers to functions that import/release host memory into USM
2137  ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t hDriver,
2138  void *, size_t);
2139  ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t, void *);
2140 
2141 public:
2142  // Whether user has requested Import/Release, and platform supports it.
2143  bool Enabled;
2144 
2145  ZeUSMImportExtension() : Enabled{false} {}
2146 
2147  void setZeUSMImport(pi_platform Platform) {
2148  // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting
2149  // host ptr import during buffer creation.
2150  const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT");
2151  if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0)
2152  return;
2153 
2154  // Check if USM hostptr import feature is available.
2155  ze_driver_handle_t driverHandle = Platform->ZeDriver;
2156  if (ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress,
2157  (driverHandle, "zexDriverImportExternalPointer",
2158  reinterpret_cast<void **>(
2159                             &zexDriverImportExternalPointer))) == 0) {
2160       ZE_CALL_NOCHECK(
2161           zeDriverGetExtensionFunctionAddress,
2162  (driverHandle, "zexDriverReleaseImportedPointer",
2163  reinterpret_cast<void **>(&zexDriverReleaseImportedPointer)));
2164  // Hostptr import/release is turned on because it has been requested
2165  // by the env var, and this platform supports the APIs.
2166  Enabled = true;
2167  // Hostptr import is only possible if piMemBufferCreate receives a
2168  // hostptr as an argument. The SYCL runtime passes a host ptr
2169  // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on.
2170  setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1");
2171  }
2172  }
2173  void doZeUSMImport(ze_driver_handle_t driverHandle, void *HostPtr,
2174  size_t Size) {
2175  ZE_CALL_NOCHECK(zexDriverImportExternalPointer,
2176  (driverHandle, HostPtr, Size));
2177  }
2178  void doZeUSMRelease(ze_driver_handle_t driverHandle, void *HostPtr) {
2179  ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (driverHandle, HostPtr));
2180  }
2181 } ZeUSMImport;
2182 
2183 pi_result _pi_platform::initialize() {
2184   // Cache driver properties
2185  ZeStruct<ze_driver_properties_t> ZeDriverProperties;
2186  ZE_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties));
2187  uint32_t DriverVersion = ZeDriverProperties.driverVersion;
2188  // Intel Level-Zero GPU driver stores version as:
2189  // | 31 - 24 | 23 - 16 | 15 - 0 |
2190  // | Major | Minor | Build |
2191  auto VersionMajor = std::to_string((DriverVersion & 0xFF000000) >> 24);
2192  auto VersionMinor = std::to_string((DriverVersion & 0x00FF0000) >> 16);
2193  auto VersionBuild = std::to_string(DriverVersion & 0x0000FFFF);
2194  ZeDriverVersion = VersionMajor + "." + VersionMinor + "." + VersionBuild;
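  // Worked example (value assumed for illustration): DriverVersion =
  // 0x01050F2A decodes as Major = 0x01 = 1, Minor = 0x05 = 5 and
  // Build = 0x0F2A = 3882, so ZeDriverVersion becomes "1.5.3882".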
2195 
2196  ZE_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion));
2197  ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." +
2198  std::to_string(ZE_MINOR_VERSION(ZeApiVersion));
2199 
2200  // Cache driver extension properties
2201  uint32_t Count = 0;
2202  ZE_CALL(zeDriverGetExtensionProperties, (ZeDriver, &Count, nullptr));
2203 
2204  std::vector<ze_driver_extension_properties_t> zeExtensions(Count);
2205 
2206  ZE_CALL(zeDriverGetExtensionProperties,
2207  (ZeDriver, &Count, zeExtensions.data()));
2208 
2209  for (auto extension : zeExtensions) {
2210  // Check if global offset extension is available
2211  if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME,
2212  strlen(ZE_GLOBAL_OFFSET_EXP_NAME) + 1) == 0) {
2213  if (extension.version == ZE_GLOBAL_OFFSET_EXP_VERSION_1_0) {
2214         PiDriverGlobalOffsetExtensionFound = true;
2215       }
2216  }
2217  // Check if extension is available for "static linking" (compiling multiple
2218  // SPIR-V modules together into one Level Zero module).
2219  if (strncmp(extension.name, ZE_MODULE_PROGRAM_EXP_NAME,
2220  strlen(ZE_MODULE_PROGRAM_EXP_NAME) + 1) == 0) {
2221  if (extension.version == ZE_MODULE_PROGRAM_EXP_VERSION_1_0) {
2222         PiDriverModuleProgramExtensionFound = true;
2223       }
2224  }
2225  zeDriverExtensionMap[extension.name] = extension.version;
2226  }
2227 
2228  // Check if import user ptr into USM feature has been requested.
2229   // If yes, then set up L0 API pointers if the platform supports it.
2230   ZeUSMImport.setZeUSMImport(this);
2231 
2232  return PI_SUCCESS;
2233 }
2234 
2235 pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
2236                          pi_uint32 *NumPlatforms) {
2237 
2238  static const char *PiTrace = std::getenv("SYCL_PI_TRACE");
2239  static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0;
2240  if (PiTraceValue == -1 || PiTraceValue == 2) { // Means print all PI traces
2241  PrintPiTrace = true;
2242  }
2243 
2244  static std::once_flag ZeCallCountInitialized;
2245  try {
2246  std::call_once(ZeCallCountInitialized, []() {
2247  if (ZeDebug & ZE_DEBUG_CALL_COUNT) {
2248  ZeCallCount = new std::map<const char *, int>;
2249  }
2250  });
2251  } catch (const std::bad_alloc &) {
2252  return PI_ERROR_OUT_OF_HOST_MEMORY;
2253  } catch (...) {
2254  return PI_ERROR_UNKNOWN;
2255  }
2256 
2257  if (NumEntries == 0 && Platforms != nullptr) {
2258  return PI_ERROR_INVALID_VALUE;
2259  }
2260  if (Platforms == nullptr && NumPlatforms == nullptr) {
2261  return PI_ERROR_INVALID_VALUE;
2262  }
2263 
2264  // Setting these environment variables before running zeInit will enable the
2265  // validation layer in the Level Zero loader.
2266  if (ZeDebug & ZE_DEBUG_VALIDATION) {
2267  setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1");
2268  setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1");
2269  }
2270 
2271  // Enable SYSMAN support for obtaining the PCI address
2272  // and maximum memory bandwidth.
2273  if (getenv("SYCL_ENABLE_PCI") != nullptr) {
2274  setEnvVar("ZES_ENABLE_SYSMAN", "1");
2275  }
2276 
2277  // TODO: We can still safely recover if something goes wrong during the init.
2278  // Implement handling segfault using sigaction.
2279 
2280   // We must only initialize the driver once, even if piPlatformsGet() is
2281   // called multiple times. Declaring the return value as "static" ensures
2282   // that zeInit is called only once.
2283  static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0));
2284 
2285  // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms.
2286  if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
2287  PI_ASSERT(NumPlatforms != 0, PI_ERROR_INVALID_VALUE);
2288  *NumPlatforms = 0;
2289  return PI_SUCCESS;
2290  }
2291 
2292  if (ZeResult != ZE_RESULT_SUCCESS) {
2293  zePrint("zeInit: Level Zero initialization failure\n");
2294  return mapError(ZeResult);
2295  }
2296 
2297  // Cache pi_platforms for reuse in the future
2298   // It solves two problems:
2299   // 1. sycl::platform equality issue; we always return the same pi_platform.
2300   // 2. performance; we can save time by returning immediately from the cache.
2301  //
2302 
2303  const std::lock_guard<sycl::detail::SpinLock> Lock{*PiPlatformsCacheMutex};
2304  if (!PiPlatformCachePopulated) {
2305  try {
2306  // Level Zero does not have concept of Platforms, but Level Zero driver is
2307  // the closest match.
2308  uint32_t ZeDriverCount = 0;
2309  ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
2310  if (ZeDriverCount == 0) {
2311  PiPlatformCachePopulated = true;
2312  } else {
2313  std::vector<ze_driver_handle_t> ZeDrivers;
2314  ZeDrivers.resize(ZeDriverCount);
2315 
2316  ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data()));
2317  for (uint32_t I = 0; I < ZeDriverCount; ++I) {
2318  pi_platform Platform = new _pi_platform(ZeDrivers[I]);
2319  pi_result Result = Platform->initialize();
2320  if (Result != PI_SUCCESS) {
2321  return Result;
2322  }
2323  // Save a copy in the cache for future uses.
2324  PiPlatformsCache->push_back(Platform);
2325  }
2326  PiPlatformCachePopulated = true;
2327  }
2328  } catch (const std::bad_alloc &) {
2329  return PI_ERROR_OUT_OF_HOST_MEMORY;
2330  } catch (...) {
2331  return PI_ERROR_UNKNOWN;
2332  }
2333  }
2334 
2335  // Populate returned platforms from the cache.
2336  if (Platforms) {
2337  PI_ASSERT(NumEntries <= PiPlatformsCache->size(),
2338  PI_ERROR_INVALID_PLATFORM);
2339  std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms);
2340  }
2341 
2342  if (NumPlatforms) {
2343  *NumPlatforms = PiPlatformsCache->size();
2344  }
2345 
2346  return PI_SUCCESS;
2347 }
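// Typical two-call usage of piPlatformsGet (illustrative only):
//
//   pi_uint32 Count = 0;
//   PI_CALL(piPlatformsGet(0, nullptr, &Count));          // query the count
//   std::vector<pi_platform> Plts(Count);
//   PI_CALL(piPlatformsGet(Count, Plts.data(), nullptr)); // fetch handles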
2348 
2349 pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName,
2350                             size_t ParamValueSize, void *ParamValue,
2351  size_t *ParamValueSizeRet) {
2352 
2353  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2354 
2355  zePrint("==========================\n");
2356  zePrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str());
2357  zePrint("==========================\n");
2358 
2359  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2360 
2361  switch (ParamName) {
2362  case PI_PLATFORM_INFO_NAME:
2363  // TODO: Query Level Zero driver when relevant info is added there.
2364     return ReturnValue("Intel(R) Level-Zero");
2365   case PI_PLATFORM_INFO_VENDOR:
2366     // TODO: Query Level Zero driver when relevant info is added there.
2367     return ReturnValue("Intel(R) Corporation");
2368   case PI_PLATFORM_INFO_EXTENSIONS:
2369     // Convention adopted from OpenCL:
2370  // "Returns a space-separated list of extension names (the extension
2371  // names themselves do not contain any spaces) supported by the platform.
2372  // Extensions defined here must be supported by all devices associated
2373  // with this platform."
2374  //
2375  // TODO: Check the common extensions supported by all connected devices and
2376  // return them. For now, hardcoding some extensions we know are supported by
2377  // all Level Zero devices.
2378     return ReturnValue(ZE_SUPPORTED_EXTENSIONS);
2379   case PI_PLATFORM_INFO_PROFILE:
2380     // TODO: figure out what this means and how it is used
2381     return ReturnValue("FULL_PROFILE");
2382   case PI_PLATFORM_INFO_VERSION:
2383     // TODO: this should query zeDriverGetDriverVersion
2384  // but we don't yet have the driver handle here.
2385  //
2386  // From OpenCL 2.1: "This version string has the following format:
2387  // OpenCL<space><major_version.minor_version><space><platform-specific
2388  // information>. Follow the same notation here.
2389  //
2390  return ReturnValue(Platform->ZeDriverApiVersion.c_str());
2391  default:
2392  zePrint("piPlatformGetInfo: unrecognized ParamName\n");
2393  return PI_ERROR_INVALID_VALUE;
2394  }
2395 
2396  return PI_SUCCESS;
2397 }
2398 
2399 pi_result piextPlatformGetNativeHandle(pi_platform Platform,
2400                                        pi_native_handle *NativeHandle) {
2401  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2402  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2403 
2404  auto ZeDriver = pi_cast<ze_driver_handle_t *>(NativeHandle);
2405  // Extract the Level Zero driver handle from the given PI platform
2406  *ZeDriver = Platform->ZeDriver;
2407  return PI_SUCCESS;
2408 }
2409 
2410 pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
2411                                               pi_platform *Platform) {
2412  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2413  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2414 
2415  auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
2416 
2417  pi_uint32 NumPlatforms = 0;
2418  pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms);
2419  if (Res != PI_SUCCESS) {
2420  return Res;
2421  }
2422 
2423  if (NumPlatforms) {
2424  std::vector<pi_platform> Platforms(NumPlatforms);
2425  PI_CALL(piPlatformsGet(NumPlatforms, Platforms.data(), nullptr));
2426 
2427  // The SYCL spec requires that the set of platforms must remain fixed for
2428  // the duration of the application's execution. We assume that we found all
2429  // of the Level Zero drivers when we initialized the platform cache, so the
2430  // "NativeHandle" must already be in the cache. If it is not, this must not
2431  // be a valid Level Zero driver.
2432  for (const pi_platform &CachedPlatform : Platforms) {
2433  if (CachedPlatform->ZeDriver == ZeDriver) {
2434  *Platform = CachedPlatform;
2435  return PI_SUCCESS;
2436  }
2437  }
2438  }
2439 
2440  return PI_ERROR_INVALID_VALUE;
2441 }
2442 
2443 // Get the cached PI device created for the L0 device handle.
2444 // Return NULL if no such PI device found.
2445 pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
2446 
2447  pi_result Res = populateDeviceCacheIfNeeded();
2448  if (Res != PI_SUCCESS) {
2449  return nullptr;
2450  }
2451 
2452  // TODO: our sub-sub-device representation is currently [Level-Zero device
2453  // handle + Level-Zero compute group/engine index], so there is now no 1:1
2454  // mapping from L0 device handle to PI device assumed in this function. Until
2455  // Level-Zero adds unique ze_device_handle_t for sub-sub-devices, here we
2456  // filter out PI sub-sub-devices.
2457  std::shared_lock<pi_shared_mutex> Lock(PiDevicesCacheMutex);
2458  auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(),
2459  [&](std::unique_ptr<_pi_device> &D) {
2460  return D.get()->ZeDevice == ZeDevice &&
2461  (D.get()->RootDevice == nullptr ||
2462  D.get()->RootDevice->RootDevice == nullptr);
2463  });
2464  if (it != PiDevicesCache.end()) {
2465  return (*it).get();
2466  }
2467  return nullptr;
2468 }
2469 
2470 pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
2471                        pi_uint32 NumEntries, pi_device *Devices,
2472  pi_uint32 *NumDevices) {
2473 
2474  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2475 
2476  pi_result Res = Platform->populateDeviceCacheIfNeeded();
2477  if (Res != PI_SUCCESS) {
2478  return Res;
2479  }
2480 
2481  // Filter available devices based on input DeviceType.
2482  std::vector<pi_device> MatchedDevices;
2483  std::shared_lock<pi_shared_mutex> Lock(Platform->PiDevicesCacheMutex);
2484  for (auto &D : Platform->PiDevicesCache) {
2485  // Only ever return root-devices from piDevicesGet, but the
2486  // devices cache also keeps sub-devices.
2487  if (D->isSubDevice())
2488  continue;
2489 
2490  bool Matched = false;
2491  switch (DeviceType) {
2492  case PI_DEVICE_TYPE_ALL:
2493  Matched = true;
2494  break;
2495     case PI_DEVICE_TYPE_GPU:
2496     case PI_DEVICE_TYPE_DEFAULT:
2497       Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU);
2498  break;
2499  case PI_DEVICE_TYPE_CPU:
2500  Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_CPU);
2501  break;
2502  case PI_DEVICE_TYPE_ACC:
2503  Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_MCA ||
2504  D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_FPGA);
2505  break;
2506  default:
2507  Matched = false;
2508  zePrint("Unknown device type");
2509  break;
2510  }
2511  if (Matched)
2512  MatchedDevices.push_back(D.get());
2513  }
2514 
2515  uint32_t ZeDeviceCount = MatchedDevices.size();
2516 
2517  if (NumDevices)
2518  *NumDevices = ZeDeviceCount;
2519 
2520  if (NumEntries == 0) {
2521  // Devices should be nullptr when querying the number of devices.
2522  PI_ASSERT(Devices == nullptr, PI_ERROR_INVALID_VALUE);
2523  return PI_SUCCESS;
2524  }
2525 
2526  // Return the devices from the cache.
2527  if (Devices) {
2528  PI_ASSERT(NumEntries <= ZeDeviceCount, PI_ERROR_INVALID_DEVICE);
2529  std::copy_n(MatchedDevices.begin(), NumEntries, Devices);
2530  }
2531 
2532  return PI_SUCCESS;
2533 }
2534 
2535 // Check the device cache and load it if necessary.
2536 pi_result _pi_platform::populateDeviceCacheIfNeeded() {
2537   std::scoped_lock<pi_shared_mutex> Lock(PiDevicesCacheMutex);
2538 
2539  if (DeviceCachePopulated) {
2540  return PI_SUCCESS;
2541  }
2542 
2543  uint32_t ZeDeviceCount = 0;
2544  ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, nullptr));
2545 
2546  try {
2547  std::vector<ze_device_handle_t> ZeDevices(ZeDeviceCount);
2548  ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, ZeDevices.data()));
2549 
2550  for (uint32_t I = 0; I < ZeDeviceCount; ++I) {
2551  std::unique_ptr<_pi_device> Device(new _pi_device(ZeDevices[I], this));
2552  pi_result Result = Device->initialize();
2553  if (Result != PI_SUCCESS) {
2554  return Result;
2555  }
2556 
2557  // Additionally we need to cache all sub-devices too, such that they
2558  // are readily visible to the piextDeviceCreateWithNativeHandle.
2559  //
2560  pi_uint32 SubDevicesCount = 0;
2561  ZE_CALL(zeDeviceGetSubDevices,
2562  (Device->ZeDevice, &SubDevicesCount, nullptr));
2563 
2564  auto ZeSubdevices = new ze_device_handle_t[SubDevicesCount];
2565  ZE_CALL(zeDeviceGetSubDevices,
2566  (Device->ZeDevice, &SubDevicesCount, ZeSubdevices));
2567 
2568  // Wrap the Level Zero sub-devices into PI sub-devices, and add them to
2569  // cache.
2570  for (uint32_t I = 0; I < SubDevicesCount; ++I) {
2571  std::unique_ptr<_pi_device> PiSubDevice(
2572  new _pi_device(ZeSubdevices[I], this, Device.get()));
2573  pi_result Result = PiSubDevice->initialize();
2574  if (Result != PI_SUCCESS) {
2575  delete[] ZeSubdevices;
2576  return Result;
2577  }
2578 
2579  // collect all the ordinals for the sub-sub-devices
2580  std::vector<int> Ordinals;
2581 
2582  uint32_t numQueueGroups = 0;
2583  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
2584  (PiSubDevice->ZeDevice, &numQueueGroups, nullptr));
2585  if (numQueueGroups == 0) {
2586  return PI_ERROR_UNKNOWN;
2587  }
2588  std::vector<ze_command_queue_group_properties_t> QueueGroupProperties(
2589  numQueueGroups);
2590  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
2591  (PiSubDevice->ZeDevice, &numQueueGroups,
2592  QueueGroupProperties.data()));
2593 
2594  for (uint32_t i = 0; i < numQueueGroups; i++) {
2595  if (QueueGroupProperties[i].flags &
2596  ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE &&
2597  QueueGroupProperties[i].numQueues > 1) {
2598  Ordinals.push_back(i);
2599  }
2600  }
2601 
2602  // Create PI sub-sub-devices with the sub-device for all the ordinals.
2603  // Each {ordinal, index} points to a specific CCS which constructs
2604  // a sub-sub-device at this point.
2605  // FIXME: Level Zero creates multiple PiDevices for a single physical
2606  // device when sub-device is partitioned into sub-sub-devices.
2607  // Sub-sub-device is technically a command queue and we should not build
2608  // program for each command queue. PiDevice is probably not the right
2609  // abstraction for a Level Zero command queue.
2610  for (uint32_t J = 0; J < Ordinals.size(); ++J) {
2611  for (uint32_t K = 0; K < QueueGroupProperties[Ordinals[J]].numQueues;
2612  ++K) {
2613  std::unique_ptr<_pi_device> PiSubSubDevice(
2614  new _pi_device(ZeSubdevices[I], this, PiSubDevice.get()));
2615  pi_result Result = PiSubSubDevice->initialize(Ordinals[J], K);
2616  if (Result != PI_SUCCESS) {
2617  return Result;
2618  }
2619 
2620  // save pointers to sub-sub-devices for quick retrieval in the
2621  // future.
2622  PiSubDevice->SubDevices.push_back(PiSubSubDevice.get());
2623  PiDevicesCache.push_back(std::move(PiSubSubDevice));
2624  }
2625  }
2626 
2627  // save pointers to sub-devices for quick retrieval in the future.
2628  Device->SubDevices.push_back(PiSubDevice.get());
2629  PiDevicesCache.push_back(std::move(PiSubDevice));
2630  }
2631  delete[] ZeSubdevices;
2632 
2633  // Save the root device in the cache for future uses.
2634  PiDevicesCache.push_back(std::move(Device));
2635  }
2636  } catch (const std::bad_alloc &) {
2637  return PI_ERROR_OUT_OF_HOST_MEMORY;
2638  } catch (...) {
2639  return PI_ERROR_UNKNOWN;
2640  }
2641  DeviceCachePopulated = true;
2642  return PI_SUCCESS;
2643 }
2644 
2645 pi_result piDeviceRetain(pi_device Device) {
2646   PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2647 
2648  // The root-device ref-count remains unchanged (always 1).
2649  if (Device->isSubDevice()) {
2650  Device->RefCount.increment();
2651  }
2652  return PI_SUCCESS;
2653 }
2654 
2655 pi_result piDeviceRelease(pi_device Device) {
2656   PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2657 
2658  // Root devices are destroyed during the piTearDown process.
2659  if (Device->isSubDevice()) {
2660  if (Device->RefCount.decrementAndTest()) {
2661  delete Device;
2662  }
2663  }
2664 
2665  return PI_SUCCESS;
2666 }
2667 
2668 pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
2669                           size_t ParamValueSize, void *ParamValue,
2670  size_t *ParamValueSizeRet) {
2671 
2672  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2673 
2674  ze_device_handle_t ZeDevice = Device->ZeDevice;
2675 
2676  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2677 
2678  switch (ParamName) {
2679  case PI_DEVICE_INFO_TYPE: {
2680  switch (Device->ZeDeviceProperties->type) {
2681  case ZE_DEVICE_TYPE_GPU:
2682  return ReturnValue(PI_DEVICE_TYPE_GPU);
2683  case ZE_DEVICE_TYPE_CPU:
2684  return ReturnValue(PI_DEVICE_TYPE_CPU);
2685  case ZE_DEVICE_TYPE_MCA:
2686  case ZE_DEVICE_TYPE_FPGA:
2687  return ReturnValue(PI_DEVICE_TYPE_ACC);
2688  default:
2689  zePrint("This device type is not supported\n");
2690  return PI_ERROR_INVALID_VALUE;
2691  }
2692  }
2693   case PI_DEVICE_INFO_PARENT_DEVICE:
2694     return ReturnValue(Device->RootDevice);
2695   case PI_DEVICE_INFO_PLATFORM:
2696     return ReturnValue(Device->Platform);
2697   case PI_DEVICE_INFO_VENDOR_ID:
2698     return ReturnValue(pi_uint32{Device->ZeDeviceProperties->vendorId});
2699  case PI_DEVICE_INFO_UUID:
2700  // Intel extension for device UUID. This returns the UUID as
2701  // std::array<std::byte, 16>. For details about this extension,
2702  // see sycl/doc/extensions/supported/sycl_ext_intel_device_info.md.
2703  return ReturnValue(Device->ZeDeviceProperties->uuid.id);
2704   case PI_DEVICE_INFO_ATOMIC_64:
2705     return ReturnValue(pi_bool{Device->ZeDeviceModuleProperties->flags &
2706  ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS});
2707   case PI_DEVICE_INFO_EXTENSIONS: {
2708     // Convention adopted from OpenCL:
2709  // "Returns a space separated list of extension names (the extension
2710  // names themselves do not contain any spaces) supported by the device."
2711  //
2712  // TODO: Use proper mechanism to get this information from Level Zero after
2713  // it is added to Level Zero.
2714  // Hardcoding the few we know are supported by the current hardware.
2715  //
2716  //
2717  std::string SupportedExtensions;
2718 
2719  // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIR-V support. Core
2720  // feature in >OpenCL 2.1
2721  // cl_khr_subgroups - Extension adds support for implementation-controlled
2722  // subgroups.
2723  // cl_intel_subgroups - Extension adds subgroup features, defined by Intel.
2724  // cl_intel_subgroups_short - Extension adds subgroup functions described in
2725  // the cl_intel_subgroups extension to support 16-bit integer data types
2726  // for performance.
2727  // cl_intel_required_subgroup_size - Extension to allow programmers to
2728  // optionally specify the required subgroup size for a kernel function.
2729  // cl_khr_fp16 - Optional half floating-point support.
2730  // cl_khr_fp64 - Support for double floating-point precision.
2731  // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional
2732  // extensions that implement atomic operations on 64-bit signed and
2733  // unsigned integers to locations in __global and __local memory.
2734  // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory
2735  // objects.
2736  //
2737  // Hardcoding some extensions we know are supported by all Level Zero
2738  // devices.
2739  SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS);
2740  if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16)
2741  SupportedExtensions += ("cl_khr_fp16 ");
2742  if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64)
2743  SupportedExtensions += ("cl_khr_fp64 ");
2744  if (Device->ZeDeviceModuleProperties->flags &
2745  ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)
2746  // int64AtomicsSupported indicates support for both.
2747  SupportedExtensions +=
2748  ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics ");
2749  if (Device->ZeDeviceImageProperties->maxImageDims3D > 0)
2750  // Supports reading and writing of images.
2751  SupportedExtensions += ("cl_khr_3d_image_writes ");
2752 
2753  return ReturnValue(SupportedExtensions.c_str());
2754  }
2755  case PI_DEVICE_INFO_NAME:
2756  return ReturnValue(Device->ZeDeviceProperties->name);
2757  // zeModuleCreate allows using root device module for sub-devices:
2758  // > The application must only use the module for the device, or its
2759  // > sub-devices, which was provided during creation.
2760   case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE:
2761     return ReturnValue(PI_FALSE);
2762   case PI_DEVICE_INFO_COMPILER_AVAILABLE:
2763     return ReturnValue(pi_bool{1});
2764   case PI_DEVICE_INFO_LINKER_AVAILABLE:
2765     return ReturnValue(pi_bool{1});
2766   case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: {
2767     pi_uint32 MaxComputeUnits =
2768  Device->ZeDeviceProperties->numEUsPerSubslice *
2769  Device->ZeDeviceProperties->numSubslicesPerSlice *
2770  Device->ZeDeviceProperties->numSlices;
2771  return ReturnValue(pi_uint32{MaxComputeUnits});
2772  }
2773   case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS:
2774     // Level Zero spec defines only three dimensions
2775  return ReturnValue(pi_uint32{3});
2776   case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
2777     return ReturnValue(
2778  pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize});
2779   case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: {
2780     struct {
2781  size_t Arr[3];
2782  } MaxGroupSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
2783  Device->ZeDeviceComputeProperties->maxGroupSizeY,
2784  Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
2785  return ReturnValue(MaxGroupSize);
2786  }
2787   case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: {
2788     struct {
2789  size_t Arr[3];
2790  } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX,
2791  Device->ZeDeviceComputeProperties->maxGroupCountY,
2792  Device->ZeDeviceComputeProperties->maxGroupCountZ}};
2793  return ReturnValue(MaxGroupCounts);
2794  }
2795   case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY:
2796     return ReturnValue(pi_uint32{Device->ZeDeviceProperties->coreClockRate});
2797   case PI_DEVICE_INFO_ADDRESS_BITS: {
2798     // TODO: To confirm with spec.
2799     return ReturnValue(pi_uint32{64});
2800   }
2801   case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
2802     return ReturnValue(pi_uint64{Device->ZeDeviceProperties->maxMemAllocSize});
2803   case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: {
2804     uint64_t GlobalMemSize = 0;
2805  for (uint32_t I = 0; I < Device->ZeDeviceMemoryProperties->size(); I++) {
2806  GlobalMemSize +=
2807  (*Device->ZeDeviceMemoryProperties.operator->())[I].totalSize;
2808  }
2809  return ReturnValue(pi_uint64{GlobalMemSize});
2810  }
2811   case PI_DEVICE_INFO_LOCAL_MEM_SIZE:
2812     return ReturnValue(
2813  pi_uint64{Device->ZeDeviceComputeProperties->maxSharedLocalMemory});
2814   case PI_DEVICE_INFO_IMAGE_SUPPORT:
2815     return ReturnValue(
2816  pi_bool{Device->ZeDeviceImageProperties->maxImageDims1D > 0});
2817   case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY:
2818     return ReturnValue(pi_bool{(Device->ZeDeviceProperties->flags &
2819  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0});
2820   case PI_DEVICE_INFO_AVAILABLE:
2821     return ReturnValue(pi_bool{ZeDevice ? true : false});
2822  case PI_DEVICE_INFO_VENDOR:
2823  // TODO: Level-Zero does not return vendor's name at the moment
2824  // only the ID.
2825  return ReturnValue("Intel(R) Corporation");
2826   case PI_DEVICE_INFO_DRIVER_VERSION:
2827     return ReturnValue(Device->Platform->ZeDriverVersion.c_str());
2828   case PI_DEVICE_INFO_VERSION:
2829     return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str());
2830   case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
2831     pi_result Res = Device->Platform->populateDeviceCacheIfNeeded();
2832     if (Res != PI_SUCCESS) {
2833  return Res;
2834  }
2835  return ReturnValue(pi_uint32{(unsigned int)(Device->SubDevices.size())});
2836  }
2837   case PI_DEVICE_INFO_REFERENCE_COUNT:
2838     return ReturnValue(pi_uint32{Device->RefCount.load()});
2839   case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
2840     // SYCL spec says: if this SYCL device cannot be partitioned into at least
2841     // two sub devices then the returned vector must be empty.
2842     pi_result Res = Device->Platform->populateDeviceCacheIfNeeded();
2843     if (Res != PI_SUCCESS) {
2844  return Res;
2845  }
2846 
2847  uint32_t ZeSubDeviceCount = Device->SubDevices.size();
2848  if (ZeSubDeviceCount < 2) {
2849  return ReturnValue(pi_device_partition_property{0});
2850  }
2851     // It is debatable whether the SYCL sub-device and partitioning APIs are
2852     // sufficient to expose Level Zero sub-devices. We start with support of
2853     // "partition_by_affinity_domain" and "next_partitionable", but if that
2854     // doesn't seem to be a good fit we could look at adding a more
2855     // descriptive partitioning type.
2856     struct {
2857       pi_device_partition_property Arr[2];
2858     } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0}};
2859  return ReturnValue(PartitionProperties);
2860   }
2861   case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN:
2862     return ReturnValue(pi_device_affinity_domain{
2863         PI_DEVICE_AFFINITY_DOMAIN_NUMA |
2864         PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE});
2865   case PI_DEVICE_INFO_PARTITION_TYPE: {
2866     if (Device->isSubDevice()) {
2867       struct {
2868         pi_device_partition_property Arr[3];
2869       } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
2870                                 PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE,
2871                                 0}};
2872  return ReturnValue(PartitionProperties);
2873  }
2874  // For root-device there is no partitioning to report.
2875  return ReturnValue(pi_device_partition_property{0});
2876  }
2877 
2878  // Everything under here is not supported yet
2879 
2881  return ReturnValue("");
2883  return ReturnValue(pi_bool{true});
2885  return ReturnValue(
2886  size_t{Device->ZeDeviceModuleProperties->printfBufferSize});
2887   case PI_DEVICE_INFO_PROFILE:
2888     return ReturnValue("FULL_PROFILE");
2889   case PI_DEVICE_INFO_BUILT_IN_KERNELS:
2890     // TODO: To find out correct value
2891     return ReturnValue("");
2892   case PI_DEVICE_INFO_QUEUE_PROPERTIES:
2893     return ReturnValue(pi_queue_properties{
2894         PI_QUEUE_ON_DEVICE | PI_QUEUE_PROFILING_ENABLE});
2895   case PI_DEVICE_INFO_EXECUTION_CAPABILITIES:
2896     return ReturnValue(
2897         pi_device_exec_capabilities{PI_DEVICE_EXEC_CAPABILITIES_NATIVE_KERNEL});
2898   case PI_DEVICE_INFO_ENDIAN_LITTLE:
2899     return ReturnValue(pi_bool{true});
2900   case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT:
2901     return ReturnValue(pi_bool{Device->ZeDeviceProperties->flags &
2902  ZE_DEVICE_PROPERTY_FLAG_ECC});
2903   case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION:
2904     return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution});
2905   case PI_DEVICE_INFO_LOCAL_MEM_TYPE:
2906     return ReturnValue(PI_DEVICE_LOCAL_MEM_TYPE_LOCAL);
2907   case PI_DEVICE_INFO_MAX_CONSTANT_ARGS:
2908     return ReturnValue(pi_uint32{64});
2909   case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE:
2910     return ReturnValue(
2911         pi_uint64{Device->ZeDeviceImageProperties->maxImageBufferSize});
2912   case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE:
2913     return ReturnValue(PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE);
2914   case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE:
2915     return ReturnValue(
2916         // TODO[1.0]: how to query cache line-size?
2917         pi_uint32{1});
2918   case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE:
2919     return ReturnValue(pi_uint64{Device->ZeDeviceCacheProperties->cacheSize});
2920   case PI_DEVICE_INFO_MAX_PARAMETER_SIZE:
2921     return ReturnValue(
2922  size_t{Device->ZeDeviceModuleProperties->maxArgumentsSize});
2923   case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN:
2924     // SYCL/OpenCL spec is vague on what this means exactly, but seems to
2925  // be for "alignment requirement (in bits) for sub-buffer offsets."
2926  // An OpenCL implementation returns 8*128, but Level Zero can do just 8,
2927  // meaning unaligned access for values of types larger than 8 bits.
2928  return ReturnValue(pi_uint32{8});
2929   case PI_DEVICE_INFO_MAX_SAMPLERS:
2930     return ReturnValue(pi_uint32{Device->ZeDeviceImageProperties->maxSamplers});
2931   case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS:
2932     return ReturnValue(
2933         pi_uint32{Device->ZeDeviceImageProperties->maxReadImageArgs});
2934   case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS:
2935     return ReturnValue(
2936  pi_uint32{Device->ZeDeviceImageProperties->maxWriteImageArgs});
2937   case PI_DEVICE_INFO_SINGLE_FP_CONFIG: {
2938     uint64_t SingleFPValue = 0;
2939  ze_device_fp_flags_t ZeSingleFPCapabilities =
2940  Device->ZeDeviceModuleProperties->fp32flags;
2941  if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) {
2942  SingleFPValue |= PI_FP_DENORM;
2943  }
2944  if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) {
2945  SingleFPValue |= PI_FP_INF_NAN;
2946  }
2947  if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) {
2948  SingleFPValue |= PI_FP_ROUND_TO_NEAREST;
2949  }
2950  if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) {
2951  SingleFPValue |= PI_FP_ROUND_TO_ZERO;
2952  }
2953  if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) {
2954  SingleFPValue |= PI_FP_ROUND_TO_INF;
2955  }
2956  if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) {
2957  SingleFPValue |= PI_FP_FMA;
2958  }
2959  if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) {
2960  SingleFPValue |= PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
2961  }
2962  return ReturnValue(pi_uint64{SingleFPValue});
2963  }
2964   case PI_DEVICE_INFO_HALF_FP_CONFIG: {
2965     uint64_t HalfFPValue = 0;
2966  ze_device_fp_flags_t ZeHalfFPCapabilities =
2967  Device->ZeDeviceModuleProperties->fp16flags;
2968  if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) {
2969  HalfFPValue |= PI_FP_DENORM;
2970  }
2971  if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) {
2972  HalfFPValue |= PI_FP_INF_NAN;
2973  }
2974  if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) {
2975  HalfFPValue |= PI_FP_ROUND_TO_NEAREST;
2976  }
2977  if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) {
2978  HalfFPValue |= PI_FP_ROUND_TO_ZERO;
2979  }
2980  if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) {
2981  HalfFPValue |= PI_FP_ROUND_TO_INF;
2982  }
2983  if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) {
2984  HalfFPValue |= PI_FP_FMA;
2985  }
2986  if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) {
2987  HalfFPValue |= PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
2988  }
2989  return ReturnValue(pi_uint64{HalfFPValue});
2990  }
2991   case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: {
2992     uint64_t DoubleFPValue = 0;
2993  ze_device_fp_flags_t ZeDoubleFPCapabilities =
2994  Device->ZeDeviceModuleProperties->fp64flags;
2995  if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) {
2996  DoubleFPValue |= PI_FP_DENORM;
2997  }
2998  if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) {
2999  DoubleFPValue |= PI_FP_INF_NAN;
3000  }
3001  if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) {
3002  DoubleFPValue |= PI_FP_ROUND_TO_NEAREST;
3003  }
3004  if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) {
3005  DoubleFPValue |= PI_FP_ROUND_TO_ZERO;
3006  }
3007  if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) {
3008  DoubleFPValue |= PI_FP_ROUND_TO_INF;
3009  }
3010  if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) {
3011  DoubleFPValue |= PI_FP_FMA;
3012  }
3013  if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) {
3014  DoubleFPValue |= PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
3015  }
3016  return ReturnValue(pi_uint64{DoubleFPValue});
3017  }
3018   case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH:
3019     return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D});
3020   case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT:
3021     return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D});
3022   case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH:
3023     return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D});
3024   case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT:
3025     return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D});
3026   case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH:
3027     return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D});
3028   case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE:
3029     return ReturnValue(
3030         size_t{Device->ZeDeviceImageProperties->maxImageBufferSize});
3031   case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE:
3032     return ReturnValue(
3033         size_t{Device->ZeDeviceImageProperties->maxImageArraySlices});
3034  // Handle SIMD widths.
3035  // TODO: can we do better than this?
3038  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 1);
3041  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2);
3044  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4);
3047  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8);
3050  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4);
3053  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8);
3056  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2);
3058  // MaxNumSubGroups = maxTotalGroupSize / min(subGroupSizes)
3059  uint32_t MinSubGroupSize =
3060  Device->ZeDeviceComputeProperties->subGroupSizes[0];
3061  for (uint32_t I = 1;
3062  I < Device->ZeDeviceComputeProperties->numSubGroupSizes; I++) {
3063  if (MinSubGroupSize > Device->ZeDeviceComputeProperties->subGroupSizes[I])
3064  MinSubGroupSize = Device->ZeDeviceComputeProperties->subGroupSizes[I];
3065  }
3066  return ReturnValue(Device->ZeDeviceComputeProperties->maxTotalGroupSize /
3067  MinSubGroupSize);
3068  }
3070  // TODO: Not supported yet. Needs to be updated after support is added.
3071  return ReturnValue(pi_bool{false});
3072  }
3074  // ze_device_compute_properties.subGroupSizes is an array of uint32_t,
3075  // whereas the expected return type is size_t, which may be 8 bytes wide.
3076  return getInfoArray<uint32_t, size_t>(
3077  Device->ZeDeviceComputeProperties->numSubGroupSizes, ParamValueSize,
3078  ParamValue, ParamValueSizeRet,
3079  Device->ZeDeviceComputeProperties->subGroupSizes);
3080  }
3082  // Set to a space-separated list of IL version strings of the form
3083  // <IL_Prefix>_<Major_version>.<Minor_version>.
3084  // "SPIR-V" is a required IL prefix when the cl_khr_il_program extension is
3085  // reported.
3086  uint32_t SpirvVersion =
3087  Device->ZeDeviceModuleProperties->spirvVersionSupported;
3088  uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion);
3089  uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion);
3090 
3091  char SpirvVersionString[50];
3092  int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor,
3093  SpirvVersionMinor);
3094  // The returned string should contain only Len characters.
3095  std::string ILVersion(SpirvVersionString, Len);
3096  return ReturnValue(ILVersion.c_str());
3097  }
3103  auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) {
3104  pi_uint64 Capabilities = 0;
3105  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW)
3106  Capabilities |= PI_USM_ACCESS;
3107  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC)
3108  Capabilities |= PI_USM_ATOMIC_ACCESS;
3109  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT)
3110  Capabilities |= PI_USM_CONCURRENT_ACCESS;
3111  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC)
3112  Capabilities |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
3113  return Capabilities;
3114  };
3115  auto &Props = Device->ZeDeviceMemoryAccessProperties;
3116  switch (ParamName) {
3118  return ReturnValue(MapCaps(Props->hostAllocCapabilities));
3120  return ReturnValue(MapCaps(Props->deviceAllocCapabilities));
3122  return ReturnValue(MapCaps(Props->sharedSingleDeviceAllocCapabilities));
3124  return ReturnValue(MapCaps(Props->sharedCrossDeviceAllocCapabilities));
3126  return ReturnValue(MapCaps(Props->sharedSystemAllocCapabilities));
3127  default:
3128  die("piDeviceGetInfo: enexpected ParamName.");
3129  }
3130  }
3131 
3132  // Intel extensions for GPU information
3134  if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
3135  zePrint("Set SYCL_ENABLE_PCI=1 to obtain PCI data.\n");
3136  return PI_ERROR_INVALID_VALUE;
3137  }
3138  ZesStruct<zes_pci_properties_t> ZeDevicePciProperties;
3139  ZE_CALL(zesDevicePciGetProperties, (ZeDevice, &ZeDevicePciProperties));
3140  constexpr size_t AddressBufferSize = 13;
3141  char AddressBuffer[AddressBufferSize];
3142  std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x",
3143  ZeDevicePciProperties.address.domain,
3144  ZeDevicePciProperties.address.bus,
3145  ZeDevicePciProperties.address.device,
3146  ZeDevicePciProperties.address.function);
3147  return ReturnValue(AddressBuffer);
3148  }
3149 
3151  if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
3152  setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory",
3153  PI_SUCCESS);
3154  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3155  }
3156  // Only report device memory which zeMemAllocDevice can allocate from.
3157  // Currently this is only the one enumerated with ordinal 0.
3158  uint64_t FreeMemory = 0;
3159  uint32_t MemCount = 1;
3160  zes_mem_handle_t ZesMemHandle;
3161  ZE_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, &ZesMemHandle));
3162  if (MemCount != 0) {
3163  ZesStruct<zes_mem_properties_t> ZeMemProperties;
3164  ZE_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZeMemProperties));
3165  ZesStruct<zes_mem_state_t> ZeMemState;
3166  ZE_CALL(zesMemoryGetState, (ZesMemHandle, &ZeMemState));
3167  FreeMemory += ZeMemState.free;
3168  }
3169  return ReturnValue(FreeMemory);
3170  }
3171 
3173  pi_uint32 count = Device->ZeDeviceProperties->numEUsPerSubslice *
3174  Device->ZeDeviceProperties->numSubslicesPerSlice *
3175  Device->ZeDeviceProperties->numSlices;
3176  return ReturnValue(pi_uint32{count});
3177  }
3179  return ReturnValue(
3180  pi_uint32{Device->ZeDeviceProperties->physicalEUSimdWidth});
3182  return ReturnValue(pi_uint32{Device->ZeDeviceProperties->numSlices});
3184  return ReturnValue(
3185  pi_uint32{Device->ZeDeviceProperties->numSubslicesPerSlice});
3187  return ReturnValue(
3188  pi_uint32{Device->ZeDeviceProperties->numEUsPerSubslice});
3190  return ReturnValue(pi_uint32{Device->ZeDeviceProperties->numThreadsPerEU});
3192  // Currently not supported in the Level Zero runtime.
3193  return PI_ERROR_INVALID_VALUE;
3195  return PI_ERROR_INVALID_VALUE;
3196 
3197  // TODO: Implement.
3199  default:
3200  zePrint("Unsupported ParamName in piDeviceGetInfo\n");
3201  zePrint("ParamName=%d(0x%x)\n", ParamName, ParamName);
3202  return PI_ERROR_INVALID_VALUE;
3203  }
3204 
3205  return PI_SUCCESS;
3206 }
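// [Editorial sketch, not part of the plugin source] A caller-side view of the
// FP-config queries handled above. PI_DEVICE_INFO_SINGLE_FP_CONFIG and the
// PI_FP_* bits are assumed to come from pi.h; error handling is elided.
//
//   pi_uint64 FPCaps = 0;
//   PI_CALL(piDeviceGetInfo(Device, PI_DEVICE_INFO_SINGLE_FP_CONFIG,
//                           sizeof(FPCaps), &FPCaps, nullptr));
//   if (FPCaps & PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) {
//     // float divide/sqrt are correctly rounded on this device
//   }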
3207 
3210  pi_uint32 NumDevices, pi_device *OutDevices,
3211  pi_uint32 *OutNumDevices) {
3212  // Other partitioning ways are not supported by Level Zero
3216  return PI_ERROR_INVALID_VALUE;
3217  }
3218 
3219  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3220 
3221  // The devices cache is normally created in piDevicesGet, but still make
3222  // sure that it is populated.
3223  //
3225  if (Res != PI_SUCCESS) {
3226  return Res;
3227  }
3228 
3229  if (OutNumDevices) {
3230  *OutNumDevices = Device->SubDevices.size();
3231  }
3232 
3233  if (OutDevices) {
3234  // TODO: Consider support for partitioning to <= total sub-devices.
3235  // Currently supported partitioning (by affinity domain/numa) would always
3236  // partition to all sub-devices.
3237  //
3238  PI_ASSERT(NumDevices == Device->SubDevices.size(), PI_ERROR_INVALID_VALUE);
3239 
3240  for (uint32_t I = 0; I < NumDevices; I++) {
3241  OutDevices[I] = Device->SubDevices[I];
3242  // Reusing the same pi_device requires incrementing its reference count.
3243  PI_CALL(piDeviceRetain(OutDevices[I]));
3244  }
3245  }
3246  return PI_SUCCESS;
3247 }
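// [Editorial sketch, not part of the plugin source] The usual two-call pattern
// for partitioning by affinity domain; property names are assumed from pi.h.
// As noted above, Level Zero always partitions into all sub-devices.
//
//   pi_device_partition_property Props[] = {
//       PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
//       PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0};
//   pi_uint32 Count = 0;
//   PI_CALL(piDevicePartition(Device, Props, 0, nullptr, &Count));
//   std::vector<pi_device> SubDevices(Count);
//   PI_CALL(piDevicePartition(Device, Props, Count, SubDevices.data(), nullptr));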
3248 
3249 pi_result
3250 piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context?
3251  pi_device_binary *Binaries, pi_uint32 NumBinaries,
3252  pi_uint32 *SelectedBinaryInd) {
3253 
3254  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3255  PI_ASSERT(SelectedBinaryInd, PI_ERROR_INVALID_VALUE);
3256  PI_ASSERT(NumBinaries == 0 || Binaries, PI_ERROR_INVALID_VALUE);
3257 
3258  // TODO: this is a bare-bones implementation for choosing a device image
3259  // that would be compatible with the targeted device. An AOT-compiled
3260  // image is preferred over SPIR-V for known devices (i.e. Intel devices)
3261  // The implementation makes no effort to differentiate between multiple images
3262  // for the given device, and simply picks the first one compatible.
3263  //
3264  // A real implementation would use the same mechanism the OpenCL ICD dispatcher
3265  // uses. Something like:
3266  // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT);
3267  // return context->dispatch->piextDeviceSelectIR(
3268  // ctx, images, num_images, selected_image);
3269  // where context->dispatch is set to the dispatch table provided by PI
3270  // plugin for platform/device the ctx was created for.
3271 
3272  // Look for a GEN binary, which we know can only be handled by Level Zero now.
3273  const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN;
3274 
3275  // Find the appropriate device image, falling back to SPIR-V if none is found.
3276  constexpr pi_uint32 InvalidInd = std::numeric_limits<pi_uint32>::max();
3277  pi_uint32 Spirv = InvalidInd;
3278 
3279  for (pi_uint32 i = 0; i < NumBinaries; ++i) {
3280  if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) {
3281  *SelectedBinaryInd = i;
3282  return PI_SUCCESS;
3283  }
3284  if (strcmp(Binaries[i]->DeviceTargetSpec,
3286  Spirv = i;
3287  }
3288  // Point to the SPIR-V image, if one was indeed found.
3289  if ((*SelectedBinaryInd = Spirv) != InvalidInd)
3290  return PI_SUCCESS;
3291 
3292  // No image can be loaded for the given device
3293  return PI_ERROR_INVALID_BINARY;
3294 }
3295 
3297  pi_native_handle *NativeHandle) {
3298  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3299  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3300 
3301  auto ZeDevice = pi_cast<ze_device_handle_t *>(NativeHandle);
3302  // Extract the Level Zero device handle from the given PI device
3303  *ZeDevice = Device->ZeDevice;
3304  return PI_SUCCESS;
3305 }
3306 
3308  pi_platform Platform,
3309  pi_device *Device) {
3310  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3311  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3312 
3313  auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
3314 
3315  // The SYCL spec requires that the set of devices must remain fixed for the
3316  // duration of the application's execution. We assume that we found all of the
3317  // Level Zero devices when we initialized the platforms/devices cache, so the
3318  // "NativeHandle" must already be in the cache. If it is not, this must not be
3319  // a valid Level Zero device.
3320  //
3321  // TODO: maybe we should populate the cache of platforms if it wasn't already.
3322  // For now, assert that it was populated.
3323  PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE);
3324  const std::lock_guard<sycl::detail::SpinLock> Lock{*PiPlatformsCacheMutex};
3325 
3326  pi_device Dev = nullptr;
3327  for (auto &ThePlatform : *PiPlatformsCache) {
3328  Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice);
3329  if (Dev) {
3330  // Check that the input Platform, if one was given, matches the found one.
3331  PI_ASSERT(!Platform || Platform == ThePlatform,
3332  PI_ERROR_INVALID_PLATFORM);
3333  break;
3334  }
3335  }
3336 
3337  if (Dev == nullptr)
3338  return PI_ERROR_INVALID_VALUE;
3339 
3340  *Device = Dev;
3341  return PI_SUCCESS;
3342 }
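// [Editorial sketch, not part of the plugin source] Round-tripping a device
// through its native handle with the two entry points above. The returned
// pi_device comes from the platform cache, so no new device object is created.
//
//   pi_native_handle Native = 0;
//   PI_CALL(piextDeviceGetNativeHandle(Device, &Native));
//   pi_device SameDevice = nullptr;
//   PI_CALL(piextDeviceCreateWithNativeHandle(Native, /*Platform=*/nullptr,
//                                             &SameDevice));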
3343 
3345  pi_uint32 NumDevices, const pi_device *Devices,
3346  void (*PFnNotify)(const char *ErrInfo,
3347  const void *PrivateInfo, size_t CB,
3348  void *UserData),
3349  void *UserData, pi_context *RetContext) {
3350  (void)Properties;
3351  (void)PFnNotify;
3352  (void)UserData;
3353  PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
3354  PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
3355  PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
3356 
3357  pi_platform Platform = (*Devices)->Platform;
3358  ZeStruct<ze_context_desc_t> ContextDesc;
3359  ContextDesc.flags = 0;
3360 
3361  ze_context_handle_t ZeContext;
3362  ZE_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext));
3363  try {
3364  *RetContext = new _pi_context(ZeContext, NumDevices, Devices, true);
3365  (*RetContext)->initialize();
3366  if (IndirectAccessTrackingEnabled) {
3367  std::scoped_lock<pi_shared_mutex> Lock(Platform->ContextsMutex);
3368  Platform->Contexts.push_back(*RetContext);
3369  }
3370  } catch (const std::bad_alloc &) {
3371  return PI_ERROR_OUT_OF_HOST_MEMORY;
3372  } catch (...) {
3373  return PI_ERROR_UNKNOWN;
3374  }
3375 
3376  return PI_SUCCESS;
3377 }
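// [Editorial sketch, not part of the plugin source] Minimal context creation
// for a single device; the properties and the notification callback are unused
// by this plugin, so nullptr is passed for them.
//
//   pi_context Ctx = nullptr;
//   PI_CALL(piContextCreate(/*Properties=*/nullptr, 1, &Device,
//                           /*PFnNotify=*/nullptr, /*UserData=*/nullptr, &Ctx));
//   ...
//   PI_CALL(piContextRelease(Ctx));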
3378 
3380  size_t ParamValueSize, void *ParamValue,
3381  size_t *ParamValueSizeRet) {
3382 
3383  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3384 
3385  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
3386  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3387  switch (ParamName) {
3389  return getInfoArray(Context->Devices.size(), ParamValueSize, ParamValue,
3390  ParamValueSizeRet, &Context->Devices[0]);
3392  return ReturnValue(pi_uint32(Context->Devices.size()));
3394  return ReturnValue(pi_uint32{Context->RefCount.load()});
3396  default:
3397  // TODO: implement other parameters
3398  die("piGetContextInfo: unsuppported ParamName.");
3399  }
3400 
3401  return PI_SUCCESS;
3402 }
3403 
3404 // FIXME: Dummy implementation to prevent link failure.
3406  pi_context_extended_deleter Function,
3407  void *UserData) {
3408  (void)Context;
3409  (void)Function;
3410  (void)UserData;
3411  die("piextContextSetExtendedDeleter: not supported");
3412  return PI_SUCCESS;
3413 }
3414 
3416  pi_native_handle *NativeHandle) {
3417  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3418  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3419 
3420  auto ZeContext = pi_cast<ze_context_handle_t *>(NativeHandle);
3421  // Extract the Level Zero context handle from the given PI context
3422  *ZeContext = Context->ZeContext;
3423  return PI_SUCCESS;
3424 }
3425 
3427  pi_uint32 NumDevices,
3428  const pi_device *Devices,
3429  bool OwnNativeHandle,
3430  pi_context *RetContext) {
3431  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3432  PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
3433  PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
3434  PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
3435 
3436  try {
3437  *RetContext = new _pi_context(pi_cast<ze_context_handle_t>(NativeHandle),
3438  NumDevices, Devices, OwnNativeHandle);
3439  (*RetContext)->initialize();
3440  } catch (const std::bad_alloc &) {
3441  return PI_ERROR_OUT_OF_HOST_MEMORY;
3442  } catch (...) {
3443  return PI_ERROR_UNKNOWN;
3444  }
3445 
3446  return PI_SUCCESS;
3447 }
3448 
3450 
3451  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3452 
3453  Context->RefCount.increment();
3454  return PI_SUCCESS;
3455 }
3456 
3457 // Helper function to release the context. A caller must lock the platform-level
3458 // mutex guarding the container of contexts because the context can be removed
3459 // from the list of tracked contexts.
3461 
3462  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3463 
3464  if (!Context->RefCount.decrementAndTest())
3465  return PI_SUCCESS;
3466 
3467  if (IndirectAccessTrackingEnabled) {
3468  pi_platform Plt = Context->getPlatform();
3469  auto &Contexts = Plt->Contexts;
3470  auto It = std::find(Contexts.begin(), Contexts.end(), Context);
3471  if (It != Contexts.end())
3472  Contexts.erase(It);
3473  }
3474  ze_context_handle_t DestroyZeContext =
3475  Context->OwnZeContext ? Context->ZeContext : nullptr;
3476 
3477  // Clean up any live memory associated with Context
3478  pi_result Result = Context->finalize();
3479 
3480  // We must delete the Context first and only then destroy the ZeContext,
3481  // because deallocation of some pi_context members still requires the
3482  // ZeContext.
3483  delete Context;
3484 
3485  // Destruction of some members of pi_context uses the L0 context
3486  // and therefore it must still be valid at that point.
3487  // Technically this should be placed in the destructor of pi_context,
3488  // but that would make API error handling more complex.
3489  if (DestroyZeContext)
3490  ZE_CALL(zeContextDestroy, (DestroyZeContext));
3491 
3492  return Result;
3493 }
3494 
3496  pi_platform Plt = Context->getPlatform();
3497  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3498  std::defer_lock);
3499  if (IndirectAccessTrackingEnabled)
3500  ContextsLock.lock();
3501 
3502  return ContextReleaseHelper(Context);
3503 }
3504 
3507 
3508  // Check that unexpected bits are not set.
3513  PI_ERROR_INVALID_VALUE);
3514 
3515  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3516  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3517  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3518  PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_DEVICE);
3519 
3520  // Create placeholder queues in the compute queue group.
3521  // Actual L0 queues will be created at first use.
3522  std::vector<ze_command_queue_handle_t> ZeComputeCommandQueues(
3523  Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues,
3524  nullptr);
3525 
3526  // Create placeholder queues in the copy queue group (main and link
3527  // native groups are combined into one group).
3528  // Actual L0 queues will be created at first use.
3529  size_t NumCopyGroups = 0;
3530  if (Device->hasMainCopyEngine()) {
3531  NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::MainCopy]
3532  .ZeProperties.numQueues;
3533  }
3534  if (Device->hasLinkCopyEngine()) {
3535  NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::LinkCopy]
3536  .ZeProperties.numQueues;
3537  }
3538  std::vector<ze_command_queue_handle_t> ZeCopyCommandQueues(NumCopyGroups,
3539  nullptr);
3540 
3541  try {
3542  *Queue = new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues, Context,
3543  Device, true, Properties);
3544  } catch (const std::bad_alloc &) {
3545  return PI_ERROR_OUT_OF_HOST_MEMORY;
3546  } catch (...) {
3547  return PI_ERROR_UNKNOWN;
3548  }
3549 
3550  // Do eager initialization of Level Zero handles on request.
3551  if (doEagerInit) {
3552  pi_queue Q = *Queue;
3553  // Creates the given number of command-lists.
3554  auto warmupQueueGroup = [Q](bool UseCopyEngine,
3555  uint32_t RepeatCount) -> pi_result {
3556  pi_command_list_ptr_t CommandList;
3557  while (RepeatCount--) {
3558  if (Q->Device->useImmediateCommandLists()) {
3559  CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList();
3560  } else {
3561  // Heuristically create some number of regular command-lists to reuse.
3562  for (int I = 0; I < 10; ++I) {
3563  PI_CALL(Q->createCommandList(UseCopyEngine, CommandList));
3564  // Immediately return them to the cache of available command-lists.
3565  std::vector<pi_event> EventsUnused;
3566  PI_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */,
3567  EventsUnused));
3568  }
3569  }
3570  }
3571  return PI_SUCCESS;
3572  };
3573  // Create as many command-lists as there are queues in the group.
3574  // With this the underlying round-robin logic would initialize all
3575  // native queues, and create command-lists and their fences.
3576  PI_CALL(warmupQueueGroup(false, Q->ComputeQueueGroup.UpperIndex -
3577  Q->ComputeQueueGroup.LowerIndex + 1));
3578  if (Q->useCopyEngine()) {
3579  PI_CALL(warmupQueueGroup(true, Q->CopyQueueGroup.UpperIndex -
3580  Q->CopyQueueGroup.LowerIndex + 1));
3581  }
3582  // TODO: warmup event pools. Both host-visible and device-only.
3583  }
3584  return PI_SUCCESS;
3585 }
3586 
3588  size_t ParamValueSize, void *ParamValue,
3589  size_t *ParamValueSizeRet) {
3590 
3591  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3592 
3593  std::shared_lock<pi_shared_mutex> Lock(Queue->Mutex);
3594  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3595  // TODO: consider support for queue properties and size
3596  switch (ParamName) {
3597  case PI_QUEUE_INFO_CONTEXT:
3598  return ReturnValue(Queue->Context);
3599  case PI_QUEUE_INFO_DEVICE:
3600  return ReturnValue(Queue->Device);
3602  return ReturnValue(pi_uint32{Queue->RefCount.load()});
3604  die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n");
3605  break;
3606  case PI_QUEUE_INFO_SIZE:
3607  die("PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n");
3608  break;
3610  die("PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo not implemented\n");
3611  break;
3612  default:
3613  zePrint("Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n",
3614  ParamName, ParamName);
3615  return PI_ERROR_INVALID_VALUE;
3616  }
3617 
3618  return PI_SUCCESS;
3619 }
3620 
3622  {
3623  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
3624  Queue->RefCountExternal++;
3625  }
3626  Queue->RefCount.increment();
3627  return PI_SUCCESS;
3628 }
3629 
3631  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3632  std::vector<pi_event> EventListToCleanup;
3633 
3634  {
3635  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
3636 
3637  if ((--Queue->RefCountExternal) != 0)
3638  return PI_SUCCESS;
3639 
3640  // When external reference count goes to zero it is still possible
3641  // that internal references still exist, e.g. command-lists that
3642  // are not yet completed. So do full queue synchronization here
3643  // and perform proper cleanup.
3644  //
3645  // It is possible to get to here and still have an open command list
3646  // if no wait or finish ever occurred for this queue.
3647  if (auto Res = Queue->executeAllOpenCommandLists())
3648  return Res;
3649 
3650  // Make sure all commands get executed.
3651  Queue->synchronize();
3652 
3653  // Destroy all the fences associated with this queue.
3654  for (auto it = Queue->CommandListMap.begin();
3655  it != Queue->CommandListMap.end(); ++it) {
3656  // This fence wasn't yet signalled when we polled it for recycling
3657  // the command-list, so we need to release the command-list too.
3658  // For immediate command-lists we don't need an L0 reset of the
3659  // command-list, but we do need the event cleanup, which also lives in
3660  // the resetCommandList function.
3661  // If the fence is a nullptr we are using immediate commandlists,
3662  // otherwise regular commandlists which use a fence.
3663  if (it->second.ZeFence == nullptr || it->second.ZeFenceInUse) {
3664  Queue->resetCommandList(it, true, EventListToCleanup);
3665  }
3666  // TODO: remove "if" when the problem is fixed in the Level Zero
3667  // runtime. Destroy only if a queue is healthy. Destroying a fence may
3668  // cause a hang otherwise.
3669  // If the fence is a nullptr we are using immediate commandlists.
3670  if (Queue->Healthy && it->second.ZeFence != nullptr)
3671  ZE_CALL(zeFenceDestroy, (it->second.ZeFence));
3672  }
3673  Queue->CommandListMap.clear();
3674  }
3675 
3676  for (auto Event : EventListToCleanup) {
3677  // We don't need to synchronize the events since the queue
3678  // synchronization above already did that.
3679  {
3680  std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
3681  Event->Completed = true;
3682  }
3684  // This event was removed from the command list, so decrement its ref count
3685  // (it was incremented when the event was added to the command list).
3687  }
3689  return PI_SUCCESS;
3690 }
3691 
3693  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3694 
3695  if (!Queue->RefCount.decrementAndTest())
3696  return PI_SUCCESS;
3697 
3698  if (Queue->OwnZeCommandQueue) {
3699  for (auto &ZeQueue : Queue->ComputeQueueGroup.ZeQueues) {
3700  if (ZeQueue)
3701  ZE_CALL(zeCommandQueueDestroy, (ZeQueue));
3702  }
3703  for (auto &ZeQueue : Queue->CopyQueueGroup.ZeQueues) {
3704  if (ZeQueue)
3705  ZE_CALL(zeCommandQueueDestroy, (ZeQueue));
3706  }
3707  }
3708 
3709  zePrint("piQueueRelease(compute) NumTimesClosedFull %d, "
3710  "NumTimesClosedEarly %d\n",
3713  zePrint("piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly "
3714  "%d\n",
3717 
3718  delete Queue;
3719 
3720  return PI_SUCCESS;
3721 }
3722 
3724  // Wait until command lists attached to the command queue are executed.
3725  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3726 
3727  if (Queue->Device->useImmediateCommandLists()) {
3728  // Lock automatically releases when this goes out of scope.
3729  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
3730 
3731  Queue->synchronize();
3732  } else {
3733  std::unique_lock<pi_shared_mutex> Lock(Queue->Mutex);
3734  std::vector<ze_command_queue_handle_t> ZeQueues;
3735 
3736  // Execute any command list that may still be open.
3737  if (auto Res = Queue->executeAllOpenCommandLists())
3738  return Res;
3739 
3740  // Make a copy of queues to sync and release the lock.
3741  ZeQueues = Queue->CopyQueueGroup.ZeQueues;
3742  std::copy(Queue->ComputeQueueGroup.ZeQueues.begin(),
3743  Queue->ComputeQueueGroup.ZeQueues.end(),
3744  std::back_inserter(ZeQueues));
3745 
3746  // Remember the last command's event.
3747  auto LastCommandEvent = Queue->LastCommandEvent;
3748 
3749  // Don't hold a lock to the queue's mutex while waiting.
3750  // This allows other threads to continue working with the queue.
3751  // TODO: this currently exhibits some issues in the driver, so
3752  // we control this with an env var. Remove this control when
3753  // we settle one way or the other.
3754  static bool HoldLock =
3755  std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr;
3756  if (!HoldLock) {
3757  Lock.unlock();
3758  }
3759 
3760  for (auto ZeQueue : ZeQueues) {
3761  if (ZeQueue)
3762  ZE_CALL(zeHostSynchronize, (ZeQueue));
3763  }
3764 
3765  // Prevent unneeded, already-finished events from showing up in the wait list.
3766  // We can only do so if nothing else was submitted to the queue
3767  // while we were synchronizing it.
3768  if (!HoldLock) {
3769  std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
3770  if (LastCommandEvent == Queue->LastCommandEvent) {
3771  Queue->LastCommandEvent = nullptr;
3772  }
3773  } else {
3774  Queue->LastCommandEvent = nullptr;
3775  }
3776  }
3777  // Reset signalled command lists and return them back to the cache of
3778  // available command lists.
3779  resetCommandLists(Queue);
3780  return PI_SUCCESS;
3781 }
3782 
3783 // Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
3784 // so this can be left as a no-op.
3786  (void)Queue;
3787  return PI_SUCCESS;
3788 }
3789 
3791  pi_native_handle *NativeHandle) {
3792  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3793  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3794 
3795  // Lock automatically releases when this goes out of scope.
3796  std::shared_lock<pi_shared_mutex> lock(Queue->Mutex);
3797 
3798  auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
3799  // Extract the Level Zero compute queue handle from the given PI queue
3800  *ZeQueue = Queue->ComputeQueueGroup.ZeQueues[0];
3801  return PI_SUCCESS;
3802 }
3803 
3806  bool OwnNativeHandle,
3807  pi_queue *Queue) {
3808  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3809  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3810  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3811  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3812 
3813  auto ZeQueue = pi_cast<ze_command_queue_handle_t>(NativeHandle);
3814  // Assume this is the "0" index queue in the compute command-group.
3815  std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
3816 
3817  // TODO: see what we can do to correctly initialize PI queue for
3818  // compute vs. copy Level-Zero queue. Currently we will send
3819  // all commands to the "ZeQueue".
3820  std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
3821  *Queue =
3822  new _pi_queue(ZeQueues, ZeroCopyQueues, Context, Device, OwnNativeHandle);
3823  return PI_SUCCESS;
3824 }
3825 
3826 // If indirect access tracking is enabled then this performs reference
3827 // counting, otherwise it just calls zeMemAllocDevice.
3829  pi_device Device, size_t Size) {
3830  pi_platform Plt = Device->Platform;
3831  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3832  std::defer_lock);
3833  if (IndirectAccessTrackingEnabled) {
3834  // Lock the mutex guarding the contexts container in the platform.
3835  // This prevents new kernels from being submitted in any context while
3836  // we are in the process of allocating memory; this is needed to
3837  // properly capture allocations by kernels with indirect access.
3838  ContextsLock.lock();
3839  // We are going to defer memory release if there are kernels with
3840  // indirect access; that is why we explicitly retain the context, to be
3841  // sure that it is released only after all memory allocations in this
3842  // context are released.
3844  }
3845 
3846  ze_device_mem_alloc_desc_t ZeDesc = {};
3847  ZeDesc.flags = 0;
3848  ZeDesc.ordinal = 0;
3849  ZE_CALL(zeMemAllocDevice,
3850  (Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr));
3851 
3852  if (IndirectAccessTrackingEnabled) {
3853  // Keep track of all memory allocations in the context
3854  Context->MemAllocs.emplace(std::piecewise_construct,
3855  std::forward_as_tuple(*ResultPtr),
3856  std::forward_as_tuple(Context));
3857  }
3858  return PI_SUCCESS;
3859 }
3860 
3861 // If indirect access tracking is enabled then this performs reference
3862 // counting, otherwise it just calls zeMemAllocHost.
3864  size_t Size) {
3865  pi_platform Plt = Context->getPlatform();
3866  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3867  std::defer_lock);
3868  if (IndirectAccessTrackingEnabled) {
3869  // Lock the mutex guarding the contexts container in the platform.
3870  // This prevents new kernels from being submitted in any context while
3871  // we are in the process of allocating memory; this is needed to
3872  // properly capture allocations by kernels with indirect access.
3873  ContextsLock.lock();
3874  // We are going to defer memory release if there are kernels with
3875  // indirect access; that is why we explicitly retain the context, to be
3876  // sure that it is released only after all memory allocations in this
3877  // context are released.
3879  }
3880 
3882  ZeDesc.flags = 0;
3883  ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr));
3884 
3885  if (IndirectAccessTrackingEnabled) {
3886  // Keep track of all memory allocations in the context
3887  Context->MemAllocs.emplace(std::piecewise_construct,
3888  std::forward_as_tuple(*ResultPtr),
3889  std::forward_as_tuple(Context));
3890  }
3891  return PI_SUCCESS;
3892 }
3893 
3895  void *HostPtr, pi_mem *RetMem,
3896  const pi_mem_properties *properties) {
3897 
3898  // TODO: implement support for more access modes
3899  if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) ||
3900  (Flags & PI_MEM_ACCESS_READ_ONLY))) {
3901  die("piMemBufferCreate: Level-Zero supports read-write and read-only "
3902  "buffer,"
3903  "but not other accesses (such as write-only) yet.");
3904  }
3905 
3906  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3907  PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE);
3908 
3909  if (properties != nullptr) {
3910  die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet");
3911  }
3912 
3913  if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
3914  // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of
3915  // pinned host memory, see:
3916  // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc
3917  // We are however missing such functionality in Level Zero, so we just
3918  // ignore the flag for now.
3919  //
3920  }
3921 
3922  // If USM Import feature is enabled and hostptr is supplied,
3923  // import the hostptr if not already imported into USM.
3924  // Data transfer rate is maximized when both source and destination
3925  // are USM pointers. Promotion of the host pointer to USM thus
3926  // optimizes data transfer performance.
3927  bool HostPtrImported = false;
3928  if (ZeUSMImport.Enabled && HostPtr != nullptr &&
3929  (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) {
3930  // Query memory type of the host pointer
3931  ze_device_handle_t ZeDeviceHandle;
3932  ZeStruct<ze_memory_allocation_properties_t> ZeMemoryAllocationProperties;
3933  ZE_CALL(zeMemGetAllocProperties,
3934  (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties,
3935  &ZeDeviceHandle));
3936 
3937  // If the pointer is not a USM allocation of any type, we can import it.
3938  if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
3939  // Promote the host ptr to USM host memory
3940  ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver;
3941  ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size);
3942  HostPtrImported = true;
3943  }
3944  }
3945 
3946  pi_buffer Buffer = nullptr;
3947  auto HostPtrOrNull =
3948  (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr;
3949  try {
3950  Buffer = new _pi_buffer(Context, Size, HostPtrOrNull, HostPtrImported);
3951  } catch (const std::bad_alloc &) {
3952  return PI_ERROR_OUT_OF_HOST_MEMORY;
3953  } catch (...) {
3954  return PI_ERROR_UNKNOWN;
3955  }
3956 
3957  // Initialize the buffer with user data
3958  if (HostPtr) {
3959  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
3960  (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
3961 
3962  char *ZeHandleDst;
3963  PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only));
3964  if (Buffer->OnHost) {
3965  // Do a host to host copy.
3966  // For an imported HostPtr the copy is unneeded.
3967  if (!HostPtrImported)
3968  memcpy(ZeHandleDst, HostPtr, Size);
3969  } else {
3970  // Initialize the buffer synchronously with immediate offload.
3971  // zeCommandListAppendMemoryCopy must not be called from simultaneous
3972  // threads with the same command list handle, so we need an exclusive lock.
3973  std::scoped_lock<pi_mutex> Lock(Context->ImmediateCommandListMutex);
3974  ZE_CALL(zeCommandListAppendMemoryCopy,
3975  (Context->ZeCommandListInit, ZeHandleDst, HostPtr, Size,
3976  nullptr, 0, nullptr));
3977  }
3978  } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
3979  // Nothing more to do.
3980  } else {
3981  die("piMemBufferCreate: not implemented");
3982  }
3983  }
3984 
3985  *RetMem = Buffer;
3986  return PI_SUCCESS;
3987 }
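// [Editorial sketch, not part of the plugin source] Creating a buffer that
// copies user data at creation time, exercising the HOST_PTR_COPY path above.
// Flag names are assumed from pi.h.
//
//   std::vector<float> Host(1024, 1.0f);
//   pi_mem Buf = nullptr;
//   PI_CALL(piMemBufferCreate(Ctx,
//                             PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY,
//                             Host.size() * sizeof(float), Host.data(), &Buf,
//                             /*properties=*/nullptr));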
3988 
3989 pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, size_t ParamValueSize,
3990  void *ParamValue, size_t *ParamValueSizeRet) {
3991  PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE);
3992  // piMemImageGetInfo must be used for images
3993  PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_VALUE);
3994 
3995  std::shared_lock<pi_shared_mutex> Lock(Mem->Mutex);
3996  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3997 
3998  switch (ParamName) {
3999  case PI_MEM_CONTEXT:
4000  return ReturnValue(Mem->Context);
4001  case PI_MEM_SIZE: {
4002  // Get size of the allocation
4003  auto Buffer = pi_cast<pi_buffer>(Mem);
4004  return ReturnValue(size_t{Buffer->Size});
4005  }
4006  default:
4007  die("piMemGetInfo: Parameter is not implemented");
4008  }
4009 
4010  return {};
4011 }
4012 
4014  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
4015 
4016  Mem->RefCount.increment();
4017  return PI_SUCCESS;
4018 }
4019 
4020 // If indirect access tracking is not enabled then this function just performs
4021 // zeMemFree. If indirect access tracking is enabled then reference counting is
4022 // performed.
4024  bool OwnZeMemHandle = true) {
4025  pi_platform Plt = Context->getPlatform();
4026  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
4027  std::defer_lock);
4028  if (IndirectAccessTrackingEnabled) {
4029  ContextsLock.lock();
4030  auto It = Context->MemAllocs.find(Ptr);
4031  if (It == std::end(Context->MemAllocs)) {
4032  die("All memory allocations must be tracked!");
4033  }
4034  if (!It->second.RefCount.decrementAndTest()) {
4035  // Memory can't be deallocated yet.
4036  return PI_SUCCESS;
4037  }
4038 
4039  // The reference count is zero, so it is safe to free the memory.
4040  // We don't need to track this allocation anymore.
4041  Context->MemAllocs.erase(It);
4042  }
4043 
4044  if (OwnZeMemHandle)
4045  ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
4046 
4047  if (IndirectAccessTrackingEnabled)
4049 
4050  return PI_SUCCESS;
4051 }
4052 
4053 static pi_result USMFreeHelper(pi_context Context, void *Ptr,
4054  bool OwnZeMemHandle);
4055 
4057  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
4058 
4059  if (!Mem->RefCount.decrementAndTest())
4060  return PI_SUCCESS;
4061 
4062  if (Mem->isImage()) {
4063  char *ZeHandleImage;
4064  PI_CALL(Mem->getZeHandle(ZeHandleImage, _pi_mem::write_only));
4065  ZE_CALL(zeImageDestroy, (pi_cast<ze_image_handle_t>(ZeHandleImage)));
4066  } else {
4067  auto Buffer = static_cast<pi_buffer>(Mem);
4068  Buffer->free();
4069  }
4070  delete Mem;
4071 
4072  return PI_SUCCESS;
4073 }
4074 
4076  const pi_image_format *ImageFormat,
4077  const pi_image_desc *ImageDesc, void *HostPtr,
4078  pi_mem *RetImage) {
4079 
4080  // TODO: implement read-only, write-only
4081  if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) {
4082  die("piMemImageCreate: Level-Zero implements only read-write buffer,"
4083  "no read-only or write-only yet.");
4084  }
4085  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4086  PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE);
4087  PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
4088 
4089  ze_image_format_type_t ZeImageFormatType;
4090  size_t ZeImageFormatTypeSize;
4091  switch (ImageFormat->image_channel_data_type) {
4093  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
4094  ZeImageFormatTypeSize = 32;
4095  break;
4097  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
4098  ZeImageFormatTypeSize = 16;
4099  break;
4101  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
4102  ZeImageFormatTypeSize = 32;
4103  break;
4105  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
4106  ZeImageFormatTypeSize = 16;
4107  break;
4109  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
4110  ZeImageFormatTypeSize = 8;
4111  break;
4113  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
4114  ZeImageFormatTypeSize = 16;
4115  break;
4117  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
4118  ZeImageFormatTypeSize = 8;
4119  break;
4121  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
4122  ZeImageFormatTypeSize = 32;
4123  break;
4125  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
4126  ZeImageFormatTypeSize = 16;
4127  break;
4129  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
4130  ZeImageFormatTypeSize = 8;
4131  break;
4133  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
4134  ZeImageFormatTypeSize = 16;
4135  break;
4137  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
4138  ZeImageFormatTypeSize = 8;
4139  break;
4140  default:
4141  zePrint("piMemImageCreate: unsupported image data type: data type = %d\n",
4142  ImageFormat->image_channel_data_type);
4143  return PI_ERROR_INVALID_VALUE;
4144  }
4145 
4146  // TODO: populate the layout mapping
4147  ze_image_format_layout_t ZeImageFormatLayout;
4148  switch (ImageFormat->image_channel_order) {
4150  switch (ZeImageFormatTypeSize) {
4151  case 8:
4152  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8;
4153  break;
4154  case 16:
4155  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16;
4156  break;
4157  case 32:
4158  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
4159  break;
4160  default:
4161  zePrint("piMemImageCreate: unexpected data type Size\n");
4162  return PI_ERROR_INVALID_VALUE;
4163  }
4164  break;
4165  default:
4166  zePrint("format layout = %d\n", ImageFormat->image_channel_order);
4167  die("piMemImageCreate: unsupported image format layout\n");
4168  break;
4169  }
4170 
4171  ze_image_format_t ZeFormatDesc = {
4172  ZeImageFormatLayout, ZeImageFormatType,
4173  // TODO: are swizzles deduced from image_format->image_channel_order?
4174  ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G,
4175  ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A};
4176 
4177  ze_image_type_t ZeImageType;
4178  switch (ImageDesc->image_type) {
4179  case PI_MEM_TYPE_IMAGE1D:
4180  ZeImageType = ZE_IMAGE_TYPE_1D;
4181  break;
4182  case PI_MEM_TYPE_IMAGE2D:
4183  ZeImageType = ZE_IMAGE_TYPE_2D;
4184  break;
4185  case PI_MEM_TYPE_IMAGE3D:
4186  ZeImageType = ZE_IMAGE_TYPE_3D;
4187  break;
4189  ZeImageType = ZE_IMAGE_TYPE_1DARRAY;
4190  break;
4192  ZeImageType = ZE_IMAGE_TYPE_2DARRAY;
4193  break;
4194  default:
4195  zePrint("piMemImageCreate: unsupported image type\n");
4196  return PI_ERROR_INVALID_VALUE;
4197  }
4198 
4199  ZeStruct<ze_image_desc_t> ZeImageDesc;
4200  ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
4201  ZeImageDesc.type = ZeImageType;
4202  ZeImageDesc.format = ZeFormatDesc;
4203  ZeImageDesc.width = pi_cast<uint32_t>(ImageDesc->image_width);
4204  ZeImageDesc.height = pi_cast<uint32_t>(ImageDesc->image_height);
4205  ZeImageDesc.depth = pi_cast<uint32_t>(ImageDesc->image_depth);
4206  ZeImageDesc.arraylevels = pi_cast<uint32_t>(ImageDesc->image_array_size);
4207  ZeImageDesc.miplevels = ImageDesc->num_mip_levels;
4208 
4209  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
4210 
4211  // Currently we have the "0" device in a context with multiple root devices
4212  // own the image.
4213  // TODO: Implement explicit copying for accessing the image from other
4214  // devices in the context.
4216  : Context->Devices[0];
4217  ze_image_handle_t ZeHImage;
4218  ZE_CALL(zeImageCreate,
4219  (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeHImage));
4220 
4221  try {
4222  auto ZePIImage = new _pi_image(Context, ZeHImage);
4223 
4224 #ifndef NDEBUG
4225  ZePIImage->ZeImageDesc = ZeImageDesc;
4226 #endif // !NDEBUG
4227 
4228  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
4229  (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
4230  // Initialize image synchronously with immediate offload.
4231  // zeCommandListAppendImageCopyFromMemory must not be called from
4232  // simultaneous threads with the same command list handle, so we need
4233  // an exclusive lock.
4234  std::scoped_lock<pi_mutex> Lock(Context->ImmediateCommandListMutex);
4235  ZE_CALL(zeCommandListAppendImageCopyFromMemory,
4236  (Context->ZeCommandListInit, ZeHImage, HostPtr, nullptr, nullptr,
4237  0, nullptr));
4238  }
4239 
4240  *RetImage = ZePIImage;
4241  } catch (const std::bad_alloc &) {
4242  return PI_ERROR_OUT_OF_HOST_MEMORY;
4243  } catch (...) {
4244  return PI_ERROR_UNKNOWN;
4245  }
4246  return PI_SUCCESS;
4247 }
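// [Editorial sketch, not part of the plugin source] Describing a 2D RGBA float
// image for the creation path above; it maps to ZE_IMAGE_FORMAT_TYPE_FLOAT
// with the 32_32_32_32 layout. Field and enum names are assumed from pi.h.
//
//   pi_image_format Format = {PI_IMAGE_CHANNEL_ORDER_RGBA,
//                             PI_IMAGE_CHANNEL_TYPE_FLOAT};
//   pi_image_desc Desc = {};
//   Desc.image_type = PI_MEM_TYPE_IMAGE2D;
//   Desc.image_width = 1024;
//   Desc.image_height = 768;
//   pi_mem Image = nullptr;
//   PI_CALL(piMemImageCreate(Ctx, PI_MEM_FLAGS_ACCESS_RW, &Format, &Desc,
//                            /*HostPtr=*/nullptr, &Image));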
4248 
4250  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
4251  std::shared_lock<pi_shared_mutex> Guard(Mem->Mutex);
4252  char *ZeHandle;
4253  PI_CALL(Mem->getZeHandle(ZeHandle, _pi_mem::read_write));
4254  *NativeHandle = pi_cast<pi_native_handle>(ZeHandle);
4255  return PI_SUCCESS;
4256 }
4257 
4260  bool ownNativeHandle, pi_mem *Mem) {
4261  PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE);
4262  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4263  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4264 
4265  std::shared_lock<pi_shared_mutex> Lock(Context->Mutex);
4266 
4267  // Get base of the allocation
4268  void *Base;
4269  size_t Size;
4270  void *Ptr = pi_cast<void *>(NativeHandle);
4271  ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size));
4272  PI_ASSERT(Ptr == Base, PI_ERROR_INVALID_VALUE);
4273 
4275  ze_device_handle_t ZeDevice = nullptr;
4276  ZE_CALL(zeMemGetAllocProperties,
4277  (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice));
4278 
4279  // Check type of the allocation
4280  switch (ZeMemProps.type) {
4281  case ZE_MEMORY_TYPE_HOST:
4282  case ZE_MEMORY_TYPE_SHARED:
4283  case ZE_MEMORY_TYPE_DEVICE:
4284  break;
4285  case ZE_MEMORY_TYPE_UNKNOWN:
4286  // Memory allocation is unrelated to the context
4287  return PI_ERROR_INVALID_CONTEXT;
4288  default:
4289  die("Unexpected memory type");
4290  }
4291 
4292  pi_device Device = nullptr;
4293  if (ZeDevice) {
4295  PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_CONTEXT);
4296  }
4297 
4298  try {
4299  *Mem = new _pi_buffer(Context, Size, Device, pi_cast<char *>(NativeHandle),
4300  ownNativeHandle);
4301 
4302  pi_platform Plt = Context->getPlatform();
4303  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
4304  std::defer_lock);
4305  if (IndirectAccessTrackingEnabled) {
4306  // We need to keep track of all memory allocations in the context
4307  ContextsLock.lock();
4308  // Retain context to be sure that it is released after all memory
4309  // allocations in this context are released.
4311 
4312  Context->MemAllocs.emplace(
4313  std::piecewise_construct, std::forward_as_tuple(Ptr),
4314  std::forward_as_tuple(Context, ownNativeHandle));
4315  }
4316  } catch (const std::bad_alloc &) {
4317  return PI_ERROR_OUT_OF_HOST_MEMORY;
4318  } catch (...) {
4319  return PI_ERROR_UNKNOWN;
4320  }
4321 
4322  // Initialize the buffer as necessary
4323  auto Buffer = pi_cast<pi_buffer>(*Mem);
4324  if (Device) {
4325  // If this allocation is on a device, then we re-use it for the buffer.
4326  // Nothing to do.
4327  } else if (Buffer->OnHost) {
4328  // If this is a host allocation and the buffer always stays on the host,
4329  // there is nothing more to do.
4330  } else {
4331  // In all other cases (shared allocation, or host allocation that cannot
4332  // represent the buffer in this context) copy the data to a newly
4333  // created device allocation.
4334  char *ZeHandleDst;
4335  PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only));
4336 
4337  // zeCommandListAppendMemoryCopy must not be called from simultaneous
4338  // threads with the same command list handle, so we need an exclusive lock.
4339  std::scoped_lock<pi_mutex> Lock(Context->ImmediateCommandListMutex);
4340  ZE_CALL(zeCommandListAppendMemoryCopy,
4341  (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0,
4342  nullptr));
4343  }
4344 
4345  return PI_SUCCESS;
4346 }
4347 
4349  size_t Length, pi_program *Program) {
4350 
4351  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4352  PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE);
4353  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4354 
4355  // NOTE: Level Zero module creation also builds the program, so we
4356  // defer it until the program is ready to be built.
4357 
4358  try {
4359  *Program = new _pi_program(_pi_program::IL, Context, ILBytes, Length);
4360  } catch (const std::bad_alloc &) {
4361  return PI_ERROR_OUT_OF_HOST_MEMORY;
4362  } catch (...) {
4363  return PI_ERROR_UNKNOWN;
4364  }
4365  return PI_SUCCESS;
4366 }
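// [Editorial sketch, not part of the plugin source] Creating a program from
// SPIR-V and then building it; as noted above, the Level Zero module creation
// is deferred until the build step. piProgramBuild's signature is assumed
// from pi.h.
//
//   pi_program Prog = nullptr;
//   PI_CALL(piProgramCreate(Ctx, SpirvBlob, SpirvSize, &Prog));
//   PI_CALL(piProgramBuild(Prog, 1, &Device, /*Options=*/"",
//                          /*PFnNotify=*/nullptr, /*UserData=*/nullptr));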
4367 
4369  pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList,
4370  const size_t *Lengths, const unsigned char **Binaries,
4371  size_t NumMetadataEntries, const pi_device_binary_property *Metadata,
4372  pi_int32 *BinaryStatus, pi_program *Program) {
4373  (void)Metadata;
4374  (void)NumMetadataEntries;
4375 
4376  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4377  PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE);
4378  PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE);
4379  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4380 
4381  // For now we support only one device.
4382  if (NumDevices != 1) {
4383  zePrint("piProgramCreateWithBinary: level_zero supports only one device.");
4384  return PI_ERROR_INVALID_VALUE;
4385  }
4386  if (!Binaries[0] || !Lengths[0]) {
4387  if (BinaryStatus)
4388  *BinaryStatus = PI_ERROR_INVALID_VALUE;
4389  return PI_ERROR_INVALID_VALUE;
4390  }
4391 
4392  size_t Length = Lengths[0];
4393  auto Binary = Binaries[0];
4394 
4395  // In OpenCL, clCreateProgramWithBinary() can be used to load any of the
4396  // following: "program executable", "compiled program", or "library of
4397  // compiled programs". In addition, the loaded program can be either
4398  // IL (SPIR-V) or native device code. For now, we assume that
4399  // piProgramCreateWithBinary() is only used to load a "program executable"
4400  // as native device code.
4401  // If we wanted to support all the same cases as OpenCL, we would need to
4402  // somehow examine the binary image to distinguish the cases. Alternatively,
4403  // we could change the PI interface and have the caller pass additional
4404  // information to distinguish the cases.
4405 
4406  try {
4407  *Program = new _pi_program(_pi_program::Native, Context, Binary, Length);
4408  } catch (const std::bad_alloc &) {
4409  return PI_ERROR_OUT_OF_HOST_MEMORY;
4410  } catch (...) {
4411  return PI_ERROR_UNKNOWN;
4412  }
4413 
4414  if (BinaryStatus)
4415  *BinaryStatus = PI_SUCCESS;
4416  return PI_SUCCESS;
4417 }
4418 
4420  const char **Strings,
4421  const size_t *Lengths,
4422  pi_program *RetProgram) {
4423 
4424  (void)Context;
4425  (void)Count;
4426  (void)Strings;
4427  (void)Lengths;
4428  (void)RetProgram;
4429  zePrint("piclProgramCreateWithSource: not supported in Level Zero\n");
4430  return PI_ERROR_INVALID_OPERATION;
4431 }
4432 
4434  size_t ParamValueSize, void *ParamValue,
4435  size_t *ParamValueSizeRet) {
4436 
4437  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4438 
4439  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4440  switch (ParamName) {
4442  return ReturnValue(pi_uint32{Program->RefCount.load()});
4444  // TODO: return the true number of devices this program exists for.
4445  return ReturnValue(pi_uint32{1});
4447  // TODO: return all devices this program exists for.
4448  return ReturnValue(Program->Context->Devices[0]);
4450  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4451  size_t SzBinary;
4452  if (Program->State == _pi_program::IL ||
4453  Program->State == _pi_program::Native ||
4454  Program->State == _pi_program::Object) {
4455  SzBinary = Program->CodeLength;
4456  } else if (Program->State == _pi_program::Exe) {
4457  ZE_CALL(zeModuleGetNativeBinary, (Program->ZeModule, &SzBinary, nullptr));
4458  } else {
4459  return PI_ERROR_INVALID_PROGRAM;
4460  }
4461  // This is an array of 1 element, initialized as if it were scalar.
4462  return ReturnValue(size_t{SzBinary});
4463  }
4464  case PI_PROGRAM_INFO_BINARIES: {
4465  // The caller sets "ParamValue" to an array of pointers, one for each
4466  // device. Since Level Zero supports only one device, there is only one
4467  // pointer. If the pointer is NULL, we don't do anything. Otherwise, we
4468  // copy the program's binary image to the buffer at that pointer.
4469  uint8_t **PBinary = pi_cast<uint8_t **>(ParamValue);
4470  if (!PBinary[0])
4471  break;
4472 
4473  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4474  if (Program->State == _pi_program::IL ||
4475  Program->State == _pi_program::Native ||
4476  Program->State == _pi_program::Object) {
4477  std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength);
4478  } else if (Program->State == _pi_program::Exe) {
4479  size_t SzBinary = 0;
4480  ZE_CALL(zeModuleGetNativeBinary,
4481  (Program->ZeModule, &SzBinary, PBinary[0]));
4482  } else {
4483  return PI_ERROR_INVALID_PROGRAM;
4484  }
4485  break;
4486  }
4488  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4489  uint32_t NumKernels;
4490  if (Program->State == _pi_program::IL ||
4491  Program->State == _pi_program::Native ||
4492  Program->State == _pi_program::Object) {
4493  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4494  } else if (Program->State == _pi_program::Exe) {
4495  NumKernels = 0;
4496  ZE_CALL(zeModuleGetKernelNames,
4497  (Program->ZeModule, &NumKernels, nullptr));
4498  } else {
4499  return PI_ERROR_INVALID_PROGRAM;
4500  }
4501  return ReturnValue(size_t{NumKernels});
4502  }
4504  try {
4505  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4506  std::string PINames{""};
4507  if (Program->State == _pi_program::IL ||
4508  Program->State == _pi_program::Native ||
4509  Program->State == _pi_program::Object) {
4510  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4511  } else if (Program->State == _pi_program::Exe) {
4512  uint32_t Count = 0;
4513  ZE_CALL(zeModuleGetKernelNames, (Program->ZeModule, &Count, nullptr));
4514  std::unique_ptr<const char *[]> PNames(new const char *[Count]);
4515  ZE_CALL(zeModuleGetKernelNames,
4516  (Program->ZeModule, &Count, PNames.get()));
4517  for (uint32_t I = 0; I < Count; ++I) {
4518  PINames += (I > 0 ? ";" : "");
4519  PINames += PNames[I];
4520  }
4521  } else {
4522  return PI_ERROR_INVALID_PROGRAM;
4523  }
4524  return ReturnValue(PINames.c_str());
4525  } catch (const std::bad_alloc &) {
4526  return PI_ERROR_OUT_OF_HOST_MEMORY;
4527  } catch (...) {
4528  return PI_ERROR_UNKNOWN;
4529  }
4530  default:
4531  die("piProgramGetInfo: not implemented");
4532  }
4533 
4534  return PI_SUCCESS;
4535 }
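// [Editorial sketch, not part of the plugin source] The usual size-then-data
// pattern for string queries such as the kernel-name list assembled above.
//
//   size_t Size = 0;
//   PI_CALL(piProgramGetInfo(Prog, PI_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr,
//                            &Size));
//   std::string Names(Size, '\0');
//   PI_CALL(piProgramGetInfo(Prog, PI_PROGRAM_INFO_KERNEL_NAMES, Size,
//                            Names.data(), nullptr));
//   // Names now holds a ';'-separated list, e.g. "foo;bar".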
4536 
4538  const pi_device *DeviceList, const char *Options,
4539  pi_uint32 NumInputPrograms,
4540  const pi_program *InputPrograms,
4541  void (*PFnNotify)(pi_program Program, void *UserData),
4542  void *UserData, pi_program *RetProgram) {
4543  // We only support one device with Level Zero currently.
4544  if (NumDevices != 1) {
4545  zePrint("piProgramLink: level_zero supports only one device.");
4546  return PI_ERROR_INVALID_VALUE;
4547  }
4548 
4549  // We do not support any link flags at this time because the Level Zero API
4550  // does not have any way to pass flags that are specific to linking.
4551  if (Options && *Options != '\0') {
4552  std::string ErrorMessage(
4553  "Level Zero does not support kernel link flags: \"");
4554  ErrorMessage.append(Options);
4555  ErrorMessage.push_back('\"');
4556  pi_program Program =
4558  *RetProgram = Program;
4559  return PI_ERROR_LINK_PROGRAM_FAILURE;
4560  }
4561 
4562  // Validate input parameters.
4563  PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE);
4564  PI_ASSERT(Context->isValidDevice(DeviceList[0]), PI_ERROR_INVALID_DEVICE);
4565  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
4566  if (NumInputPrograms == 0 || InputPrograms == nullptr)
4567  return PI_ERROR_INVALID_VALUE;
4568 
4569  pi_result PiResult = PI_SUCCESS;
4570  try {
4571  // Acquire a "shared" lock on each of the input programs, and also validate
4572  // that they are all in Object state.
4573  //
4574  // There is no danger of deadlock here even if two threads call
4575  // piProgramLink simultaneously with the same input programs in a different
4576  // order. If we were acquiring these with "exclusive" access, this could
4577  // lead to a classic lock ordering deadlock. However, there is no such
4578  // deadlock potential with "shared" access. There could also be a deadlock
4579  // potential if there was some other code that holds more than one of these
4580  // locks simultaneously with "exclusive" access. However, there is no such
4581  // code like that, so this is also not a danger.
4582  std::vector<std::shared_lock<pi_shared_mutex>> Guards(NumInputPrograms);
4583  for (pi_uint32 I = 0; I < NumInputPrograms; I++) {
4584  std::shared_lock<pi_shared_mutex> Guard(InputPrograms[I]->Mutex);
4585  Guards[I].swap(Guard);
4586  if (InputPrograms[I]->State != _pi_program::Object) {
4587  return PI_ERROR_INVALID_OPERATION;
4588  }
4589  }
4590 
4591  // Previous calls to piProgramCompile did not actually compile the SPIR-V.
4592  // Instead, we postpone compilation until this point, when all the modules
4593  // are linked together. By doing compilation and linking together, the JIT
4594  // compiler is able to see all modules and do cross-module optimizations.
4595  //
4596  // Construct a ze_module_program_exp_desc_t which contains information about
4597  // all of the modules that will be linked together.
4599  std::vector<size_t> CodeSizes(NumInputPrograms);
4600  std::vector<const uint8_t *> CodeBufs(NumInputPrograms);
4601  std::vector<const char *> BuildFlagPtrs(NumInputPrograms);
4602  std::vector<const ze_module_constants_t *> SpecConstPtrs(NumInputPrograms);
4603  std::vector<_pi_program::SpecConstantShim> SpecConstShims;
4604  SpecConstShims.reserve(NumInputPrograms);
4605 
4606  for (pi_uint32 I = 0; I < NumInputPrograms; I++) {
4607  pi_program Program = InputPrograms[I];
4608  CodeSizes[I] = Program->CodeLength;
4609  CodeBufs[I] = Program->Code.get();
4610  BuildFlagPtrs[I] = Program->BuildFlags.c_str();
4611  SpecConstShims.emplace_back(Program);
4612  SpecConstPtrs[I] = SpecConstShims[I].ze();
4613  }
4614 
4615  ZeExtModuleDesc.count = NumInputPrograms;
4616  ZeExtModuleDesc.inputSizes = CodeSizes.data();
4617  ZeExtModuleDesc.pInputModules = CodeBufs.data();
4618  ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data();
4619  ZeExtModuleDesc.pConstants = SpecConstPtrs.data();
4620 
4621  ZeStruct<ze_module_desc_t> ZeModuleDesc;
4622  ZeModuleDesc.pNext = &ZeExtModuleDesc;
4623  ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
4624 
4625  // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1",
4626  // the driver does validation of the API calls, and it expects
4627  // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This
4628  // validation is wrong when using the "ze_module_program_exp_desc_t"
4629  // extension because those fields are supposed to be ignored. As a
4630  // workaround, set both fields to 1.
4631  //
4632  // TODO: Remove this workaround when the driver is fixed.
4633  ZeModuleDesc.pInputModule = reinterpret_cast<const uint8_t *>(1);
4634  ZeModuleDesc.inputSize = 1;
4635 
4636  // We need a Level Zero extension to compile multiple programs together into
4637  // a single Level Zero module. However, we don't need that extension if
4638  // there happens to be only one input program.
4639  //
4640  // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the
4641  // Level Zero driver. The driver's "ze_module_program_exp_desc_t"
4642  // extension should work even in the case when there is just one input
4643  // module. However, there is currently a bug in the driver that leads to a
4644  // crash. As a workaround, do not use the extension when there is one
4645  // input module.
4646  //
4647  // TODO: Remove this workaround when the driver is fixed.
4648  if (!PiDriverModuleProgramExtensionFound || (NumInputPrograms == 1)) {
4649  if (NumInputPrograms == 1) {
4650  ZeModuleDesc.pNext = nullptr;
4651  ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0];
4652  ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0];
4653  ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0];
4654  ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0];
4655  } else {
4656  zePrint("piProgramLink: level_zero driver does not have static linking "
4657  "support.");
4658  return PI_ERROR_INVALID_VALUE;
4659  }
4660  }
4661 
4662  // Call the Level Zero API to compile, link, and create the module.
4663  ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
4664  ze_context_handle_t ZeContext = Context->ZeContext;
4665  ze_module_handle_t ZeModule = nullptr;
4666  ze_module_build_log_handle_t ZeBuildLog = nullptr;
4667  ze_result_t ZeResult =
4668  ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc,
4669  &ZeModule, &ZeBuildLog));
4670 
4671  // We still create a _pi_program object even if there is a BUILD_FAILURE
4672  // because we need the object to hold the ZeBuildLog. There is no build
4673  // log created for other errors, so we don't create an object.
4674  PiResult = mapError(ZeResult);
4675  if (ZeResult != ZE_RESULT_SUCCESS &&
4676  ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) {
4677  return PiResult;
4678  }
4679 
4680  // The call to zeModuleCreate does not report an error if there are
4681  // unresolved symbols because it thinks these could be resolved later via a
4682  // call to zeModuleDynamicLink. However, modules created with piProgramLink
4683  // are supposed to be fully linked and ready to use. Therefore, do an extra
4684  // check now for unresolved symbols. Note that we still create a
4685  // _pi_program if there are unresolved symbols because the ZeBuildLog tells
4686  // which symbols are unresolved.
4687  if (ZeResult == ZE_RESULT_SUCCESS) {
4688  ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog);
4689  if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) {
4690  PiResult = PI_ERROR_LINK_PROGRAM_FAILURE;
4691  } else if (ZeResult != ZE_RESULT_SUCCESS) {
4692  return mapError(ZeResult);
4693  }
4694  }
4695 
4696  _pi_program::state State =
4697  (PiResult == PI_SUCCESS) ? _pi_program::Exe : _pi_program::Invalid;
4698  *RetProgram = new _pi_program(State, Context, ZeModule, ZeBuildLog);
4699  } catch (const std::bad_alloc &) {
4700  return PI_ERROR_OUT_OF_HOST_MEMORY;
4701  } catch (...) {
4702  return PI_ERROR_UNKNOWN;
4703  }
4704  return PiResult;
4705 }
4706 
4707 pi_result piProgramCompile(
4708  pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList,
4709  const char *Options, pi_uint32 NumInputHeaders,
4710  const pi_program *InputHeaders, const char **HeaderIncludeNames,
4711  void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) {
4712  (void)NumInputHeaders;
4713  (void)InputHeaders;
4714  (void)HeaderIncludeNames;
4715 
4716  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4717 
4718  if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
4719  return PI_ERROR_INVALID_VALUE;
4720 
4721  // These aren't supported.
4722  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
4723 
4724  std::scoped_lock<pi_shared_mutex> Guard(Program->Mutex);
4725 
4726  // It's only valid to compile a program created from IL (we don't support
4727  // programs created from source code).
4728  //
4729  // The OpenCL spec says that the header parameters are ignored when compiling
4730  // IL programs, so we don't validate them.
4731  if (Program->State != _pi_program::IL)
4732  return PI_ERROR_INVALID_OPERATION;
4733 
4734  // We don't compile anything now. Instead, we delay compilation until
4735  // piProgramLink, where we do both compilation and linking as a single step.
4736  // This produces better code because the driver can do cross-module
4737  // optimizations. Therefore, we just remember the compilation flags, so we
4738  // can use them later.
4739  if (Options)
4740  Program->BuildFlags = Options;
4741  Program->State = _pi_program::Object;
4742 
4743  return PI_SUCCESS;
4744 }
4745 
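// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the original source):
// how a caller might drive the deferred-compilation scheme described above,
// where piProgramCompile only records flags and piProgramLink performs the
// actual zeModuleCreate over all modules at once. "Ctx", "Dev", "ProgA" and
// "ProgB" are hypothetical handles; error checking is elided for brevity.
// ---------------------------------------------------------------------------
#if 0
static pi_result compileAndLinkTwoModules(pi_context Ctx, pi_device Dev,
                                          pi_program ProgA, pi_program ProgB,
                                          pi_program *Linked) {
  // Moves each program from IL to Object state; no JIT work happens yet.
  PI_CALL(piProgramCompile(ProgA, 1, &Dev, "", 0, nullptr, nullptr, nullptr,
                           nullptr));
  PI_CALL(piProgramCompile(ProgB, 1, &Dev, "", 0, nullptr, nullptr, nullptr,
                           nullptr));
  // Compiles and links both modules in one step, so the JIT can optimize
  // across module boundaries.
  pi_program Inputs[] = {ProgA, ProgB};
  return piProgramLink(Ctx, 1, &Dev, "", 2, Inputs, nullptr, nullptr, Linked);
}
#endif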
4746 pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices,
4747  const pi_device *DeviceList, const char *Options,
4748  void (*PFnNotify)(pi_program Program, void *UserData),
4749  void *UserData) {
4750 
4751  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4752  if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
4753  return PI_ERROR_INVALID_VALUE;
4754 
4755  // We currently support building for only one device with Level Zero.
4756  // TODO: we should eventually build to the possibly multiple root
4757  // devices in the context.
4758  if (NumDevices != 1) {
4759  zePrint("piProgramBuild: level_zero supports only one device.");
4760  return PI_ERROR_INVALID_VALUE;
4761  }
4762 
4763  // These aren't supported.
4764  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
4765 
4766  std::scoped_lock<pi_shared_mutex> Guard(Program->Mutex);
4767  // Check if device belongs to associated context.
4768  PI_ASSERT(Program->Context, PI_ERROR_INVALID_PROGRAM);
4769  PI_ASSERT(Program->Context->isValidDevice(DeviceList[0]),
4770  PI_ERROR_INVALID_VALUE);
4771 
4772  // It is legal to build a program created from either IL or from native
4773  // device code.
4774  if (Program->State != _pi_program::IL &&
4775  Program->State != _pi_program::Native)
4776  return PI_ERROR_INVALID_OPERATION;
4777 
4778  // We should have either IL or native device code.
4779  PI_ASSERT(Program->Code, PI_ERROR_INVALID_PROGRAM);
4780 
4781  // Ask Level Zero to build and load the native code onto the device.
4782  ZeStruct<ze_module_desc_t> ZeModuleDesc;
4783  _pi_program::SpecConstantShim Shim(Program);
4784  ZeModuleDesc.format = (Program->State == _pi_program::IL)
4785  ? ZE_MODULE_FORMAT_IL_SPIRV
4786  : ZE_MODULE_FORMAT_NATIVE;
4787  ZeModuleDesc.inputSize = Program->CodeLength;
4788  ZeModuleDesc.pInputModule = Program->Code.get();
4789  ZeModuleDesc.pBuildFlags = Options;
4790  ZeModuleDesc.pConstants = Shim.ze();
4791 
4792  ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
4793  ze_context_handle_t ZeContext = Program->Context->ZeContext;
4794  ze_module_handle_t ZeModule = nullptr;
4795 
4796  pi_result Result = PI_SUCCESS;
4797  Program->State = _pi_program::Exe;
4798  ze_result_t ZeResult =
4799  ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc,
4800  &ZeModule, &Program->ZeBuildLog));
4801  if (ZeResult != ZE_RESULT_SUCCESS) {
4802  // We adjust pi_program below to avoid attempting to release zeModule when
4803  // RT calls piProgramRelease().
4804  ZeModule = nullptr;
4805  Program->State = _pi_program::Invalid;
4806  Result = mapError(ZeResult);
4807  } else {
4808  // The call to zeModuleCreate does not report an error if there are
4809  // unresolved symbols because it thinks these could be resolved later via a
4810  // call to zeModuleDynamicLink. However, modules created with
4811  // piProgramBuild are supposed to be fully linked and ready to use.
4812  // Therefore, do an extra check now for unresolved symbols.
4813  ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog);
4814  if (ZeResult != ZE_RESULT_SUCCESS) {
4815  Program->State = _pi_program::Invalid;
4816  Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE)
4817  ? PI_ERROR_BUILD_PROGRAM_FAILURE
4818  : mapError(ZeResult);
4819  }
4820  }
4821 
4822  // We no longer need the IL / native code.
4823  Program->Code.reset();
4824  Program->ZeModule = ZeModule;
4825  return Result;
4826 }
4827 
4828 pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device,
4829  pi_program_build_info ParamName,
4830  size_t ParamValueSize, void *ParamValue,
4831  size_t *ParamValueSizeRet) {
4832  (void)Device;
4833 
4834  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4835  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4836  if (ParamName == PI_PROGRAM_BUILD_INFO_BINARY_TYPE) {
4837  pi_program_binary_type Type = PI_PROGRAM_BINARY_TYPE_NONE;
4838  if (Program->State == _pi_program::Object) {
4839  Type = PI_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
4840  } else if (Program->State == _pi_program::Exe) {
4841  Type = PI_PROGRAM_BINARY_TYPE_EXECUTABLE;
4842  }
4843  return ReturnValue(pi_program_binary_type{Type});
4844  }
4845  if (ParamName == PI_PROGRAM_BUILD_INFO_OPTIONS) {
4846  // TODO: how to get module build options out of Level Zero?
4847  // For the programs that we compiled we can remember the options
4848  // passed with piProgramCompile/piProgramBuild, but what can we
4849  // return for programs that were built outside and registered
4850  // with piProgramRegister?
4851  return ReturnValue("");
4852  } else if (ParamName == PI_PROGRAM_BUILD_INFO_LOG) {
4853  // Check first to see if the plugin code recorded an error message.
4854  if (!Program->ErrorMessage.empty()) {
4855  return ReturnValue(Program->ErrorMessage.c_str());
4856  }
4857 
4858  // Next check if there is a Level Zero build log.
4859  if (Program->ZeBuildLog) {
4860  size_t LogSize = ParamValueSize;
4861  ZE_CALL(zeModuleBuildLogGetString,
4862  (Program->ZeBuildLog, &LogSize, pi_cast<char *>(ParamValue)));
4863  if (ParamValueSizeRet) {
4864  *ParamValueSizeRet = LogSize;
4865  }
4866  return PI_SUCCESS;
4867  }
4868 
4869  // Otherwise, there is no error. The OpenCL spec says to return an empty
4870  // string if there was no previous attempt to compile, build, or link the
4871  // program.
4872  return ReturnValue("");
4873  } else {
4874  zePrint("piProgramGetBuildInfo: unsupported ParamName\n");
4875  return PI_ERROR_INVALID_VALUE;
4876  }
4877  return PI_SUCCESS;
4878 }
4879 
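// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): the usual two-call pattern for
// retrieving the build log through the query above -- first ask for the
// size, then for the contents. "Prog" and "Dev" are hypothetical handles and
// error checking is elided.
// ---------------------------------------------------------------------------
#if 0
static std::string getBuildLog(pi_program Prog, pi_device Dev) {
  size_t LogSize = 0;
  // First call: ParamValue is null, so only the required size is reported.
  piProgramGetBuildInfo(Prog, Dev, PI_PROGRAM_BUILD_INFO_LOG, 0, nullptr,
                        &LogSize);
  // Second call: fetch the log into a buffer of that size.
  std::string Log(LogSize, '\0');
  piProgramGetBuildInfo(Prog, Dev, PI_PROGRAM_BUILD_INFO_LOG, LogSize,
                        Log.data(), nullptr);
  return Log;
}
#endif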
4880 pi_result piProgramRetain(pi_program Program) {
4881  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4882  Program->RefCount.increment();
4883  return PI_SUCCESS;
4884 }
4885 
4886 pi_result piProgramRelease(pi_program Program) {
4887  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4888 
4889  if (!Program->RefCount.decrementAndTest())
4890  return PI_SUCCESS;
4891 
4892  delete Program;
4893 
4894  return PI_SUCCESS;
4895 }
4896 
4897 pi_result piextProgramGetNativeHandle(pi_program Program,
4898  pi_native_handle *NativeHandle) {
4899  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4900  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4901 
4902  auto ZeModule = pi_cast<ze_module_handle_t *>(NativeHandle);
4903 
4904  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4905  switch (Program->State) {
4906  case _pi_program::Exe: {
4907  *ZeModule = Program->ZeModule;
4908  break;
4909  }
4910 
4911  default:
4912  return PI_ERROR_INVALID_OPERATION;
4913  }
4914 
4915  return PI_SUCCESS;
4916 }
4917 
4918 pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle,
4919  pi_context Context,
4920  bool ownNativeHandle,
4921  pi_program *Program) {
4922  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4923  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4924  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4925 
4926  auto ZeModule = pi_cast<ze_module_handle_t>(NativeHandle);
4927 
4928  // We assume here that programs created from a native handle always
4929  // represent a fully linked executable (state Exe) and not an unlinked
4930  // executable (state Object).
4931 
4932  try {
4933  *Program =
4934  new _pi_program(_pi_program::Exe, Context, ZeModule, ownNativeHandle);
4935  } catch (const std::bad_alloc &) {
4936  return PI_ERROR_OUT_OF_HOST_MEMORY;
4937  } catch (...) {
4938  return PI_ERROR_UNKNOWN;
4939  }
4940  return PI_SUCCESS;
4941 }
4942 
4943 _pi_program::~_pi_program() {
4944  // According to Level Zero Specification, all kernels and build logs
4945  // must be destroyed before the Module can be destroyed. So, be sure
4946  // to destroy the build log before destroying the module.
4947  if (ZeBuildLog) {
4948  ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog));
4949  }
4950 
4951  if (ZeModule && OwnZeModule) {
4952  ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule));
4953  }
4954 }
4955 
4956 // Check to see if a Level Zero module has any unresolved symbols.
4957 //
4958 // @param ZeModule The module handle to check.
4959 // @param ZeBuildLog If there are unresolved symbols, this build log handle is
4960  // modified to receive information about
4961  // which symbols are unresolved.
4962 //
4963 // @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved
4964 // symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other
4965 // value indicates there was an error and we cannot tell if symbols are
4966 // resolved.
4967 static ze_result_t
4968 checkUnresolvedSymbols(ze_module_handle_t ZeModule,
4969  ze_module_build_log_handle_t *ZeBuildLog) {
4970 
4971  // First check to see if the module has any imported symbols. If there are
4972  // no imported symbols, it's not possible to have any unresolved symbols. We
4973  // do this check first because we assume it's faster than the call to
4974  // zeModuleDynamicLink below.
4975  ZeStruct<ze_module_properties_t> ZeModuleProps;
4976  ze_result_t ZeResult =
4977  ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps));
4978  if (ZeResult != ZE_RESULT_SUCCESS)
4979  return ZeResult;
4980 
4981  // If there are imported symbols, attempt to "link" the module with itself.
4982  // As a side effect, this will return the error
4983  // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols.
4984  if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) {
4985  return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog));
4986  }
4987  return ZE_RESULT_SUCCESS;
4988 }
4989 
4990 pi_result piKernelCreate(pi_program Program, const char *KernelName,
4991  pi_kernel *RetKernel) {
4992 
4993  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4994  PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE);
4995  PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE);
4996 
4997  std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4998  if (Program->State != _pi_program::Exe) {
4999  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
5000  }
5001 
5002  ZeStruct<ze_kernel_desc_t> ZeKernelDesc;
5003  ZeKernelDesc.flags = 0;
5004  ZeKernelDesc.pKernelName = KernelName;
5005 
5006  ze_kernel_handle_t ZeKernel;
5007  ZE_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel));
5008 
5009  try {
5010  *RetKernel = new _pi_kernel(ZeKernel, true, Program);
5011  } catch (const std::bad_alloc &) {
5012  return PI_ERROR_OUT_OF_HOST_MEMORY;
5013  } catch (...) {
5014  return PI_ERROR_UNKNOWN;
5015  }
5016 
5017  PI_CALL((*RetKernel)->initialize());
5018  return PI_SUCCESS;
5019 }
5020 
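// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): piKernelCreate above requires
// the program to be in the fully linked Exe state, so a caller builds (or
// compiles and links) first and then extracts kernels by name. "Prog",
// "Dev" and the kernel name are hypothetical; error checking is elided.
// ---------------------------------------------------------------------------
#if 0
pi_kernel Krnl = nullptr;
PI_CALL(piProgramBuild(Prog, 1, &Dev, "", nullptr, nullptr));
PI_CALL(piKernelCreate(Prog, "my_kernel", &Krnl));
#endif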
5021 pi_result _pi_kernel::initialize() {
5022  // Retain the program and context to show it's used by this kernel.
5023  PI_CALL(piProgramRetain(Program));
5024  if (IndirectAccessTrackingEnabled)
5025  // TODO: do piContextRetain without the guard
5026  PI_CALL(piContextRetain(Program->Context));
5027 
5028  // Set up how to obtain kernel properties when needed.
5029  ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) {
5030  ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties));
5031  };
5032 
5033  // Cache kernel name.
5034  ZeKernelName.Compute = [this](std::string &Name) {
5035  size_t Size = 0;
5036  ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr));
5037  char *KernelName = new char[Size];
5038  ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName));
5039  Name = KernelName;
5040  delete[] KernelName;
5041  };
5042 
5043  return PI_SUCCESS;
5044 }
5045 
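// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): the deferred-query pattern used
// in initialize() above, where a Compute callback is installed once and only
// runs on first access. This generic LazyValue is a simplified stand-in for
// the plugin's cache wrapper, not its actual definition; it assumes
// <functional> and <mutex>.
// ---------------------------------------------------------------------------
#if 0
template <typename T> struct LazyValue {
  std::function<void(T &)> Compute; // installed by the owner, e.g. a lambda
  std::once_flag Once;              // guards the single computation
  T Value{};
  T *operator->() {
    std::call_once(Once, [this] { Compute(Value); });
    return &Value;
  }
};
#endif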
5046 pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize,
5047  const void *ArgValue) {
5048 
5049  // OpenCL: "the arg_value pointer can be NULL or point to a NULL value
5050  // in which case a NULL value will be used as the value for the argument
5051  // declared as a pointer to global or constant memory in the kernel"
5052  //
5053  // We don't know the type of the argument, but it seems that the only time
5054  // the SYCL RT would send a pointer to NULL in 'arg_value' is when the
5055  // argument is a NULL pointer. Treat a pointer to NULL in 'arg_value' as NULL.
5056  if (ArgSize == sizeof(void *) && ArgValue &&
5057  *(void **)(const_cast<void *>(ArgValue)) == nullptr) {
5058  ArgValue = nullptr;
5059  }
5060 
5061  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5062 
5063  std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5064  ZE_CALL(zeKernelSetArgumentValue,
5065  (pi_cast<ze_kernel_handle_t>(Kernel->ZeKernel),
5066  pi_cast<uint32_t>(ArgIndex), pi_cast<size_t>(ArgSize),
5067  pi_cast<const void *>(ArgValue)));
5068 
5069  return PI_SUCCESS;
5070 }
5071 
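// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): per the convention handled
// above, a pointer-sized argument whose pointed-to value is NULL sets a NULL
// kernel argument. "Krnl" is a hypothetical kernel handle.
// ---------------------------------------------------------------------------
#if 0
void *NullPtr = nullptr;
// Argument 0 becomes a NULL global/constant pointer inside the kernel.
PI_CALL(piKernelSetArg(Krnl, 0, sizeof(void *), &NullPtr));
#endif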
5072 // Special version of piKernelSetArg to accept pi_mem.
5073 pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex,
5074  const pi_mem *ArgValue) {
5075  // TODO: the better way would probably be to add a new PI API for
5076  // extracting native PI object from PI handle, and have SYCL
5077  // RT pass that directly to the regular piKernelSetArg (and
5078  // then remove this piextKernelSetArgMemObj).
5079 
5080  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5081 
5082  // We don't yet know the device on which this kernel will next be run.
5083  // Thus we can't know the actual memory allocation that needs to be used.
5084  // Remember the memory object being used as an argument for this kernel
5085  // to process it later when the device is known (at the kernel enqueue).
5086  //
5087  // TODO: for now we conservatively assume read-write access.
5088  // Improve that by passing SYCL buffer accessor type into
5089  // piextKernelSetArgMemObj.
5090  //
5091  std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5092  // The ArgValue may be a NULL pointer in which case a NULL value is used for
5093  // the kernel argument declared as a pointer to global or constant memory.
5094  auto Arg = ArgValue ? *ArgValue : nullptr;
5095  Kernel->PendingArguments.push_back(
5096  {ArgIndex, sizeof(void *), Arg, _pi_mem::read_write});
5097 
5098  return PI_SUCCESS;
5099 }
5100 
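// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): the deferred-binding record
// implied above. Memory-object arguments are parked in PendingArguments and
// translated into a device-specific allocation handle only inside
// piEnqueueKernelLaunch, once the target device is known. This struct is a
// simplified stand-in for the plugin's internal type, shown for exposition.
// ---------------------------------------------------------------------------
#if 0
struct PendingArgument {
  pi_uint32 Index;   // kernel argument index
  size_t Size;       // argument size; sizeof(void *) for buffer arguments
  pi_mem Value;      // the memory object, or nullptr for a NULL argument
  decltype(_pi_mem::read_write) AccessMode; // conservatively read-write
};
#endif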
5101 // Special version of piKernelSetArg to accept pi_sampler.
5102 pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex,
5103  const pi_sampler *ArgValue) {
5104  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5105 
5106  std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5107  ZE_CALL(zeKernelSetArgumentValue,
5108  (pi_cast<ze_kernel_handle_t>(Kernel->ZeKernel),
5109  pi_cast<uint32_t>(ArgIndex), sizeof(void *),
5110  &(*ArgValue)->ZeSampler));
5111 
5112  return PI_SUCCESS;
5113 }
5114 
5115 pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName,
5116  size_t ParamValueSize, void *ParamValue,
5117  size_t *ParamValueSizeRet) {
5118  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5119 
5120  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
5121 
5122  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5123  switch (ParamName) {
5124  case PI_KERNEL_INFO_CONTEXT:
5125  return ReturnValue(pi_context{Kernel->Program->Context});
5126  case PI_KERNEL_INFO_PROGRAM:
5127  return ReturnValue(pi_program{Kernel->Program});
5128  case PI_KERNEL_INFO_FUNCTION_NAME:
5129  try {
5130  std::string &KernelName = *Kernel->ZeKernelName.operator->();
5131  return ReturnValue(static_cast<const char *>(KernelName.c_str()));
5132  } catch (const std::bad_alloc &) {
5133  return PI_ERROR_OUT_OF_HOST_MEMORY;
5134  } catch (...) {
5135  return PI_ERROR_UNKNOWN;
5136  }
5137  case PI_KERNEL_INFO_NUM_ARGS:
5138  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->numKernelArgs});
5139  case PI_KERNEL_INFO_REFERENCE_COUNT:
5140  return ReturnValue(pi_uint32{Kernel->RefCount.load()});
5141  case PI_KERNEL_INFO_ATTRIBUTES:
5142  try {
5143  uint32_t Size;
5144  ZE_CALL(zeKernelGetSourceAttributes, (Kernel->ZeKernel, &Size, nullptr));
5145  char *attributes = new char[Size];
5146  ZE_CALL(zeKernelGetSourceAttributes,
5147  (Kernel->ZeKernel, &Size, &attributes));
5148  auto Res = ReturnValue(attributes);
5149  delete[] attributes;
5150  return Res;
5151  } catch (const std::bad_alloc &) {
5152  return PI_ERROR_OUT_OF_HOST_MEMORY;
5153  } catch (...) {
5154  return PI_ERROR_UNKNOWN;
5155  }
5156  default:
5157  zePrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n",
5158  ParamName, ParamName);
5159  return PI_ERROR_INVALID_VALUE;
5160  }
5161 
5162  return PI_SUCCESS;
5163 }
5164 
5165 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
5166  pi_kernel_group_info ParamName,
5167  size_t ParamValueSize, void *ParamValue,
5168  size_t *ParamValueSizeRet) {
5169  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5170  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
5171 
5172  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
5173 
5174  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5175  switch (ParamName) {
5176  case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
5177  // TODO: To revisit after level_zero/issues/262 is resolved
5178  struct {
5179  size_t Arr[3];
5180  } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
5181  Device->ZeDeviceComputeProperties->maxGroupSizeY,
5182  Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
5183  return ReturnValue(WorkSize);
5184  }
5185  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
5186  uint32_t X, Y, Z;
5187  ZE_CALL(zeKernelSuggestGroupSize,
5188  (Kernel->ZeKernel, 10000, 10000, 10000, &X, &Y, &Z));
5189  return ReturnValue(size_t{X * Y * Z});
5190  }
5191  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
5192  struct {
5193  size_t Arr[3];
5194  } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX,
5195  Kernel->ZeKernelProperties->requiredGroupSizeY,
5196  Kernel->ZeKernelProperties->requiredGroupSizeZ}};
5197  return ReturnValue(WgSize);
5198  }
5199  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE:
5200  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->localMemSize});
5201  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
5202  return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth});
5203  }
5204  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE:
5205  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->privateMemSize});
5206  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
5207  die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not "
5208  "implemented\n");
5209  break;
5210  }
5211  default:
5212  zePrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n",
5213  ParamName, ParamName);
5214  return PI_ERROR_INVALID_VALUE;
5215  }
5216  return PI_SUCCESS;
5217 }
5218 
5219 pi_result piKernelGetSubGroupInfo(pi_kernel Kernel, pi_device Device,
5220  pi_kernel_sub_group_info ParamName,
5221  size_t InputValueSize, const void *InputValue,
5222  size_t ParamValueSize, void *ParamValue,
5223  size_t *ParamValueSizeRet) {
5224  (void)Device;
5225  (void)InputValueSize;
5226  (void)InputValue;
5227 
5228  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
5229 
5230  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5231  if (ParamName == PI_KERNEL_MAX_SUB_GROUP_SIZE) {
5232  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize});
5233  } else if (ParamName == PI_KERNEL_MAX_NUM_SUB_GROUPS) {
5234  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups});
5235  } else if (ParamName == PI_KERNEL_COMPILE_NUM_SUB_GROUPS) {
5236  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups});
5237  } else if (ParamName == PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL) {
5238  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize});
5239  } else {
5240  die("piKernelGetSubGroupInfo: parameter not implemented");
5241  return {};
5242  }
5243  return PI_SUCCESS;
5244 }
5245 
5246 pi_result piKernelRetain(pi_kernel Kernel) {
5247 
5248  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5249 
5250  Kernel->RefCount.increment();
5251  return PI_SUCCESS;
5252 }
5253 
5254 pi_result piKernelRelease(pi_kernel Kernel) {
5255  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5256 
5257  if (IndirectAccessTrackingEnabled) {
5258  // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as
5259  // kernel execution has finished. This is the place where we need to release
5260  // memory allocations. If kernel is not in use (not submitted by some
5261  // other thread) then release referenced memory allocations. As a result,
5262  // memory can be deallocated and context can be removed from container in
5263  // the platform. That's why we need to lock a mutex here.
5264  pi_platform Plt = Kernel->Program->Context->getPlatform();
5265  std::scoped_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex);
5266 
5267  if (--Kernel->SubmissionsCount == 0) {
5268  // Kernel is not submitted for execution, release referenced memory
5269  // allocations.
5270  for (auto &MemAlloc : Kernel->MemAllocs) {
5271  USMFreeHelper(MemAlloc->second.Context, MemAlloc->first,
5272  MemAlloc->second.OwnZeMemHandle);
5273  }
5274  Kernel->MemAllocs.clear();
5275  }
5276  }
5277 
5278  if (!Kernel->RefCount.decrementAndTest())
5279  return PI_SUCCESS;
5280 
5281  auto KernelProgram = Kernel->Program;
5282  if (Kernel->OwnZeKernel)
5283  ZE_CALL(zeKernelDestroy, (Kernel->ZeKernel));
5284  if (IndirectAccessTrackingEnabled) {
5285  PI_CALL(piContextRelease(KernelProgram->Context));
5286  }
5287  // do a release on the program this kernel was part of
5288  PI_CALL(piProgramRelease(KernelProgram));
5289  delete Kernel;
5290 
5291  return PI_SUCCESS;
5292 }
5293 
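// ---------------------------------------------------------------------------
// Editorial note on kernel lifetime: piEnqueueKernelLaunch below stores the
// kernel in the event's CommandData and calls piKernelRetain; once the event
// signals, CleanupCompletedEvent calls piKernelRelease (see above), which,
// with indirect access tracking enabled, also frees the memory allocations
// referenced by the kernel when its SubmissionsCount drops to zero.
// ---------------------------------------------------------------------------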
5294 pi_result
5295 piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
5296  const size_t *GlobalWorkOffset,
5297  const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
5298  pi_uint32 NumEventsInWaitList,
5299  const pi_event *EventWaitList, pi_event *OutEvent) {
5300  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5301  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5302  PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);
5303 
5304  // Lock automatically releases when this goes out of scope.
5305  std::scoped_lock<pi_shared_mutex, pi_shared_mutex, pi_shared_mutex> Lock(
5306  Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
5307  if (GlobalWorkOffset != NULL) {
5308  if (!PiDriverGlobalOffsetExtensionFound) {
5309  zePrint("No global offset extension found on this driver\n");
5310  return PI_ERROR_INVALID_VALUE;
5311  }
5312 
5313  ZE_CALL(zeKernelSetGlobalOffsetExp,
5314  (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
5315  GlobalWorkOffset[2]));
5316  }
5317 
5318  // If there are any pending arguments set them now.
5319  for (auto &Arg : Kernel->PendingArguments) {
5320  // The ArgValue may be a NULL pointer in which case a NULL value is used for
5321  // the kernel argument declared as a pointer to global or constant memory.
5322  char **ZeHandlePtr = nullptr;
5323  if (Arg.Value) {
5324  PI_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
5325  Queue->Device));
5326  }
5327  ZE_CALL(zeKernelSetArgumentValue,
5328  (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
5329  }
5330  Kernel->PendingArguments.clear();
5331 
5332  ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
5333  uint32_t WG[3];
5334 
5335  // global_work_size of unused dimensions must be set to 1
5336  PI_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, PI_ERROR_INVALID_VALUE);
5337  PI_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, PI_ERROR_INVALID_VALUE);
5338 
5339  if (LocalWorkSize) {
5340  WG[0] = pi_cast<uint32_t>(LocalWorkSize[0]);
5341  WG[1] = pi_cast<uint32_t>(LocalWorkSize[1]);
5342  WG[2] = pi_cast<uint32_t>(LocalWorkSize[2]);
5343  } else {
5344  ZE_CALL(zeKernelSuggestGroupSize,
5345  (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
5346  GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
5347  }
5348 
5349  // TODO: assert if sizes do not fit into 32-bit?
5350  switch (WorkDim) {
5351  case 3:
5352  ZeThreadGroupDimensions.groupCountX =
5353  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
5354  ZeThreadGroupDimensions.groupCountY =
5355  pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
5356  ZeThreadGroupDimensions.groupCountZ =
5357  pi_cast<uint32_t>(GlobalWorkSize[2] / WG[2]);
5358  break;
5359  case 2:
5360  ZeThreadGroupDimensions.groupCountX =
5361  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
5362  ZeThreadGroupDimensions.groupCountY =
5363  pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
5364  WG[2] = 1;
5365  break;
5366  case 1:
5367  ZeThreadGroupDimensions.groupCountX =
5368  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
5369  WG[1] = WG[2] = 1;
5370  break;
5371 
5372  default:
5373  zePrint("piEnqueueKernelLaunch: unsupported work_dim\n");
5374  return PI_ERROR_INVALID_VALUE;
5375  }
5376 
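  // Worked example: with WorkDim == 2, GlobalWorkSize = {1024, 8, 1} and a
  // group size WG = {256, 8, 1} (user-provided or driver-suggested), the
  // group counts come out as {1024 / 256, 8 / 8} = {4, 1}. A global size of
  // {1000, 8, 1} with the same WG fails the checks below, because three
  // groups of 256 cover only 768 work-items (1000 != 3 * 256).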
5377  // Error handling for non-uniform group size case
5378  if (GlobalWorkSize[0] != (ZeThreadGroupDimensions.groupCountX * WG[0])) {
5379  zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
5380  "multiple of the group size in the 1st dimension\n");
5381  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5382  }
5383  if (GlobalWorkSize[1] != (ZeThreadGroupDimensions.groupCountY * WG[1])) {
5384  zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
5385  "multiple of the group size in the 2nd dimension\n");
5386  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5387  }
5388  if (GlobalWorkSize[2] != (ZeThreadGroupDimensions.groupCountZ * WG[2])) {
5389  zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
5390  "multiple of the group size in the 3rd dimension\n");
5391  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5392  }
5393 
5394  ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
5395 
5396  bool UseCopyEngine = false;
5397  _pi_ze_event_list_t TmpWaitList;
5398  if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
5399  NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
5400  return Res;
5401 
5402  // Get a new command list to be used on this call
5403  pi_command_list_ptr_t CommandList{};
5404  if (auto Res = Queue->Context->getAvailableCommandList(
5405  Queue, CommandList, UseCopyEngine, true /* AllowBatching */))
5406  return Res;
5407 
5408  ze_event_handle_t ZeEvent = nullptr;
5409  pi_event InternalEvent;
5410  bool IsInternal = OutEvent == nullptr;
5411  pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5412  pi_result Res = createEventAndAssociateQueue(
5413  Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal);
5414  if (Res != PI_SUCCESS)
5415  return Res;
5416  ZeEvent = (*Event)->ZeEvent;
5417  (*Event)->WaitList = TmpWaitList;
5418 
5419  // Save the kernel in the event, so that when the event is signalled
5420  // the code can do a piKernelRelease on this kernel.
5421  (*Event)->CommandData = (void *)Kernel;
5422 
5423  // Increment the reference count of the Kernel and indicate that the Kernel is
5424  // in use. Once the event has been signalled, the code in
5425  // CleanupCompletedEvent(Event) will do a piKernelRelease to update the
5426  // reference count on the kernel, using the kernel saved in CommandData.
5427  PI_CALL(piKernelRetain(Kernel));
5428 
5429  // Add to list of kernels to be submitted
5430  if (IndirectAccessTrackingEnabled)
5431  Queue->KernelsToBeSubmitted.push_back(Kernel);
5432 
5433  if (Queue->Device->useImmediateCommandLists() &&
5434  IndirectAccessTrackingEnabled) {
5435  // If using immediate command lists, then gathering of indirect
5436  // references and appending to the queue (which implies submission)
5437  // must be done together.
5438  std::unique_lock<pi_shared_mutex> ContextsLock(
5439  Queue->Device->Platform->ContextsMutex, std::defer_lock);
5440  // We are going to submit kernels for execution. If indirect access flag is
5441  // set for a kernel then we need to make a snapshot of existing memory
5442  // allocations in all contexts in the platform. We need to lock the mutex
5443  // guarding the list of contexts in the platform to prevent creation of new
5444  // memory allocations in any context before we submit the kernel for
5445  // execution.
5446  ContextsLock.lock();
5447  Queue->CaptureIndirectAccesses();
5448  // Add the command to the command list, which implies submission.
5449  ZE_CALL(zeCommandListAppendLaunchKernel,
5450  (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
5451  ZeEvent, (*Event)->WaitList.Length,
5452  (*Event)->WaitList.ZeEventList));
5453  } else {
5454  // Add the command to the command list for later submission.
5455  // No lock is needed here, unlike the immediate command list case above,
5456  // because the kernels are not actually submitted yet. Kernels will be
5457  // submitted only when the command list is closed; at that point a lock is held.
5458  ZE_CALL(zeCommandListAppendLaunchKernel,
5459  (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
5460  ZeEvent, (*Event)->WaitList.Length,
5461  (*Event)->WaitList.ZeEventList));
5462  }
5463 
5464  zePrint("calling zeCommandListAppendLaunchKernel() with"
5465  " ZeEvent %#lx\n",
5466  pi_cast<std::uintptr_t>(ZeEvent));
5467  printZeEventList((*Event)->WaitList);
5468 
5469  // Execute command list asynchronously, as the event will be used
5470  // to track down its completion.
5471  if (auto Res = Queue->executeCommandList(CommandList, false, true))
5472  return Res;
5473 
5474  return PI_SUCCESS;
5475 }
5476 
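// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): a minimal 1-D launch through
// piEnqueueKernelLaunch above. "Q" and "Krnl" are hypothetical handles;
// unused dimensions of the global size must be 1, and each global size must
// be a multiple of the corresponding group size.
// ---------------------------------------------------------------------------
#if 0
size_t Global[3] = {1024, 1, 1};
size_t Local[3] = {256, 1, 1};
pi_event Done = nullptr;
PI_CALL(piEnqueueKernelLaunch(Q, Krnl, /*WorkDim=*/1,
                              /*GlobalWorkOffset=*/nullptr, Global, Local,
                              0, nullptr, &Done));
PI_CALL(piEventsWait(1, &Done)); // block until the kernel completes
#endif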
5477 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
5478  pi_context Context,
5479  pi_program Program,
5480  bool OwnNativeHandle,
5481  pi_kernel *Kernel) {
5482  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
5483  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
5484  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5485  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5486 
5487  auto ZeKernel = pi_cast<ze_kernel_handle_t>(NativeHandle);
5488  *Kernel = new _pi_kernel(ZeKernel, OwnNativeHandle, Program);
5489  PI_CALL((*Kernel)->initialize());
5490  return PI_SUCCESS;
5491 }
5492 
5493 pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
5494  pi_native_handle *NativeHandle) {
5495  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5496  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5497 
5498  std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
5499  auto *ZeKernel = pi_cast<ze_kernel_handle_t *>(NativeHandle);
5500  *ZeKernel = Kernel->ZeKernel;
5501  return PI_SUCCESS;
5502 }
5503 
5504 //
5505 // Events
5506 //
5507 pi_result
5508 _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) {
5509  PI_ASSERT(Queue, PI_ERROR_INVALID_EVENT);
5510 
5511  std::scoped_lock<pi_shared_mutex, pi_shared_mutex> Lock(Queue->Mutex,
5512  this->Mutex);
5513 
5514  if (!HostVisibleEvent) {
5515  if (Queue->Device->eventsScope() != OnDemandHostVisibleProxy)
5516  die("getOrCreateHostVisibleEvent: missing host-visible event");
5517 
5518  // Submit the command(s) signalling the proxy event to the queue.
5519  // We have to first submit a wait for the device-only event for which this
5520  // proxy is created.
5521  //
5522  // Get a new command list to be used on this call
5523 
5524  // We want to batch these commands to avoid extra submissions (costly)
5525  bool OkToBatch = true;
5526 
5527  pi_command_list_ptr_t CommandList{};
5528  if (auto Res = Queue->Context->getAvailableCommandList(
5529  Queue, CommandList, false /* UseCopyEngine */, OkToBatch))
5530  return Res;
5531 
5532  // Create a "proxy" host-visible event.
5533  auto Res = createEventAndAssociateQueue(
5534  Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList,
5535  /* IsInternal */ false, /* ForceHostVisible */ true);
5536  if (Res != PI_SUCCESS)
5537  return Res;
5538 
5539  ZE_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent));
5540  ZE_CALL(zeCommandListAppendSignalEvent,
5541  (CommandList->first, HostVisibleEvent->ZeEvent));
5542 
5543  if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch))
5544  return Res;
5545  }
5546 
5547  ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
5548  return PI_SUCCESS;
5549 }
5550 
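// ---------------------------------------------------------------------------
// Editorial note: the "proxy" pattern above makes a device-only event
// observable from the host without paying for host visibility on every
// event. The batched command list first waits on the device-only ZeEvent
// and then signals a separate host-visible event, so host-side waits and
// status queries target the proxy instead of the original event.
// ---------------------------------------------------------------------------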
5551 pi_result _pi_event::reset() {
5552  Queue = nullptr;
5553  CleanedUp = false;
5554  Completed = false;
5555  CommandData = nullptr;
5556  CommandType = PI_COMMAND_TYPE_USER;
5557  WaitList = {};
5558  RefCountExternal = 0;
5559  RefCount.reset();
5560 
5561  if (!isHostVisible())
5562  HostVisibleEvent = nullptr;
5563 
5564  ZE_CALL(zeEventHostReset, (ZeEvent));
5565  return PI_SUCCESS;
5566 }
5567 
5568 pi_event _pi_context::getEventFromCache(bool HostVisible, bool WithProfiling) {
5569  std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
5570  auto Cache = getEventCache(HostVisible, WithProfiling);
5571  if (Cache->empty())
5572  return nullptr;
5573 
5574  auto It = Cache->begin();
5575  pi_event Event = *It;
5576  Cache->erase(It);
5577  return Event;
5578 }
5579 
5580 void _pi_context::addEventToCache(pi_event Event) {
5581  std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
5582  auto Cache =
5583  getEventCache(Event->isHostVisible(), Event->isProfilingEnabled());
5584  Event->reset();
5585  Cache->emplace_back(Event);
5586 }
5587 
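// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): the cache round-trip implied by
// the two helpers above. A finished event is reset and parked; the next
// request with matching host-visibility and profiling properties reuses it
// instead of carving a new slot out of an event pool. "Ctx" is a
// hypothetical context handle.
// ---------------------------------------------------------------------------
#if 0
pi_event Ev = Ctx->getEventFromCache(/*HostVisible=*/true,
                                     /*WithProfiling=*/false);
if (!Ev) {
  // Cache miss: fall back to regular pool-based creation (see EventCreate).
}
// ... use the event ...
Ctx->addEventToCache(Ev); // reset() and park it for reuse
#endif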
5588 // Helper function for creating a PI event.
5589 // The "Queue" argument specifies the PI queue where a command is submitted.
5590  // The "HostVisible" argument specifies whether the event needs to be
5591  // allocated from a host-visible pool.
5592 //
5593 static pi_result EventCreate(pi_context Context, pi_queue Queue,
5594  bool HostVisible, pi_event *RetEvent) {
5595  bool ProfilingEnabled =
5596  !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0;
5597 
5598  if (auto CachedEvent =
5599  Context->getEventFromCache(HostVisible, ProfilingEnabled)) {
5600  *RetEvent = CachedEvent;
5601  return PI_SUCCESS;
5602  }
5603 
5604  ze_event_handle_t ZeEvent;
5605  ze_event_pool_handle_t ZeEventPool = {};
5606 
5607  size_t Index = 0;
5608 
5609  if (auto Res = Context->getFreeSlotInExistingOrNewPool(
5610  ZeEventPool, Index, HostVisible, ProfilingEnabled))
5611  return Res;
5612 
5613  ZeStruct<ze_event_desc_t> ZeEventDesc;
5614  ZeEventDesc.index = Index;
5615  ZeEventDesc.wait = 0;
5616 
5617  if (HostVisible) {
5618  ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
5619  } else {
5620  //
5621  // Set the scope to "device" for every event. This is sufficient for
5622  // global device access and peer device access. If needed to be seen on
5623  // the host we are doing special handling, see EventsScope options.
5624  //
5625  // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
5626  // used in some circumstances.
5627  //
5628  ZeEventDesc.signal = 0;
5629  }
5630 
5631  ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
5632 
5633  try {
5634  PI_ASSERT(RetEvent, PI_ERROR_INVALID_VALUE);
5635 
5636  *RetEvent = new _pi_event(ZeEvent, ZeEventPool, Context,
5637  PI_COMMAND_TYPE_USER, true);
5638  } catch (const std::bad_alloc &) {
5639  return PI_ERROR_OUT_OF_HOST_MEMORY;
5640  } catch (...) {
5641  return PI_ERROR_UNKNOWN;
5642  }
5643 
5644  if (HostVisible)
5645  (*RetEvent)->HostVisibleEvent = *RetEvent;
5646 
5647  return PI_SUCCESS;
5648 }
5649 
5650 // External PI API entry
5651 pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
5652  pi_result Result = EventCreate(Context, nullptr, true, RetEvent);
5653  (*RetEvent)->RefCountExternal++;
5654  return Result;
5655 }
5656 
5657 pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
5658  size_t ParamValueSize, void *ParamValue,
5659  size_t *ParamValueSizeRet) {
5660 
5661  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5662 
5663  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
5664  switch (ParamName) {
5665  case PI_EVENT_INFO_COMMAND_QUEUE: {
5666  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
5667  return ReturnValue(pi_queue{Event->Queue});
5668  }
5669  case PI_EVENT_INFO_CONTEXT: {
5670  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
5671  return ReturnValue(pi_context{Event->Context});
5672  }
5673  case PI_EVENT_INFO_COMMAND_TYPE: {
5674  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
5675  return ReturnValue(pi_cast<pi_uint64>(Event->CommandType));
5676  }
5677  case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
5678  // Check to see if the event's Queue has an open command list due to
5679  // batching. If so, go ahead and close and submit it, because it is
5680  // possible that this is trying to query some event's status that
5681  // is part of the batch. This isn't strictly required, but it seems
5682  // like a reasonable thing to do.
5683  auto Queue = Event->Queue;
5684  if (Queue) {
5685  // Lock automatically releases when this goes out of scope.
5686  std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5687  const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
5688  if (OpenCommandList != Queue->CommandListMap.end()) {
5689  if (auto Res = Queue->executeOpenCommandList(
5690  OpenCommandList->second.isCopy(Queue)))
5691  return Res;
5692  }
5693  }
5694 
5695  // Level Zero has a much more explicit notion of command submission than
5696  // OpenCL: nothing runs unless a command list is submitted. We've done
5697  // that just above, so the status is at least PI_EVENT_RUNNING.
5698  pi_int32 Result = PI_EVENT_RUNNING;
5699 
5700  // Make sure that we query a host-visible event only.
5701  // If one wasn't created yet, don't create it here either; just
5702  // conservatively report that the event is not yet completed.
5703  std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
5704  auto HostVisibleEvent = Event->HostVisibleEvent;
5705  if (Event->Completed) {
5706  Result = PI_EVENT_COMPLETE;
5707  } else if (HostVisibleEvent) {
5708  ze_result_t ZeResult;
5709  ZeResult =
5710  ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent));
5711  if (ZeResult == ZE_RESULT_SUCCESS) {
5712  Result = PI_EVENT_COMPLETE;
5713  }
5714  }
5715  return ReturnValue(pi_cast<pi_int32>(Result));
5716  }
5717  case PI_EVENT_INFO_REFERENCE_COUNT:
5718  return ReturnValue(pi_uint32{Event->RefCount.load()});
5719  default:
5720  zePrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n",
5721  ParamName, ParamName);
5722  return PI_ERROR_INVALID_VALUE;
5723  }
5724 
5725  return PI_SUCCESS;
5726 }
5727 
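// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition): polling an event's execution
// status through piEventGetInfo above. "Ev" is a hypothetical event handle.
// ---------------------------------------------------------------------------
#if 0
pi_int32 Status = 0;
PI_CALL(piEventGetInfo(Ev, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS,
                       sizeof(Status), &Status, nullptr));
if (Status == PI_EVENT_COMPLETE) {
  // Results produced by the associated command are now safe to consume.
}
#endif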
5728 pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
5729  size_t ParamValueSize, void *ParamValue,
5730  size_t *ParamValueSizeRet) {
5731 
5732