//===-------- pi_level_zero.cpp - Level Zero Plugin --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===------------------------------------------------------------------===//

#include "pi_level_zero.hpp"
#include <algorithm>
#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <utility>

#include <level_zero/zet_api.h>

#include "usm_allocator.hpp"

extern "C" {
// Forward declarations.
static pi_result EventCreate(pi_context Context, pi_queue Queue,
                             bool HostVisible, pi_event *RetEvent);
}

// Defined in tracing.cpp
void enableZeTracing();
void disableZeTracing();

namespace {

// Controls serialization of Level Zero calls to work around the Level Zero
// driver not being multi-threading ready. Recognized values (can be used as
// a bit mask):
enum {
  ZeSerializeNone =
      0, // no locking or blocking (except when SYCL RT requested blocking)
  ZeSerializeLock = 1, // locking around each ZE_CALL
  ZeSerializeBlock =
      2, // blocking ZE calls, where supported (usually in enqueue commands)
};
static const pi_uint32 ZeSerialize = [] {
  const char *SerializeMode = std::getenv("ZE_SERIALIZE");
  const pi_uint32 SerializeModeValue =
      SerializeMode ? std::atoi(SerializeMode) : 0;
  return SerializeModeValue;
}();
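
// [Illustrative sketch, not part of the plugin] How the ZE_SERIALIZE bit mask
// above is interpreted; e.g. ZE_SERIALIZE=3 enables both locking and blocking.
// A hypothetical helper, assuming only the enum values defined above.
[[maybe_unused]] static void describeZeSerialize(pi_uint32 Mask) {
  fprintf(stderr, "ZE_SERIALIZE: lock=%d block=%d\n",
          (Mask & ZeSerializeLock) != 0, (Mask & ZeSerializeBlock) != 0);
}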

// This is an experimental option to test performance of device to device copy
// operations on copy engines (versus the compute engine).
static const bool UseCopyEngineForD2DCopy = [] {
  const char *CopyEngineForD2DCopy =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY");
  return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0));
}();

// This is an experimental option that allows the use of a copy engine, if
// available in the device, in the Level Zero plugin for copy operations
// submitted to an in-order queue. The default is 1.
static const bool UseCopyEngineForInOrderQueue = [] {
  const char *CopyEngineForInOrderQueue =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
  return (!CopyEngineForInOrderQueue ||
          (std::stoi(CopyEngineForInOrderQueue) != 0));
}();

// Enables an experimental feature that uses immediate commandlists
// for kernel launches and copies. The default is standard commandlists.
// Setting a value >= 1 specifies use of immediate commandlists.
// Note: when immediate commandlists are used then device-only events
// must be either AllHostVisible or OnDemandHostVisibleProxy.
// (See env var SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS).
static const bool UseImmediateCommandLists = [] {
  const char *ImmediateFlag =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
  if (!ImmediateFlag)
    return false;
  return std::stoi(ImmediateFlag) > 0;
}();

// This class encapsulates actions taken along with a call to Level Zero API.
class ZeCall {
private:
  // The global mutex that is used for total serialization of Level Zero calls.
  static std::mutex GlobalLock;

public:
  ZeCall() {
    if ((ZeSerialize & ZeSerializeLock) != 0) {
      GlobalLock.lock();
    }
  }
  ~ZeCall() {
    if ((ZeSerialize & ZeSerializeLock) != 0) {
      GlobalLock.unlock();
    }
  }

  // The non-static version just calls the static one.
  ze_result_t doCall(ze_result_t ZeResult, const char *ZeName,
                     const char *ZeArgs, bool TraceError = true);
};
std::mutex ZeCall::GlobalLock;

// Controls PI level tracing prints.
static bool PrintPiTrace = false;

// Controls support of the indirect access kernels and deferred memory release.
static const bool IndirectAccessTrackingEnabled = [] {
  return std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY") !=
         nullptr;
}();

// Map Level Zero runtime error code to PI error code.
static pi_result mapError(ze_result_t ZeResult) {
  // TODO: these mappings need to be clarified and synced with the PI API
  // return values, which are TBD.
  static std::unordered_map<ze_result_t, pi_result> ErrorMapping = {
      {ZE_RESULT_SUCCESS, PI_SUCCESS},
      {ZE_RESULT_ERROR_DEVICE_LOST, PI_ERROR_DEVICE_NOT_FOUND},
      {ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, PI_ERROR_INVALID_OPERATION},
      {ZE_RESULT_ERROR_NOT_AVAILABLE, PI_ERROR_INVALID_OPERATION},
      {ZE_RESULT_ERROR_UNINITIALIZED, PI_ERROR_INVALID_PLATFORM},
      {ZE_RESULT_ERROR_INVALID_ARGUMENT, PI_ERROR_INVALID_ARG_VALUE},
      {ZE_RESULT_ERROR_INVALID_NULL_POINTER, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_INVALID_SIZE, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, PI_ERROR_INVALID_EVENT},
      {ZE_RESULT_ERROR_INVALID_ENUMERATION, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT, PI_ERROR_INVALID_VALUE},
      {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, PI_ERROR_INVALID_BINARY},
      {ZE_RESULT_ERROR_INVALID_KERNEL_NAME, PI_ERROR_INVALID_KERNEL_NAME},
      {ZE_RESULT_ERROR_INVALID_FUNCTION_NAME, PI_ERROR_BUILD_PROGRAM_FAILURE},
      {ZE_RESULT_ERROR_OVERLAPPING_REGIONS, PI_ERROR_INVALID_OPERATION},
      {ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION,
       PI_ERROR_INVALID_WORK_GROUP_SIZE},
      {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, PI_ERROR_BUILD_PROGRAM_FAILURE},
      {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, PI_ERROR_OUT_OF_RESOURCES},
      {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, PI_ERROR_OUT_OF_HOST_MEMORY}};

  auto It = ErrorMapping.find(ZeResult);
  if (It == ErrorMapping.end()) {
    return PI_ERROR_UNKNOWN;
  }
  return It->second;
}

// This will count the calls to Level-Zero.
static std::map<const char *, int> *ZeCallCount = nullptr;

// Trace a call to Level-Zero RT.
#define ZE_CALL(ZeName, ZeArgs)                                                \
  {                                                                            \
    ze_result_t ZeResult = ZeName ZeArgs;                                      \
    if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true))       \
      return mapError(Result);                                                 \
  }

#define ZE_CALL_NOCHECK(ZeName, ZeArgs)                                        \
  ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false)

// Trace an internal PI call; returns in case of an error.
#define PI_CALL(Call)                                                          \
  {                                                                            \
    if (PrintPiTrace)                                                          \
      fprintf(stderr, "PI ---> %s\n", #Call);                                  \
    pi_result Result = (Call);                                                 \
    if (Result != PI_SUCCESS)                                                  \
      return Result;                                                           \
  }
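
// [Illustrative sketch, not used by the plugin] These macros are only usable
// inside functions returning pi_result: on failure ZE_CALL traces the error
// and returns the mapped pi_result from the enclosing function, while
// ZE_CALL_NOCHECK hands the raw ze_result_t back to the caller. A hypothetical
// example of the intended pattern:
[[maybe_unused]] static pi_result exampleZeCallUse(ze_event_handle_t ZeEvent) {
  ZE_CALL(zeEventHostSignal, (ZeEvent)); // returns mapped error on failure
  return PI_SUCCESS;
}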

enum DebugLevel {
  ZE_DEBUG_NONE = 0x0,
  ZE_DEBUG_BASIC = 0x1,
  ZE_DEBUG_VALIDATION = 0x2,
  ZE_DEBUG_CALL_COUNT = 0x4,
  ZE_DEBUG_ALL = -1
};

// Controls Level Zero calls tracing.
static const int ZeDebug = [] {
  const char *DebugMode = std::getenv("ZE_DEBUG");
  return DebugMode ? std::atoi(DebugMode) : ZE_DEBUG_NONE;
}();

static void zePrint(const char *Format, ...) {
  if (ZeDebug & ZE_DEBUG_BASIC) {
    va_list Args;
    va_start(Args, Format);
    vfprintf(stderr, Format, Args);
    va_end(Args);
  }
}

// Controls whether device-scope events are used, and how.
static const enum EventsScope {
  // All events are created host-visible.
  AllHostVisible,
  // All events are created with device-scope; only when the host waits on
  // them or queries their status is a proxy host-visible event created and
  // set to signal after the original event signals.
  OnDemandHostVisibleProxy,
  // All events are created with device-scope; only when a batch of commands
  // is submitted for execution is a last command added to that batch to
  // signal host-visible completion of each command in the batch (the
  // default mode).
  LastCommandInBatchHostVisible
} EventsScope = [] {
  const auto DeviceEventsStr =
      std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");

  auto Default = LastCommandInBatchHostVisible;
  switch (DeviceEventsStr ? std::atoi(DeviceEventsStr) : Default) {
  case 0:
    return AllHostVisible;
  case 1:
    return OnDemandHostVisibleProxy;
  case 2:
    return LastCommandInBatchHostVisible;
  }
  return Default;
}();
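
// [Illustrative sketch, not used by the plugin] Per the switch above,
// SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 selects AllHostVisible, =1 selects
// OnDemandHostVisibleProxy, and =2 (or unset, or any other value) selects
// LastCommandInBatchHostVisible. A hypothetical helper for tracing the choice:
[[maybe_unused]] static const char *eventsScopeName(enum EventsScope Scope) {
  switch (Scope) {
  case AllHostVisible:
    return "AllHostVisible";
  case OnDemandHostVisibleProxy:
    return "OnDemandHostVisibleProxy";
  case LastCommandInBatchHostVisible:
    return "LastCommandInBatchHostVisible";
  }
  return "Unknown";
}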

// Maximum number of events that can be present in an event ZePool is captured
// here. Setting it to 256 gave best possible performance for several
// benchmarks.
static const pi_uint32 MaxNumEventsPerPool = [] {
  const auto MaxNumEventsPerPoolEnv =
      std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
  pi_uint32 Result =
      MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256;
  if (Result <= 0)
    Result = 256;
  return Result;
}();

// Helper function to implement zeHostSynchronize.
// The behavior is to avoid infinite wait during host sync under ZE_DEBUG.
// This allows for a much more responsive debugging of hangs.
//
template <typename T, typename Func>
ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) {
  if (!ZeDebug) {
    return Api(Handle, UINT64_MAX);
  }

  ze_result_t R;
  while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY)
    ;
  return R;
}

// Template function to do various types of host synchronizations.
// This is intended to be used instead of direct calls to specific
// Level-Zero synchronization APIs.
//
template <typename T> ze_result_t zeHostSynchronize(T Handle);
template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) {
  return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle);
}
template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) {
  return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle);
}
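
// [Illustrative sketch, not used by the plugin] The handle type selects the
// proper specialization at compile time, so callers never name the specific
// Level-Zero API. Under ZE_DEBUG the wrapper polls with a short timeout
// instead of waiting with UINT64_MAX, keeping hangs debuggable. Hypothetical
// usage:
[[maybe_unused]] static ze_result_t
exampleSyncQueue(ze_command_queue_handle_t ZeQueue) {
  return zeHostSynchronize(ZeQueue); // dispatches to zeCommandQueueSynchronize
}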

template <typename T, typename Assign>
pi_result getInfoImpl(size_t param_value_size, void *param_value,
                      size_t *param_value_size_ret, T value, size_t value_size,
                      Assign &&assign_func) {

  if (param_value != nullptr) {

    if (param_value_size < value_size) {
      return PI_ERROR_INVALID_VALUE;
    }

    assign_func(param_value, value, value_size);
  }

  if (param_value_size_ret != nullptr) {
    *param_value_size_ret = value_size;
  }

  return PI_SUCCESS;
}

template <typename T>
pi_result getInfo(size_t param_value_size, void *param_value,
                  size_t *param_value_size_ret, T value) {

  auto assignment = [](void *param_value, T value, size_t value_size) {
    (void)value_size;
    *static_cast<T *>(param_value) = value;
  };

  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, sizeof(T), assignment);
}

template <typename T>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, array_length * sizeof(T), memcpy);
}

template <typename T, typename RetType>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  if (param_value) {
    memset(param_value, 0, param_value_size);
    for (uint32_t I = 0; I < array_length; I++)
      ((RetType *)param_value)[I] = (RetType)value[I];
  }
  if (param_value_size_ret)
    *param_value_size_ret = array_length * sizeof(RetType);
  return PI_SUCCESS;
}

template <>
pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret,
                                const char *value) {
  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
                      param_value_size_ret, value);
}

class ReturnHelper {
public:
  ReturnHelper(size_t param_value_size, void *param_value,
               size_t *param_value_size_ret)
      : param_value_size(param_value_size), param_value(param_value),
        param_value_size_ret(param_value_size_ret) {}

  template <class T> pi_result operator()(const T &t) {
    return getInfo(param_value_size, param_value, param_value_size_ret, t);
  }

private:
  size_t param_value_size;
  void *param_value;
  size_t *param_value_size_ret;
};
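
// [Illustrative sketch, not used by the plugin] The getInfo/ReturnHelper
// machinery implements the usual two-call query protocol: a caller first
// passes param_value == nullptr to learn the required size, then calls again
// with a buffer of that size. A hypothetical piGetInfo-style query:
[[maybe_unused]] static pi_result
exampleInfoQuery(size_t param_value_size, void *param_value,
                 size_t *param_value_size_ret) {
  ReturnHelper ReturnValue(param_value_size, param_value,
                           param_value_size_ret);
  // Uses the const char* specialization of getInfo, which copies
  // strlen(...) + 1 bytes and reports that size back.
  return ReturnValue("example-value");
}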

} // anonymous namespace

// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
// which case all compute commands will be submitted to the command-queue
// with the given index in the compute command group. If it is instead set
// to a negative value then all available compute engines may be used.
//
// The default value is "0".
//
static const std::pair<int, int> getRangeOfAllowedComputeEngines = [] {
  const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
  // If the environment variable is not set, only use "0" CCS for now.
  // TODO: allow all CCSs when HW support is complete.
  if (!EnvVar)
    return std::pair<int, int>(0, 0);

  auto EnvVarValue = std::atoi(EnvVar);
  if (EnvVarValue >= 0) {
    return std::pair<int, int>(EnvVarValue, EnvVarValue);
  }

  return std::pair<int, int>(0, INT_MAX);
}();
383 
384 // SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or
385 // a pair of integer values of the form "lower_index:upper_index".
386 // Here, the indices point to copy engines in a list of all available copy
387 // engines.
388 // This functions returns this pair of indices.
389 // If the user specifies only a single integer, a value of 0 indicates that
390 // the copy engines will not be used at all. A value of 1 indicates that all
391 // available copy engines can be used.
392 static const std::pair<int, int> getRangeOfAllowedCopyEngines = [] {
393  const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
394  // If the environment variable is not set, only index 0 compute engine will be
395  // used when immediate commandlists are being used. For standard commandlists
396  // all are used.
397  if (!EnvVar)
398  return std::pair<int, int>(0, UseImmediateCommandLists ? 0 : INT_MAX);
399  std::string CopyEngineRange = EnvVar;
400  // Environment variable can be a single integer or a pair of integers
401  // separated by ":"
402  auto pos = CopyEngineRange.find(":");
403  if (pos == std::string::npos) {
404  bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0);
405  if (UseCopyEngine)
406  return std::pair<int, int>(0, INT_MAX); // All copy engines can be used.
407  return std::pair<int, int>(-1, -1); // No copy engines will be used.
408  }
409  int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos));
410  int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1));
411  if ((LowerCopyEngineIndex > UpperCopyEngineIndex) ||
412  (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) {
413  zePrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, "
414  "default set.\n");
415  LowerCopyEngineIndex = 0;
416  UpperCopyEngineIndex = INT_MAX;
417  }
418  return std::pair<int, int>(LowerCopyEngineIndex, UpperCopyEngineIndex);
419 }();
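
// [Illustrative, derived from the parsing above] Examples of resulting ranges:
//   SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE="0"   -> (-1, -1): no copy engines;
//   SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE="1"   -> (0, INT_MAX): all may be used;
//   SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE="1:2" -> (1, 2): only the second and
//   third copy engines in the list are eligible.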

static const bool CopyEngineRequested = [] {
  int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines.first;
  int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines.second;
  return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1));
}();

// Global variables used in PI_Level_Zero.
// Note we only create simple pointer variables so that the C++ RT won't
// deallocate them automatically at the end of the main program.
// The heap memory allocated for these global variables is reclaimed only when
// the SYCL RT calls piTearDown().
static std::vector<pi_platform> *PiPlatformsCache =
    new std::vector<pi_platform>;
static sycl::detail::SpinLock *PiPlatformsCacheMutex =
    new sycl::detail::SpinLock;
static bool PiPlatformCachePopulated = false;

// Flags which tell whether various Level Zero extensions are available.
static bool PiDriverGlobalOffsetExtensionFound = false;
static bool PiDriverModuleProgramExtensionFound = false;

pi_result
_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
                                            size_t &Index, bool HostVisible,
                                            bool ProfilingEnabled) {
  // Lock while updating event pool machinery.
  std::scoped_lock Lock(ZeEventPoolCacheMutex);

  std::list<ze_event_pool_handle_t> *ZePoolCache =
      getZeEventPoolCache(HostVisible, ProfilingEnabled);

  // Remove full pool from the cache.
  if (!ZePoolCache->empty()) {
    if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
      ZePoolCache->erase(ZePoolCache->begin());
    }
  }
  if (ZePoolCache->empty()) {
    ZePoolCache->push_back(nullptr);
  }

  // We shall be adding an event to the front pool.
  ze_event_pool_handle_t *ZePool = &ZePoolCache->front();
  Index = 0;
  // Create one event ZePool per MaxNumEventsPerPool events.
  if (*ZePool == nullptr) {
    ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
    ZeEventPoolDesc.count = MaxNumEventsPerPool;
    ZeEventPoolDesc.flags = 0;
    if (HostVisible)
      ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    if (ProfilingEnabled)
      ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    zePrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);

    std::vector<ze_device_handle_t> ZeDevices;
    std::for_each(Devices.begin(), Devices.end(), [&](const pi_device &D) {
      ZeDevices.push_back(D->ZeDevice);
    });

    ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(),
                                &ZeDevices[0], ZePool));
    NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1;
    NumEventsUnreleasedInEventPool[*ZePool] = 1;
  } else {
    Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool];
    --NumEventsAvailableInEventPool[*ZePool];
    ++NumEventsUnreleasedInEventPool[*ZePool];
  }
  Pool = *ZePool;
  return PI_SUCCESS;
}

pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
  std::shared_lock EventLock(Event->Mutex, std::defer_lock);
  std::scoped_lock LockAll(ZeEventPoolCacheMutex, EventLock);
  if (!Event->ZeEventPool) {
    // This must be an interop event created on a user's pool.
    // Do nothing.
    return PI_SUCCESS;
  }

  std::list<ze_event_pool_handle_t> *ZePoolCache =
      getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled());

  // Put the empty pool to the cache of the pools.
  if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
    die("Invalid event release: event pool doesn't have unreleased events");
  if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) {
    if (ZePoolCache->front() != Event->ZeEventPool) {
      ZePoolCache->push_back(Event->ZeEventPool);
    }
    NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
  }

  return PI_SUCCESS;
}
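
// [Illustrative note, not part of the plugin] The pool bookkeeping above
// maintains two counters per ze_event_pool_handle_t:
// NumEventsAvailableInEventPool (free slots for new events) and
// NumEventsUnreleasedInEventPool (live events created from the pool). E.g.,
// with MaxNumEventsPerPool == 256, a freshly created pool hands out slot 0
// and records 255 available / 1 unreleased; a pool is only recycled back to
// the cache once its unreleased count drops to zero again.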

// Some OpenCL extensions we know are supported by all Level Zero devices.
constexpr char ZE_SUPPORTED_EXTENSIONS[] =
    "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups "
    "cl_intel_subgroups_short cl_intel_required_subgroup_size ";

// Forward declarations.
static pi_result
enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst,
                     pi_bool BlockingWrite, size_t Size, const void *Src,
                     pi_uint32 NumEventsInWaitList,
                     const pi_event *EventWaitList, pi_event *Event,
                     bool PreferCopyEngine = false);

static pi_result enqueueMemCopyRectHelper(
    pi_command_type CommandType, pi_queue Queue, void *SrcBuffer,
    void *DstBuffer, pi_buff_rect_offset SrcOrigin,
    pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region,
    size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch,
    size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList,
    const pi_event *EventWaitList, pi_event *Event,
    bool PreferCopyEngine = false);

inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) {
  switch (ZeError) {
#define ZE_ERRCASE(ERR)                                                        \
  case ERR:                                                                    \
    ErrorString = "" #ERR;                                                     \
    break;

    ZE_ERRCASE(ZE_RESULT_SUCCESS)
    ZE_ERRCASE(ZE_RESULT_NOT_READY)
    ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST)
    ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY)
    ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)
    ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS)
    ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE)
    ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE)
    ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS)
    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED)
    ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN)

#undef ZE_ERRCASE
  default:
    assert(false && "Unexpected Error code");
  } // switch
}

// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR.
constexpr size_t MaxMessageSize = 256;
thread_local pi_result ErrorMessageCode = PI_SUCCESS;
thread_local char ErrorMessage[MaxMessageSize];

// Utility function for setting a message and warning.
[[maybe_unused]] static void setErrorMessage(const char *message,
                                             pi_result error_code) {
  // Must be strictly less than MaxMessageSize to leave room for the null
  // terminator copied by strcpy.
  assert(strlen(message) < MaxMessageSize);
  strcpy(ErrorMessage, message);
  ErrorMessageCode = error_code;
}

// Returns plugin specific error and warning messages.
pi_result piPluginGetLastError(char **message) {
  *message = &ErrorMessage[0];
  return ErrorMessageCode;
}

ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName,
                           const char *ZeArgs, bool TraceError) {
  zePrint("ZE ---> %s%s\n", ZeName, ZeArgs);

  if (ZeDebug & ZE_DEBUG_CALL_COUNT) {
    ++(*ZeCallCount)[ZeName];
  }

  if (ZeResult && TraceError) {
    const char *ErrorString = "Unknown";
    zeParseError(ZeResult, ErrorString);
    zePrint("Error (%s) in %s\n", ErrorString, ZeName);
  }
  return ZeResult;
}

#define PI_ASSERT(condition, error)                                            \
  if (!(condition))                                                            \
    return error;

// This helper function creates a pi_event and associates it with a pi_queue.
// Note that the caller of this function must have acquired lock on the Queue
// that is passed in.
// \param Queue pi_queue to associate with a new event.
// \param Event a pointer to hold the newly created pi_event
// \param CommandType various command type determined by the caller
// \param CommandList is the command list where the event is added
// \param ForceHostVisible tells if the event must be created in
// the host-visible pool
inline static pi_result createEventAndAssociateQueue(
    pi_queue Queue, pi_event *Event, pi_command_type CommandType,
    pi_command_list_ptr_t CommandList, bool ForceHostVisible = false) {

  PI_CALL(EventCreate(Queue->Context, Queue,
                      ForceHostVisible ? true : EventsScope == AllHostVisible,
                      Event));

  (*Event)->Queue = Queue;
  (*Event)->CommandType = CommandType;

  // Append this Event to the CommandList, if any.
  if (CommandList != Queue->CommandListMap.end()) {
    CommandList->second.append(*Event);
    PI_CALL(piEventRetain(*Event));
  }

  // We need to increment the reference counter here to avoid pi_queue
  // being released before the associated pi_event is released, because
  // piEventRelease requires access to the associated pi_queue.
  // In piEventRelease, the reference counter of the Queue is decremented
  // to release it.
  Queue->RefCount.increment();

  // SYCL RT does not track completion of the events, so it could
  // release a PI event as soon as that is no longer being waited on in the
  // app. But we have to ensure that the event is not destroyed before
  // it is really signalled, so retain it explicitly here and
  // release in CleanupCompletedEvent(Event).
  //
  PI_CALL(piEventRetain(*Event));

  return PI_SUCCESS;
}

pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
                                 int SubSubDeviceIndex) {
  uint32_t numQueueGroups = 0;
  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
          (ZeDevice, &numQueueGroups, nullptr));
  if (numQueueGroups == 0) {
    return PI_ERROR_UNKNOWN;
  }
  zePrint("NOTE: Number of queue groups = %d\n", numQueueGroups);
  std::vector<ZeStruct<ze_command_queue_group_properties_t>>
      QueueGroupProperties(numQueueGroups);
  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
          (ZeDevice, &numQueueGroups, QueueGroupProperties.data()));

  // Initialize a sub-sub-device with its own ordinal and index.
  if (SubSubDeviceOrdinal >= 0) {
    QueueGroup[queue_group_info_t::Compute].ZeOrdinal = SubSubDeviceOrdinal;
    QueueGroup[queue_group_info_t::Compute].ZeIndex = SubSubDeviceIndex;
  } else { // This is a root or a sub-device.
    for (uint32_t i = 0; i < numQueueGroups; i++) {
      if (QueueGroupProperties[i].flags &
          ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
        QueueGroup[queue_group_info_t::Compute].ZeOrdinal = i;
        QueueGroup[queue_group_info_t::Compute].ZeProperties =
            QueueGroupProperties[i];
        break;
      }
    }
    // How is it possible that there are no "compute" capabilities?
    if (QueueGroup[queue_group_info_t::Compute].ZeOrdinal < 0) {
      return PI_ERROR_UNKNOWN;
    }

    if (CopyEngineRequested) {
      for (uint32_t i = 0; i < numQueueGroups; i++) {
        if (((QueueGroupProperties[i].flags &
              ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) &&
            (QueueGroupProperties[i].flags &
             ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
          if (QueueGroupProperties[i].numQueues == 1) {
            QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i;
            QueueGroup[queue_group_info_t::MainCopy].ZeProperties =
                QueueGroupProperties[i];
          } else {
            QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i;
            QueueGroup[queue_group_info_t::LinkCopy].ZeProperties =
                QueueGroupProperties[i];
            break;
          }
        }
      }
      if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0)
        zePrint("NOTE: main blitter/copy engine is not available\n");
      else
        zePrint("NOTE: main blitter/copy engine is available\n");

      if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0)
        zePrint("NOTE: link blitter/copy engines are not available\n");
      else
        zePrint("NOTE: link blitter/copy engines are available\n");
    }
  }

  // Maintain various device properties cache.
  // Note that we just describe here how to compute the data.
  // The real initialization is upon first access.
  //
  auto ZeDevice = this->ZeDevice;
  ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) {
    ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties));
  };

  ZeDeviceComputeProperties.Compute =
      [ZeDevice](ze_device_compute_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties));
      };

  ZeDeviceImageProperties.Compute =
      [ZeDevice](ze_device_image_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties));
      };

  ZeDeviceModuleProperties.Compute =
      [ZeDevice](ze_device_module_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetModuleProperties, (ZeDevice, &Properties));
      };

  ZeDeviceMemoryProperties.Compute =
      [ZeDevice](
          std::vector<ZeStruct<ze_device_memory_properties_t>> &Properties) {
        uint32_t Count = 0;
        ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties,
                        (ZeDevice, &Count, nullptr));

        Properties.resize(Count);
        ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties,
                        (ZeDevice, &Count, Properties.data()));
      };

  ZeDeviceMemoryAccessProperties.Compute =
      [ZeDevice](ze_device_memory_access_properties_t &Properties) {
        ZE_CALL_NOCHECK(zeDeviceGetMemoryAccessProperties,
                        (ZeDevice, &Properties));
      };

  ZeDeviceCacheProperties.Compute =
      [ZeDevice](ze_device_cache_properties_t &Properties) {
        // TODO: Since v1.0 there can be multiple cache properties.
        // For now remember the first one, if any.
        uint32_t Count = 0;
        ZE_CALL_NOCHECK(zeDeviceGetCacheProperties,
                        (ZeDevice, &Count, nullptr));
        if (Count > 0)
          Count = 1;
        ZE_CALL_NOCHECK(zeDeviceGetCacheProperties,
                        (ZeDevice, &Count, &Properties));
      };
  return PI_SUCCESS;
}

pi_device _pi_context::getRootDevice() const {
  assert(Devices.size() > 0);

  if (Devices.size() == 1)
    return Devices[0];

  // Check if we have a context with subdevices of the same device (the
  // context may include the root device itself as well).
  pi_device ContextRootDevice =
      Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];

  // For a context with sub-subdevices, the ContextRootDevice might still
  // not be the root device.
  // Check whether the ContextRootDevice is a subdevice or the root device.
  if (ContextRootDevice->isSubDevice()) {
    ContextRootDevice = ContextRootDevice->RootDevice;
  }

  for (auto &Device : Devices) {
    if ((!Device->RootDevice && Device != ContextRootDevice) ||
        (Device->RootDevice && Device->RootDevice != ContextRootDevice)) {
      ContextRootDevice = nullptr;
      break;
    }
  }
  return ContextRootDevice;
}

pi_result _pi_context::initialize() {
  // Create the immediate command list to be used for initializations.
  // Created as synchronous so level-zero performs implicit synchronization and
  // there is no need to query for completion in the plugin.
  //
  // TODO: get rid of using Devices[0] for the context with multiple
  // root-devices. We should somehow make the data initialized on all devices.
  pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0];

  // NOTE: we always submit to the "0" index compute engine with the immediate
  // command list since there is just one immediate command list per context.
  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
  ZeCommandQueueDesc.ordinal =
      Device->QueueGroup[_pi_device::queue_group_info_t::Compute].ZeOrdinal;
  ZeCommandQueueDesc.index = 0;
  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
  ZE_CALL(
      zeCommandListCreateImmediate,
      (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit));
  return PI_SUCCESS;
}

pi_result _pi_context::finalize() {
  // This function is called when pi_context is deallocated, in
  // piContextRelease. There could be some memory that may have not been
  // deallocated. For example, event pool caches would be still alive.
  {
    std::scoped_lock Lock(ZeEventPoolCacheMutex);
    for (auto &ZePoolCache : ZeEventPoolCache) {
      for (auto &ZePool : ZePoolCache)
        ZE_CALL(zeEventPoolDestroy, (ZePool));
      ZePoolCache.clear();
    }
  }

  // Destroy the command list used for initializations.
  ZE_CALL(zeCommandListDestroy, (ZeCommandListInit));

  std::scoped_lock Lock(ZeCommandListCacheMutex);
  for (auto &List : ZeComputeCommandListCache) {
    for (ze_command_list_handle_t &ZeCommandList : List.second) {
      if (ZeCommandList)
        ZE_CALL(zeCommandListDestroy, (ZeCommandList));
    }
  }
  for (auto &List : ZeCopyCommandListCache) {
    for (ze_command_list_handle_t &ZeCommandList : List.second) {
      if (ZeCommandList)
        ZE_CALL(zeCommandListDestroy, (ZeCommandList));
    }
  }
  return PI_SUCCESS;
}

bool _pi_command_list_info_t::isCopy(pi_queue Queue) const {
  return ZeQueueGroupOrdinal !=
         (uint32_t)Queue->Device
             ->QueueGroup[_pi_device::queue_group_info_t::type::Compute]
             .ZeOrdinal;
}

bool _pi_queue::isInOrderQueue() const {
  // If the out-of-order queue property is not set, then this is an in-order
  // queue.
  return ((this->Properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0);
}

pi_result
_pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
                            bool MakeAvailable,
                            std::vector<pi_event> &EventListToCleanup) {
  bool UseCopyEngine = CommandList->second.isCopy(this);

  // Immediate commandlists do not have an associated fence.
  if (CommandList->second.ZeFence != nullptr) {
    // Fence had been signalled meaning the associated command-list completed.
    // Reset the fence and put the command list into a cache for reuse in PI
    // calls.
    ZE_CALL(zeFenceReset, (CommandList->second.ZeFence));
    ZE_CALL(zeCommandListReset, (CommandList->first));
    CommandList->second.ZeFenceInUse = false;
  }

  auto &EventList = CommandList->second.EventList;
  // Remember all the events in this command list which need to be
  // released/cleaned up, and clear the event list associated with the command
  // list.
  std::move(std::begin(EventList), std::end(EventList),
            std::back_inserter(EventListToCleanup));
  EventList.clear();

  // Standard commandlists move in and out of the cache as they are recycled.
  // Immediate commandlists are always available.
  if (CommandList->second.ZeFence != nullptr && MakeAvailable) {
    std::scoped_lock Lock(this->Context->ZeCommandListCacheMutex);
    auto &ZeCommandListCache =
        UseCopyEngine
            ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
            : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];
    ZeCommandListCache.push_back(CommandList->first);
  }

  return PI_SUCCESS;
}

// Configuration of the command-list batching.
typedef struct CommandListBatchConfig {
  // Default value of 0. This specifies to use dynamic batch size adjustment.
  // Other values will try to collect the specified amount of commands.
  pi_uint32 Size{0};

  // If doing dynamic batching, specifies the start batch size.
  pi_uint32 DynamicSizeStart{4};

  // The maximum size for dynamic batch.
  pi_uint32 DynamicSizeMax{64};

  // The step size for dynamic batch increases.
  pi_uint32 DynamicSizeStep{1};

  // Thresholds for when to increase the batch size (number of closed early is
  // small and number of closed full is high).
  pi_uint32 NumTimesClosedEarlyThreshold{3};
  pi_uint32 NumTimesClosedFullThreshold{8};

  // Tells the starting size of a batch.
  pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
  // Tells if we are doing dynamic batch size adjustment.
  bool dynamic() const { return Size == 0; }
} zeCommandListBatchConfig;

// Helper function to initialize static variables that hold batch config info
// for compute and copy command batching.
static zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
  zeCommandListBatchConfig Config{}; // default initialize

  // Default value of 0. This specifies to use dynamic batch size adjustment.
  const auto BatchSizeStr =
      (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
               : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
  if (BatchSizeStr) {
    pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
    // Level Zero may only support a limited number of commands per command
    // list. The actual upper limit is not specified by the Level Zero
    // Specification. For now we allow an arbitrary upper limit.
    if (BatchSizeStrVal > 0) {
      Config.Size = BatchSizeStrVal;
    } else if (BatchSizeStrVal == 0) {
      Config.Size = 0;
      // We are requested to do dynamic batching. Collect specifics, if any.
      // The extended format supported is ":" separated values.
      //
      // NOTE: these extra settings are experimental and are intended to
      // be used only for finding a better default heuristic.
      //
      std::string BatchConfig(BatchSizeStr);
      size_t Ord = 0;
      size_t Pos = 0;
      while (true) {
        if (++Ord > 5)
          break;

        Pos = BatchConfig.find(":", Pos);
        if (Pos == std::string::npos)
          break;
        ++Pos; // past the ":"

        pi_uint32 Val;
        try {
          Val = std::stoi(BatchConfig.substr(Pos));
        } catch (...) {
          if (IsCopy)
            zePrint(
                "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
          else
            zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
          break;
        }
        switch (Ord) {
        case 1:
          Config.DynamicSizeStart = Val;
          break;
        case 2:
          Config.DynamicSizeMax = Val;
          break;
        case 3:
          Config.DynamicSizeStep = Val;
          break;
        case 4:
          Config.NumTimesClosedEarlyThreshold = Val;
          break;
        case 5:
          Config.NumTimesClosedFullThreshold = Val;
          break;
        default:
          die("Unexpected batch config");
        }
        if (IsCopy)
          zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
                  "#%d: %d\n",
                  (int)Ord, (int)Val);
        else
          zePrint(
              "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
              (int)Ord, (int)Val);
      };

    } else {
      // Negative batch sizes are silently ignored.
      if (IsCopy)
        zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
      else
        zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
    }
  }
  return Config;
}
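
// [Illustrative sketch, not used by the plugin] For example, setting
// SYCL_PI_LEVEL_ZERO_BATCH_SIZE="0:4:64:1:3:8" requests dynamic batching
// starting at 4 commands, growing by steps of 1 up to 64, with close-early and
// close-full thresholds of 3 and 8. A hypothetical dump helper for the parsed
// config:
[[maybe_unused]] static void
dumpBatchConfig(const zeCommandListBatchConfig &C) {
  zePrint("batching: %s, start=%d, max=%d, step=%d, early/full=%d/%d\n",
          C.dynamic() ? "dynamic" : "fixed", (int)C.startSize(),
          (int)C.DynamicSizeMax, (int)C.DynamicSizeStep,
          (int)C.NumTimesClosedEarlyThreshold,
          (int)C.NumTimesClosedFullThreshold);
}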

// Static variable that holds batch config info for compute command batching.
static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] {
  using IsCopy = bool;
  return ZeCommandListBatchConfig(IsCopy{false});
}();

// Static variable that holds batch config info for copy command batching.
static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] {
  using IsCopy = bool;
  return ZeCommandListBatchConfig(IsCopy{true});
}();

_pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
                     std::vector<ze_command_queue_handle_t> &CopyQueues,
                     pi_context Context, pi_device Device,
                     bool OwnZeCommandQueue,
                     pi_queue_properties PiQueueProperties)
    : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
      Properties(PiQueueProperties) {

  // Compute group initialization.
  // First, see if the queue's device allows for round-robin or it is
  // fixed to one particular compute CCS (it is so for sub-sub-devices).
  auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute];
  if (ComputeQueueGroupInfo.ZeIndex >= 0) {
    ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex;
    ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex;
    ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex;
  } else {
    ComputeQueueGroup.LowerIndex = 0;
    ComputeQueueGroup.UpperIndex = INT_MAX;
    ComputeQueueGroup.NextIndex = 0;
  }

  uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines.first;
  uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines.second;
  FilterUpperIndex =
      std::min((size_t)FilterUpperIndex, ComputeQueues.size() - 1);
  if (FilterLowerIndex <= FilterUpperIndex) {
    ComputeQueueGroup.ZeQueues = ComputeQueues;
    ComputeQueueGroup.LowerIndex = FilterLowerIndex;
    ComputeQueueGroup.UpperIndex = FilterUpperIndex;
    ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
    // Create space to hold immediate commandlists corresponding to the
    // ZeQueues.
    if (UseImmediateCommandLists) {
      ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
          ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
    }
  } else {
    die("No compute queue available.");
  }

  // Copy group initialization.
  if (getRangeOfAllowedCopyEngines.first < 0 ||
      getRangeOfAllowedCopyEngines.second < 0) {
    // We are asked not to use copy engines, just do nothing.
    // Leave CopyQueueGroup.ZeQueues empty, and it won't be used.
  } else {
    uint32_t FilterLowerIndex = getRangeOfAllowedCopyEngines.first;
    uint32_t FilterUpperIndex = getRangeOfAllowedCopyEngines.second;
    FilterUpperIndex =
        std::min((size_t)FilterUpperIndex, CopyQueues.size() - 1);
    if (FilterLowerIndex <= FilterUpperIndex) {
      CopyQueueGroup.ZeQueues = CopyQueues;
      CopyQueueGroup.LowerIndex = FilterLowerIndex;
      CopyQueueGroup.UpperIndex = FilterUpperIndex;
      CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex;
      // Create space to hold immediate commandlists corresponding to the
      // ZeQueues.
      if (UseImmediateCommandLists) {
        CopyQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
            CopyQueueGroup.ZeQueues.size(), CommandListMap.end());
      }
    }
  }

  // Initialize compute/copy command batches.
  ComputeCommandBatch.OpenCommandList = CommandListMap.end();
  CopyCommandBatch.OpenCommandList = CommandListMap.end();
  ComputeCommandBatch.QueueBatchSize =
      ZeCommandListBatchComputeConfig.startSize();
  CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize();
}

static pi_result CleanupCompletedEvent(pi_event Event,
                                       bool QueueLocked = false);

// Reset signalled command lists in the queue and put them to the cache of
// command lists. A caller must not lock the queue mutex.
static pi_result resetCommandLists(pi_queue Queue) {
  // We need events to be cleaned up out of the scope where the queue is
  // locked, to avoid nested locks, because event cleanup requires the event
  // to be locked. Nested locks are hard to control and can cause deadlocks
  // if mutexes are locked in different order.
  std::vector<pi_event> EventListToCleanup;
  {
    // We check for command lists that have been already signalled, but have
    // not been added to the available list yet. Each command list has an
    // associated fence which tracks if a command list has completed dispatch
    // of its commands and is ready for reuse. If a command list is found to
    // have been signalled, then the command list & fence are reset and the
    // command list is returned to the command list cache. All events
    // associated with the command list are cleaned up if the command list
    // was reset.
    std::scoped_lock Lock(Queue->Mutex);
    for (auto &&it = Queue->CommandListMap.begin();
         it != Queue->CommandListMap.end(); ++it) {
      // It is possible that the fence was already noted as signalled and
      // reset. In that case the ZeFenceInUse flag will be false.
      if (it->second.ZeFence != nullptr && it->second.ZeFenceInUse) {
        ze_result_t ZeResult =
            ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
        if (ZeResult == ZE_RESULT_SUCCESS) {
          PI_CALL(Queue->resetCommandList(it, true, EventListToCleanup));
        }
      }
    }
  }
  for (auto Event : EventListToCleanup) {
    // We don't need to synchronize the events since the fence
    // synchronized above already does that.
    {
      std::scoped_lock EventLock(Event->Mutex);
      Event->Completed = true;
    }
    PI_CALL(CleanupCompletedEvent(Event));
    // This event was removed from the command list, so decrement its ref
    // count (it was incremented when it was added to the command list).
    PI_CALL(piEventRelease(Event));
  }
  return PI_SUCCESS;
}

// Retrieve an available command list to be used in a PI call.
pi_result
_pi_context::getAvailableCommandList(pi_queue Queue,
                                     pi_command_list_ptr_t &CommandList,
                                     bool UseCopyEngine, bool AllowBatching) {
  // Immediate commandlists have been pre-allocated and are always available.
  if (UseImmediateCommandLists) {
    CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
    return PI_SUCCESS;
  }

  auto &CommandBatch =
      UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
  // Handle batching of commands.
  // First see if there is a command-list open for batching commands
  // for this queue.
  if (Queue->hasOpenCommandList(UseCopyEngine)) {
    if (AllowBatching) {
      CommandList = CommandBatch.OpenCommandList;
      return PI_SUCCESS;
    }
    // If this command isn't allowed to be batched, then we need to
    // go ahead and execute what is already in the batched list,
    // and then go on to process this. On exit from executeOpenCommandList
    // OpenCommandList will be invalidated.
    if (auto Res = Queue->executeOpenCommandList(UseCopyEngine))
      return Res;
  }

  // Create/Reuse the command list, because in Level Zero commands are added
  // to command lists, which are later submitted to the command queue.
  // Each command list is paired with an associated fence to track when the
  // command list is available for reuse.
  _pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES;
  ZeStruct<ze_fence_desc_t> ZeFenceDesc;
  // Initially, we need to check if a command list has already been created
  // on this device that is available for use. If so, then reuse that
  // Level-Zero Command List and Fence for this PI call.
  {
    // Make sure to acquire the lock before checking the size, or there
    // will be a race condition.
    std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex);
    // Under mutex since operator[] does insertion on the first usage for every
    // unique ZeDevice.
    auto &ZeCommandListCache =
        UseCopyEngine
            ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
            : Queue->Context
                  ->ZeComputeCommandListCache[Queue->Device->ZeDevice];

    if (ZeCommandListCache.size() > 0) {
      auto &ZeCommandList = ZeCommandListCache.front();
      auto it = Queue->CommandListMap.find(ZeCommandList);
      if (it != Queue->CommandListMap.end()) {
        CommandList = it;
        if (CommandList->second.ZeFence != nullptr)
          CommandList->second.ZeFenceInUse = true;
      } else {
        // If there is a command list available on this context, but it
        // wasn't yet used in this queue then create a new entry in this
        // queue's map to hold the fence and other associated command
        // list information.
        uint32_t QueueGroupOrdinal;
        auto &ZeCommandQueue =
            Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);

        ze_fence_handle_t ZeFence;
        ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
        CommandList =
            Queue->CommandListMap
                .emplace(ZeCommandList,
                         pi_command_list_info_t{ZeFence, true, ZeCommandQueue,
                                                QueueGroupOrdinal})
                .first;
      }
      ZeCommandListCache.pop_front();
      return PI_SUCCESS;
    }
  }

  // If there are no available command lists nor signalled command lists,
  // then we must create another command list.
  // Once created, this command list & fence are added to the command list
  // fence map.
  ze_command_list_handle_t ZeCommandList;
  ze_fence_handle_t ZeFence;

  uint32_t QueueGroupOrdinal;
  auto &ZeCommandQueue =
      Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);

  ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
  ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;

  ZE_CALL(zeCommandListCreate,
          (Queue->Context->ZeContext, Queue->Device->ZeDevice,
           &ZeCommandListDesc, &ZeCommandList));

  ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
  std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert(
      std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
          ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
  pi_result = PI_SUCCESS;

  return pi_result;
}

// If doing dynamic batching, adjust the batch size for a batch that was
// closed because it filled up.
void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) {
  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
  auto &ZeCommandListBatchConfig =
      IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig;
  pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
  // QueueBatchSize of 0 means never allow batching.
  if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic())
    return;
  CommandBatch.NumTimesClosedFull += 1;

  // If the number of times the list has been closed early is low, and
  // the number of times it has been closed full is high, then raise
  // the batching size slowly. Don't raise it if it is already pretty
  // high.
  if (CommandBatch.NumTimesClosedEarly <=
          ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold &&
      CommandBatch.NumTimesClosedFull >
          ZeCommandListBatchConfig.NumTimesClosedFullThreshold) {
    if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) {
      QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep;
      zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
    }
    CommandBatch.NumTimesClosedEarly = 0;
    CommandBatch.NumTimesClosedFull = 0;
  }
}

// If doing dynamic batching, adjust the batch size for a batch that was
// closed early (before it filled up).
void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) {
  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
  auto &ZeCommandListBatchConfig =
      IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig;
  pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
  // QueueBatchSize of 0 means never allow batching.
  if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic())
    return;
  CommandBatch.NumTimesClosedEarly += 1;

  // If we are closing early more than about 3x the number of times
  // it is closing full, lower the batch size to the value of the
  // current open command list. This is trying to quickly get to a
  // batch size that will be able to be closed full at least once
  // in a while.
  if (CommandBatch.NumTimesClosedEarly >
      (CommandBatch.NumTimesClosedFull + 1) * 3) {
    QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1;
    if (QueueBatchSize < 1)
      QueueBatchSize = 1;
    zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
    CommandBatch.NumTimesClosedEarly = 0;
    CommandBatch.NumTimesClosedFull = 0;
  }
}
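
// [Illustrative sketch, not used by the plugin] The two adjusters above form
// a simple feedback loop: a queue that keeps filling batches grows its batch
// size by DynamicSizeStep (capped at DynamicSizeMax), while a queue that
// closes early roughly 3x more often than full shrinks toward the observed
// open-list size. A standalone sketch of that policy, assuming the default
// config values (step 1, max 64, thresholds 3/8) reconstructed above:
[[maybe_unused]] static pi_uint32 nextBatchSizeSketch(pi_uint32 Current,
                                                      pi_uint32 ClosedEarly,
                                                      pi_uint32 ClosedFull,
                                                      pi_uint32 OpenListSize) {
  if (ClosedEarly > (ClosedFull + 1) * 3)
    return OpenListSize > 1 ? OpenListSize - 1 : 1; // shrink toward usage
  if (ClosedEarly <= 3 && ClosedFull > 8)
    return std::min<pi_uint32>(Current + 1, 64); // grow slowly, capped
  return Current;
}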

void _pi_queue::CaptureIndirectAccesses() {
  for (auto &Kernel : KernelsToBeSubmitted) {
    if (!Kernel->hasIndirectAccess())
      continue;

    auto &Contexts = Device->Platform->Contexts;
    for (auto &Ctx : Contexts) {
      for (auto &Elem : Ctx->MemAllocs) {
        const auto &Pair = Kernel->MemAllocs.insert(&Elem);
        // Kernel is referencing this memory allocation from now on.
        // If this memory allocation was already captured for this kernel, it
        // means that the kernel is submitted several times. Increase the
        // reference count only once because we release all allocations only
        // when SubmissionsCount turns to 0. We don't want to know how many
        // times the allocation was retained by each submission.
        if (Pair.second)
          Elem.second.RefCount.increment();
      }
    }
    Kernel->SubmissionsCount++;
  }
  KernelsToBeSubmitted.clear();
}

pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
                                        bool IsBlocking,
                                        bool OKToBatchCommand) {
  bool UseCopyEngine = CommandList->second.isCopy(this);

  // If the current LastCommandEvent is the nullptr, then it means
  // either that no command has ever been issued to the queue
  // or it means that the LastCommandEvent has been signalled and
  // therefore that this Queue is idle.
  //
  // NOTE: this behavior adds some flakiness to the batching
  // since the last command's event may or may not be completed by the
  // time we get here depending on timings and system/gpu load.
  // So, disable it for modes where we print PI traces. Printing
  // traces incurs much different timings than real execution
  // anyway, and many regression tests use it.
  //
  bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr;

  // The list can be empty if command-list only contains signals of proxy
  // events.
  if (!CommandList->second.EventList.empty())
    this->LastCommandEvent = CommandList->second.EventList.back();

  if (!UseImmediateCommandLists) {
    // Batch if allowed to, but don't batch if we know there are no kernels
    // from this queue that are currently executing. This is intended to get
    // kernels started as soon as possible when there are no kernels from this
    // queue awaiting execution, while allowing batching to occur when there
    // are kernels already executing. Also, if we are using fixed size
    // batching, as indicated by !ZeCommandListBatch.dynamic(), then just
    // ignore CurrentlyEmpty as we want to strictly follow the batching the
    // user specified.
    auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
    auto &ZeCommandListBatchConfig = UseCopyEngine
                                         ? ZeCommandListBatchCopyConfig
                                         : ZeCommandListBatchComputeConfig;
    if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) &&
        (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) {

      if (hasOpenCommandList(UseCopyEngine) &&
          CommandBatch.OpenCommandList != CommandList)
        die("executeCommandList: OpenCommandList should be equal to "
            "null or CommandList");

      if (CommandList->second.size() < CommandBatch.QueueBatchSize) {
        CommandBatch.OpenCommandList = CommandList;
        return PI_SUCCESS;
      }

      adjustBatchSizeForFullBatch(UseCopyEngine);
      CommandBatch.OpenCommandList = CommandListMap.end();
    }
  }

  auto &ZeCommandQueue = CommandList->second.ZeQueue;
  // Scope of the lock must be till the end of the function, otherwise new mem
  // allocs can be created between the moment when we made a snapshot and the
  // moment when the command list is closed and executed. But the mutex is
  // locked only if indirect access tracking is enabled, because
  // std::defer_lock is used. The unique_lock destructor at the end of the
  // function will unlock the mutex if it was locked (which happens only if
  // IndirectAccessTrackingEnabled is true).
  std::unique_lock<pi_shared_mutex> ContextsLock(
      Device->Platform->ContextsMutex, std::defer_lock);

  if (IndirectAccessTrackingEnabled) {
    // We are going to submit kernels for execution. If the indirect access
    // flag is set for a kernel then we need to make a snapshot of existing
    // memory allocations in all contexts in the platform. We need to lock
    // the mutex guarding the list of contexts in the platform to prevent
    // creation of new memory allocations in any context before we submit
    // the kernel for execution.
    ContextsLock.lock();
    CaptureIndirectAccesses();
  }

  if (!UseImmediateCommandLists) {
    // In this mode all inner-batch events have device visibility only,
    // and we want the last command in the batch to signal a host-visible
    // event that anybody waiting for any event in the batch will
    // really be using.
    // We need to create a proxy host-visible event only if the list of events
    // in the command list is not empty, otherwise we are going to just create
    // and remove the proxy event right away and dereference a deleted object
    // afterwards.
    if (EventsScope == LastCommandInBatchHostVisible &&
        !CommandList->second.EventList.empty()) {
      // Create a "proxy" host-visible event.
      //
      pi_event HostVisibleEvent;
      auto Res = createEventAndAssociateQueue(
          this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, true);
      if (Res)
        return Res;

      // Update each command's event in the command-list to "see" this
      // proxy event as a host-visible counterpart.
      for (auto &Event : CommandList->second.EventList) {
        std::scoped_lock EventLock(Event->Mutex);
        if (!Event->HostVisibleEvent) {
          Event->HostVisibleEvent = HostVisibleEvent;
          PI_CALL(piEventRetain(HostVisibleEvent));
        }
      }

      // Decrement the reference count of the event such that all the
      // remaining references are from the other commands in this batch and
      // from the command-list itself. This host-visible event will not be
      // waited/released by SYCL RT, so it must be destroyed after all events
      // in the batch are gone.
      // We know that the refcount is more than 2 because we check that the
      // EventList of the command list is not empty above, i.e. after
      // createEventAndAssociateQueue the ref count is 2 and then +1 for each
      // event in the EventList.
      PI_CALL(piEventRelease(HostVisibleEvent));
      PI_CALL(piEventRelease(HostVisibleEvent));

      // Indicate no cleanup is needed for this PI event as it is special.
      HostVisibleEvent->CleanedUp = true;

      // Finally set to signal the host-visible event at the end of the
      // command-list.
      // TODO: see if we need a barrier here (or explicit wait for all events
      // in the batch).
      ZE_CALL(zeCommandListAppendSignalEvent,
              (CommandList->first, HostVisibleEvent->ZeEvent));
    }

    // Close the command list and have it ready for dispatch.
    ZE_CALL(zeCommandListClose, (CommandList->first));
    // Offload the command list to the GPU for asynchronous execution.
    auto ZeCommandList = CommandList->first;
    auto ZeResult = ZE_CALL_NOCHECK(
        zeCommandQueueExecuteCommandLists,
        (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));
    if (ZeResult != ZE_RESULT_SUCCESS) {
      this->Healthy = false;
      if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
        // Turn into a more informative end-user error.
        return PI_ERROR_COMMAND_EXECUTION_FAILURE;
      }
      return mapError(ZeResult);
    }
  }

  // Check global control to make every command blocking for debugging.
  if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
    if (UseImmediateCommandLists) {
      synchronize();
    } else {
      // Wait until command lists attached to the command queue are executed.
      ZE_CALL(zeHostSynchronize, (ZeCommandQueue));
    }
  }
  return PI_SUCCESS;
}
1509 
1510 bool _pi_queue::isBatchingAllowed(bool IsCopy) const {
1511  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1512  return (CommandBatch.QueueBatchSize > 0 &&
1513  ((ZeSerialize & ZeSerializeBlock) == 0));
1514 }
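// Example: how the ZE_SERIALIZE mask interacts with batching. Running with
// ZE_SERIALIZE=2 sets the ZeSerializeBlock bit, so batching is disabled and
// every command is submitted (and synchronized) eagerly, regardless of the
// configured batch size. The helper below is hypothetical and just mirrors
// the check above:
static bool wouldBatchSketch(pi_uint32 QueueBatchSize) {
  return QueueBatchSize > 0 && ((ZeSerialize & ZeSerializeBlock) == 0);
}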
1515 
1516 // Return the index of the next queue to use based on a
1517 // round robin strategy and the queue group ordinal.
1518 uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal,
1519  uint32_t *QueueIndex) {
1520 
1521  auto CurrentIndex = NextIndex;
1522  ++NextIndex;
1523  if (NextIndex > UpperIndex)
1524  NextIndex = LowerIndex;
1525 
1526  // Find out the right queue group ordinal (first queue might be "main" or
1527  // "link")
1528  auto QueueType = Type;
1529  if (QueueType != queue_type::Compute)
1530  QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine())
1531  ? queue_type::MainCopy
1532  : queue_type::LinkCopy;
1533 
1534  *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal;
1535  // Adjust the index to the L0 queue group since we represent "main" and
1536  // "link" L0 groups with a single copy group ("main" would take the "0"
1537  // index).
1538  auto ZeCommandQueueIndex = CurrentIndex;
1539  if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) {
1540  ZeCommandQueueIndex -= 1;
1541  }
1542  *QueueIndex = ZeCommandQueueIndex;
1543 
1544  return CurrentIndex;
1545 }
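// Worked example of the round-robin mapping above, assuming a copy-engine
// group with one "main" copy engine and two "link" copy engines
// (LowerIndex = 0, UpperIndex = 2; the engine counts are assumed for
// illustration only):
//   CurrentIndex = 0 -> MainCopy, ZeCommandQueueIndex = 0 (main copy group)
//   CurrentIndex = 1 -> LinkCopy, ZeCommandQueueIndex = 0 (1 - 1, link group)
//   CurrentIndex = 2 -> LinkCopy, ZeCommandQueueIndex = 1 (2 - 1, link group)
// after which NextIndex wraps back to LowerIndex.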
1546 
1547 // This function will return one of possibly multiple available native
1548 // queues and the value of the queue group ordinal.
1549 ze_command_queue_handle_t &
1550 _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) {
1551 
1552  // QueueIndex is the proper L0 index.
1553  // Index is the plugin's concept of index, with main and link copy engines in
1554  // one range.
1555  uint32_t QueueIndex;
1556  auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex);
1557 
1558  ze_command_queue_handle_t &ZeQueue = ZeQueues[Index];
1559  if (ZeQueue)
1560  return ZeQueue;
1561 
1562  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
1563  ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal;
1564  ZeCommandQueueDesc.index = QueueIndex;
1565  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1566 
1567  // Evaluate performance of explicit usage for "0" index.
1568  if (QueueIndex != 0) {
1569  ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1570  }
1571 
1572  zePrint("[getZeQueue]: create queue ordinal = %d, index = %d "
1573  "(round robin in [%d, %d])\n",
1574  ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1575  UpperIndex);
1576 
1577  auto ZeResult = ZE_CALL_NOCHECK(
1578  zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1579  &ZeCommandQueueDesc, &ZeQueue));
1580  if (ZeResult) {
1581  die("[L0] getZeQueue: failed to create queue");
1582  }
1583 
1584  return ZeQueue;
1585 }
1586 
1587 // This function will return one of possibly multiple available
1588 // immediate commandlists associated with this Queue.
1589 pi_command_list_ptr_t _pi_queue::pi_queue_group_t::getImmCmdList() {
1590 
1591  uint32_t QueueIndex, QueueOrdinal;
1592  auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
1593 
1594  if (ImmCmdLists[Index] != Queue->CommandListMap.end())
1595  return ImmCmdLists[Index];
1596 
1597  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
1598  ZeCommandQueueDesc.ordinal = QueueOrdinal;
1599  ZeCommandQueueDesc.index = QueueIndex;
1600  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1601 
1602  // Evaluate performance of explicit usage for "0" index.
1603  if (QueueIndex != 0) {
1604  ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1605  }
1606 
1607  zePrint("[getImmCmdList]: create queue ordinal = %d, index = %d "
1608  "(round robin in [%d, %d])\n",
1609  ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1610  UpperIndex);
1611 
1612  ze_command_list_handle_t ZeCommandList;
1613  ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
1614  (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1615  &ZeCommandQueueDesc, &ZeCommandList));
1616  ImmCmdLists[Index] =
1617  Queue->CommandListMap
1618  .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
1619  ZeCommandList, {nullptr, true, nullptr, QueueOrdinal}})
1620  .first;
1621  // Add this commandlist to the cache so it can be destroyed as part of
1622  // piQueueReleaseInternal
1623  auto QueueType = Type;
1624  std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex);
1625  auto &ZeCommandListCache =
1626  QueueType == queue_type::Compute
1627  ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
1628  : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
1629  ZeCommandListCache.push_back(ZeCommandList);
1630 
1631  return ImmCmdLists[Index];
1632 }
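// Example contrasting the two submission models handled in this file: an
// immediate command list executes commands as they are appended, while a
// standard command list must be closed and dispatched to a command queue
// (as done in executeCommandList above). A hypothetical sketch with error
// handling omitted:
static void submissionModelsSketch(ze_command_list_handle_t ZeImmCmdList,
                                   ze_command_list_handle_t ZeStdCmdList,
                                   ze_command_queue_handle_t ZeQueue,
                                   ze_kernel_handle_t ZeKernel,
                                   ze_group_count_t *Dispatch) {
  // Immediate command list: the kernel is submitted right away.
  zeCommandListAppendLaunchKernel(ZeImmCmdList, ZeKernel, Dispatch, nullptr,
                                  0, nullptr);
  // Standard command list: append, close, then execute on a queue.
  zeCommandListAppendLaunchKernel(ZeStdCmdList, ZeKernel, Dispatch, nullptr,
                                  0, nullptr);
  zeCommandListClose(ZeStdCmdList);
  zeCommandQueueExecuteCommandLists(ZeQueue, 1, &ZeStdCmdList, nullptr);
}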
1633 
1634 pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) {
1635  using IsCopy = bool;
1636 
1637  if (UseImmediateCommandLists) {
1638  // When using immediate commandlists there are no open command lists.
1639  return CommandListMap.end();
1640  }
1641 
1642  const auto &ComputeEventList =
1643  ComputeCommandBatch.OpenCommandList->second.EventList;
1644  if (hasOpenCommandList(IsCopy{false}) &&
1645  std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) !=
1646  ComputeEventList.end()) {
1647  return ComputeCommandBatch.OpenCommandList;
1648  }
1649  const auto &CopyEventList =
1650  CopyCommandBatch.OpenCommandList->second.EventList;
1651  if (hasOpenCommandList(IsCopy{true}) &&
1652  std::find(CopyEventList.begin(), CopyEventList.end(), Event) !=
1653  CopyEventList.end()) {
1654  return CopyCommandBatch.OpenCommandList;
1655  }
1656  return CommandListMap.end();
1657 }
1658 
1659 pi_result _pi_queue::executeOpenCommandList(bool IsCopy) {
1660  auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1661  // If there are any commands still in the open command list for this
1662  // queue, then close and execute that command list now.
1663  if (hasOpenCommandList(IsCopy)) {
1665  auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false);
1666  CommandBatch.OpenCommandList = CommandListMap.end();
1667  return Res;
1668  }
1669 
1670  return PI_SUCCESS;
1671 }
1672 
1673 static const bool FilterEventWaitList = [] {
1674  const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST");
1675  const bool RetVal = Ret ? std::stoi(Ret) : 1;
1676  return RetVal;
1677 }();
1678 
1679 pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
1680  pi_uint32 EventListLength, const pi_event *EventList, pi_queue CurQueue,
1681  bool UseCopyEngine) {
1682  this->Length = 0;
1683  this->ZeEventList = nullptr;
1684  this->PiEventList = nullptr;
1685 
1686  try {
1687  if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) {
1688  this->ZeEventList = new ze_event_handle_t[EventListLength + 1];
1689  this->PiEventList = new pi_event[EventListLength + 1];
1690  } else if (EventListLength > 0) {
1691  this->ZeEventList = new ze_event_handle_t[EventListLength];
1692  this->PiEventList = new pi_event[EventListLength];
1693  }
1694 
1695  pi_uint32 TmpListLength = 0;
1696 
1697  if (EventListLength > 0) {
1698  for (pi_uint32 I = 0; I < EventListLength; I++) {
1699  PI_ASSERT(EventList[I] != nullptr, PI_ERROR_INVALID_VALUE);
1700  {
1701  std::shared_lock Lock(EventList[I]->Mutex);
1702  if (EventList[I]->Completed)
1703  continue;
1704 
1705  // Poll the host-visible events.
1706  auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
1707  if (FilterEventWaitList && HostVisibleEvent) {
1708  auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus,
1709  (HostVisibleEvent->ZeEvent));
1710  if (Res == ZE_RESULT_SUCCESS) {
1711  // Event has already completed, don't put it into the list
1712  continue;
1713  }
1714  }
1715  }
1716 
1717  auto Queue = EventList[I]->Queue;
1718  if (Queue) {
1719  // The caller of createAndRetainPiZeEventList must already hold
1720  // a lock of the CurQueue. Additionally lock the Queue if it
1721  // is different from CurQueue.
1722  // TODO: rework this to avoid deadlock when another thread is
1723  // locking the same queues but in a different order.
1724  auto Lock = ((Queue == CurQueue) ? std::unique_lock<pi_shared_mutex>()
1725  : std::unique_lock(Queue->Mutex));
1726 
1727  // If the event that is going to be waited is in an open batch
1728  // different from where this next command is going to be added,
1729  // then we have to force execute of that open command-list
1730  // to avoid deadlocks.
1731  //
1732  const auto &OpenCommandList =
1733  Queue->eventOpenCommandList(EventList[I]);
1734  if (OpenCommandList != Queue->CommandListMap.end()) {
1735 
1736  if (Queue == CurQueue &&
1737  OpenCommandList->second.isCopy(Queue) == UseCopyEngine) {
1738  // Don't force execute the batch yet since the new command
1739  // is going to the same open batch as the dependent event.
1740  } else {
1741  if (auto Res = Queue->executeOpenCommandList(
1742  OpenCommandList->second.isCopy(Queue)))
1743  return Res;
1744  }
1745  }
1746  } else {
1747  // There is a dependency on an interop-event.
1748  // Similarly to the above, to avoid deadlocks, ensure that
1749  // execution of all prior commands in the current command-
1750  // batch is visible to the host. This may not be the case
1751  // when we intended to have only last command in the batch
1752  // produce host-visible event, e.g.
1753  //
1754  // event0 = interop event
1755  // event1 = command1 (already in batch, no deps)
1756  // event2 = command2 (is being added, dep on event0)
1757  // event3 = signal host-visible event for the batch
1758  // event1.wait()
1759  // event0.signal()
1760  //
1761  // Make sure that event1.wait() will wait for a host-visible
1762  // event that is signalled before the command2 is enqueued.
1763  if (EventsScope != AllHostVisible) {
1764  CurQueue->executeAllOpenCommandLists();
1765  }
1766  }
1767 
1768  std::shared_lock Lock(EventList[I]->Mutex);
1769  this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent;
1770  this->PiEventList[TmpListLength] = EventList[I];
1771  TmpListLength += 1;
1772  }
1773  }
1774 
1775  // For in-order queues, every command should be executed only after the
1776  // previous command has finished. The event associated with the last
1777  // enqueued command is added into the waitlist to ensure in-order semantics.
1778  if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) {
1779 
1780  // Ensure LastCommandEvent's batch is submitted if it is different
1781  // from the one this command is going to.
1782  const auto &OpenCommandList =
1783  CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent);
1784  if (OpenCommandList != CurQueue->CommandListMap.end() &&
1785  OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) {
1786 
1787  if (auto Res = CurQueue->executeOpenCommandList(
1788  OpenCommandList->second.isCopy(CurQueue)))
1789  return Res;
1790  }
1791  std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex);
1792  this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent;
1793  this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent;
1794  TmpListLength += 1;
1795  }
1796 
1797  this->Length = TmpListLength;
1798 
1799  } catch (...) {
1800  return PI_ERROR_OUT_OF_HOST_MEMORY;
1801  }
1802 
1803  for (pi_uint32 I = 0; I < this->Length; I++) {
1804  PI_CALL(piEventRetain(this->PiEventList[I]));
1805  }
1806 
1807  return PI_SUCCESS;
1808 }
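// Example of how a wait list built by createAndRetainPiZeEventList is
// typically consumed: the Length/ZeEventList pair feeds directly into the
// Level Zero append APIs. A hypothetical sketch (handles assumed, error
// handling omitted):
//
//   _pi_ze_event_list_t WaitList;
//   WaitList.createAndRetainPiZeEventList(NumEventsInWaitList, EventWaitList,
//                                         Queue, /*UseCopyEngine=*/false);
//   ZE_CALL(zeCommandListAppendLaunchKernel,
//           (ZeCommandList, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
//            WaitList.Length, WaitList.ZeEventList));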
1809 
1810 static void printZeEventList(const _pi_ze_event_list_t &PiZeEventList) {
1811  zePrint(" NumEventsInWaitList %d:", PiZeEventList.Length);
1812 
1813  for (pi_uint32 I = 0; I < PiZeEventList.Length; I++) {
1814  zePrint(" %#lx", pi_cast<std::uintptr_t>(PiZeEventList.ZeEventList[I]));
1815  }
1816 
1817  zePrint("\n");
1818 }
1819 
1820 pi_result _pi_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList(
1821  std::list<pi_event> &EventsToBeReleased) {
1822  // Acquire the lock before reading the length and list fields.
1823  // Copy the needed data locally and reset the fields, then
1824  // release the lock.
1825  // Only then do we do the actual actions to release and destroy,
1826  // holding the lock for the minimum time necessary.
1827  pi_uint32 LocLength = 0;
1828  ze_event_handle_t *LocZeEventList = nullptr;
1829  pi_event *LocPiEventList = nullptr;
1830 
1831  {
1832  // Acquire the lock and copy the fields locally.
1833  // The lock automatically releases when this scope ends.
1834  std::scoped_lock lock(this->PiZeEventListMutex);
1835 
1836  LocLength = Length;
1837  LocZeEventList = ZeEventList;
1838  LocPiEventList = PiEventList;
1839 
1840  Length = 0;
1841  ZeEventList = nullptr;
1842  PiEventList = nullptr;
1843 
1844  // release lock by ending scope.
1845  }
1846 
1847  for (pi_uint32 I = 0; I < LocLength; I++) {
1848  // Add the event to be released to the list
1849  EventsToBeReleased.push_back(LocPiEventList[I]);
1850  }
1851 
1852  if (LocZeEventList != nullptr) {
1853  delete[] LocZeEventList;
1854  }
1855  if (LocPiEventList != nullptr) {
1856  delete[] LocPiEventList;
1857  }
1858 
1859  return PI_SUCCESS;
1860 }
1861 
1862 extern "C" {
1863 
1864 // Forward declarations
1866 
1867 static ze_result_t
1868 checkUnresolvedSymbols(ze_module_handle_t ZeModule,
1869  ze_module_build_log_handle_t *ZeBuildLog);
1870 
1871 // This function will ensure compatibility with both Linux and Windows for
1872 // setting environment variables.
1873 static bool setEnvVar(const char *name, const char *value) {
1874 #ifdef _WIN32
1875  int Res = _putenv_s(name, value);
1876 #else
1877  int Res = setenv(name, value, 1);
1878 #endif
1879  if (Res != 0) {
1880  zePrint(
1881  "Level Zero plugin was unable to set the environment variable: %s\n",
1882  name);
1883  return false;
1884  }
1885  return true;
1886 }
1887 
1888 static class ZeUSMImportExtension {
1889  // Pointers to functions that import/release host memory into USM
1890  ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t hDriver,
1891  void *, size_t);
1892  ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t, void *);
1893 
1894 public:
1895  // Whether user has requested Import/Release, and platform supports it.
1896  bool Enabled;
1897 
1898  ZeUSMImportExtension() : Enabled{false} {}
1899 
1900  void setZeUSMImport(pi_platform Platform) {
1901  // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting
1902  // host ptr import during buffer creation.
1903  const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT");
1904  if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0)
1905  return;
1906 
1907  // Check if USM hostptr import feature is available.
1908  ze_driver_handle_t driverHandle = Platform->ZeDriver;
1909  if (ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress,
1910  (driverHandle, "zexDriverImportExternalPointer",
1911  reinterpret_cast<void **>(
1912  &zexDriverImportExternalPointer))) == 0) {
1914  zeDriverGetExtensionFunctionAddress,
1915  (driverHandle, "zexDriverReleaseImportedPointer",
1916  reinterpret_cast<void **>(&zexDriverReleaseImportedPointer)));
1917  // Hostptr import/release is turned on because it has been requested
1918  // by the env var, and this platform supports the APIs.
1919  Enabled = true;
1920  // Hostptr import is only possible if piMemBufferCreate receives a
1921  // hostptr as an argument. The SYCL runtime passes a host ptr
1922  // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on.
1923  setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1");
1924  }
1925  }
1926  void doZeUSMImport(ze_driver_handle_t driverHandle, void *HostPtr,
1927  size_t Size) {
1928  ZE_CALL_NOCHECK(zexDriverImportExternalPointer,
1929  (driverHandle, HostPtr, Size));
1930  }
1931  void doZeUSMRelease(ze_driver_handle_t driverHandle, void *HostPtr) {
1932  ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (driverHandle, HostPtr));
1933  }
1934 } ZeUSMImport;
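// Example usage of the extension wrapper above. With SYCL_USM_HOSTPTR_IMPORT=1
// and a driver exposing zexDriverImportExternalPointer, a host allocation can
// be imported before it is used for copies and released afterwards. A
// hypothetical sketch (Platform and Size assumed, error handling omitted):
//
//   void *HostPtr = malloc(Size);
//   if (ZeUSMImport.Enabled)
//     ZeUSMImport.doZeUSMImport(Platform->ZeDriver, HostPtr, Size);
//   // ... use HostPtr as the source/destination of copies ...
//   if (ZeUSMImport.Enabled)
//     ZeUSMImport.doZeUSMRelease(Platform->ZeDriver, HostPtr);
//   free(HostPtr);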
1935 
1936 pi_result _pi_platform::initialize() {
1937  // Cache driver properties
1938  ZeStruct<ze_driver_properties_t> ZeDriverProperties;
1939  ZE_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties));
1940  uint32_t DriverVersion = ZeDriverProperties.driverVersion;
1941  // Intel Level-Zero GPU driver stores version as:
1942  // | 31 - 24 | 23 - 16 | 15 - 0 |
1943  // | Major | Minor | Build |
1944  auto VersionMajor = std::to_string((DriverVersion & 0xFF000000) >> 24);
1945  auto VersionMinor = std::to_string((DriverVersion & 0x00FF0000) >> 16);
1946  auto VersionBuild = std::to_string(DriverVersion & 0x0000FFFF);
1947  ZeDriverVersion = VersionMajor + "." + VersionMinor + "." + VersionBuild;
1948 
1949  ZE_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion));
1950  ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." +
1951  std::to_string(ZE_MINOR_VERSION(ZeApiVersion));
1952 
1953  // Cache driver extension properties
1954  uint32_t Count = 0;
1955  ZE_CALL(zeDriverGetExtensionProperties, (ZeDriver, &Count, nullptr));
1956 
1957  std::vector<ze_driver_extension_properties_t> zeExtensions(Count);
1958 
1959  ZE_CALL(zeDriverGetExtensionProperties,
1960  (ZeDriver, &Count, zeExtensions.data()));
1961 
1962  for (auto extension : zeExtensions) {
1963  // Check if global offset extension is available
1964  if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME,
1965  strlen(ZE_GLOBAL_OFFSET_EXP_NAME) + 1) == 0) {
1966  if (extension.version == ZE_GLOBAL_OFFSET_EXP_VERSION_1_0) {
1967  PiDriverGlobalOffsetExtensionFound = true;
1968  }
1969  }
1970  // Check if extension is available for "static linking" (compiling multiple
1971  // SPIR-V modules together into one Level Zero module).
1972  if (strncmp(extension.name, ZE_MODULE_PROGRAM_EXP_NAME,
1973  strlen(ZE_MODULE_PROGRAM_EXP_NAME) + 1) == 0) {
1974  if (extension.version == ZE_MODULE_PROGRAM_EXP_VERSION_1_0) {
1975  PiDriverModuleProgramExtensionFound = true;
1976  }
1977  }
1978  zeDriverExtensionMap[extension.name] = extension.version;
1979  }
1980 
1981  // Check if import user ptr into USM feature has been requested.
1982  // If yes, then set up L0 API pointers if the platform supports it.
1983  ZeUSMImport.setZeUSMImport(this);
1984 
1985  return PI_SUCCESS;
1986 }
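// Worked example of the version decoding above, for a made-up
// DriverVersion = 0x0101456A:
//   Major = (0x0101456A & 0xFF000000) >> 24 = 1
//   Minor = (0x0101456A & 0x00FF0000) >> 16 = 1
//   Build =  0x0101456A & 0x0000FFFF        = 17770
// so ZeDriverVersion is the string "1.1.17770".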
1987 
1988 pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
1989  pi_uint32 *NumPlatforms) {
1990 
1991  static const char *PiTrace = std::getenv("SYCL_PI_TRACE");
1992  static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0;
1993  if (PiTraceValue == -1 || PiTraceValue == 2) { // Means print all PI traces
1994  PrintPiTrace = true;
1995  }
1996 
1997  static std::once_flag ZeCallCountInitialized;
1998  try {
1999  std::call_once(ZeCallCountInitialized, []() {
2000  if (ZeDebug & ZE_DEBUG_CALL_COUNT) {
2001  ZeCallCount = new std::map<const char *, int>;
2002  }
2003  });
2004  } catch (const std::bad_alloc &) {
2005  return PI_ERROR_OUT_OF_HOST_MEMORY;
2006  } catch (...) {
2007  return PI_ERROR_UNKNOWN;
2008  }
2009 
2010  if (NumEntries == 0 && Platforms != nullptr) {
2011  return PI_ERROR_INVALID_VALUE;
2012  }
2013  if (Platforms == nullptr && NumPlatforms == nullptr) {
2014  return PI_ERROR_INVALID_VALUE;
2015  }
2016 
2017  // Setting these environment variables before running zeInit will enable the
2018  // validation layer in the Level Zero loader.
2019  if (ZeDebug & ZE_DEBUG_VALIDATION) {
2020  setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1");
2021  setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1");
2022  }
2023 
2024  // Enable SYSMAN support for obtaining the PCI address
2025  // and maximum memory bandwidth.
2026  if (getenv("SYCL_ENABLE_PCI") != nullptr) {
2027  setEnvVar("ZES_ENABLE_SYSMAN", "1");
2028  }
2029 
2030  // TODO: We can still safely recover if something goes wrong during the init.
2031  // Implement handling segfault using sigaction.
2032 
2033  // We must only initialize the driver once, even if piPlatformsGet() is
2034  // called multiple times. Declaring the return value as "static" ensures
2035  // that zeInit is called only once.
2036  static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0));
2037 
2038  // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms.
2039  if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
2040  PI_ASSERT(NumPlatforms != 0, PI_ERROR_INVALID_VALUE);
2041  *NumPlatforms = 0;
2042  return PI_SUCCESS;
2043  }
2044 
2045  if (ZeResult != ZE_RESULT_SUCCESS) {
2046  zePrint("zeInit: Level Zero initialization failure\n");
2047  return mapError(ZeResult);
2048  }
2049 
2050  // Cache pi_platforms for reuse in the future.
2051  // It solves two problems:
2052  // 1. sycl::platform equality issue; we always return the same pi_platform.
2053  // 2. performance; we can save time by immediately returning from the cache.
2054  //
2055 
2056  const std::lock_guard<sycl::detail::SpinLock> Lock{*PiPlatformsCacheMutex};
2057  if (!PiPlatformCachePopulated) {
2058  try {
2059  // Level Zero does not have a concept of Platforms, but the Level Zero
2060  // driver is the closest match.
2061  uint32_t ZeDriverCount = 0;
2062  ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
2063  if (ZeDriverCount == 0) {
2064  PiPlatformCachePopulated = true;
2065  } else {
2066  std::vector<ze_driver_handle_t> ZeDrivers;
2067  ZeDrivers.resize(ZeDriverCount);
2068 
2069  ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data()));
2070  for (uint32_t I = 0; I < ZeDriverCount; ++I) {
2071  pi_platform Platform = new _pi_platform(ZeDrivers[I]);
2072  pi_result Result = Platform->initialize();
2073  if (Result != PI_SUCCESS) {
2074  return Result;
2075  }
2076  // Save a copy in the cache for future uses.
2077  PiPlatformsCache->push_back(Platform);
2078  }
2079  PiPlatformCachePopulated = true;
2080  }
2081  } catch (const std::bad_alloc &) {
2082  return PI_ERROR_OUT_OF_HOST_MEMORY;
2083  } catch (...) {
2084  return PI_ERROR_UNKNOWN;
2085  }
2086  }
2087 
2088  // Populate returned platforms from the cache.
2089  if (Platforms) {
2090  PI_ASSERT(NumEntries <= PiPlatformsCache->size(),
2091  PI_ERROR_INVALID_PLATFORM);
2092  std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms);
2093  }
2094 
2095  if (NumPlatforms) {
2096  *NumPlatforms = PiPlatformsCache->size();
2097  }
2098 
2099  zePrint("Using events scope: %s\n",
2100  EventsScope == AllHostVisible ? "all host-visible"
2101  : EventsScope == OnDemandHostVisibleProxy
2102  ? "on demand host-visible proxy"
2103  : "only last command in a batch is host-visible");
2104  return PI_SUCCESS;
2105 }
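// Example of the usual two-call idiom for this entry point: query the count
// first, then fetch the cached platform handles (hypothetical sketch, error
// handling omitted):
//
//   pi_uint32 NumPlatforms = 0;
//   piPlatformsGet(0, nullptr, &NumPlatforms);
//   std::vector<pi_platform> Platforms(NumPlatforms);
//   piPlatformsGet(NumPlatforms, Platforms.data(), nullptr);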
2106 
2107 pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName,
2108  size_t ParamValueSize, void *ParamValue,
2109  size_t *ParamValueSizeRet) {
2110 
2111  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2112 
2113  zePrint("==========================\n");
2114  zePrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str());
2115  zePrint("==========================\n");
2116 
2117  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2118 
2119  switch (ParamName) {
2120  case PI_PLATFORM_INFO_NAME:
2121  // TODO: Query Level Zero driver when relevant info is added there.
2122  return ReturnValue("Intel(R) Level-Zero");
2123  case PI_PLATFORM_INFO_VENDOR:
2124  // TODO: Query Level Zero driver when relevant info is added there.
2125  return ReturnValue("Intel(R) Corporation");
2126  case PI_PLATFORM_INFO_EXTENSIONS:
2127  // Convention adopted from OpenCL:
2128  // "Returns a space-separated list of extension names (the extension
2129  // names themselves do not contain any spaces) supported by the platform.
2130  // Extensions defined here must be supported by all devices associated
2131  // with this platform."
2132  //
2133  // TODO: Check the common extensions supported by all connected devices and
2134  // return them. For now, hardcoding some extensions we know are supported by
2135  // all Level Zero devices.
2136  return ReturnValue(ZE_SUPPORTED_EXTENSIONS);
2137  case PI_PLATFORM_INFO_PROFILE:
2138  // TODO: figure out what this means and how it is used
2139  return ReturnValue("FULL_PROFILE");
2140  case PI_PLATFORM_INFO_VERSION:
2141  // TODO: this should query zeDriverGetDriverVersion
2142  // but we don't yet have the driver handle here.
2143  //
2144  // From OpenCL 2.1: "This version string has the following format:
2145  // OpenCL<space><major_version.minor_version><space><platform-specific
2146  // information>." Follow the same notation here.
2147  //
2148  return ReturnValue(Platform->ZeDriverApiVersion.c_str());
2149  default:
2150  zePrint("piPlatformGetInfo: unrecognized ParamName\n");
2151  return PI_ERROR_INVALID_VALUE;
2152  }
2153 
2154  return PI_SUCCESS;
2155 }
2156 
2157 pi_result piextPlatformGetNativeHandle(pi_platform Platform,
2158  pi_native_handle *NativeHandle) {
2159  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2160  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2161 
2162  auto ZeDriver = pi_cast<ze_driver_handle_t *>(NativeHandle);
2163  // Extract the Level Zero driver handle from the given PI platform
2164  *ZeDriver = Platform->ZeDriver;
2165  return PI_SUCCESS;
2166 }
2167 
2168 pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
2169  pi_platform *Platform) {
2170  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2171  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2172 
2173  auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
2174 
2175  pi_uint32 NumPlatforms = 0;
2176  pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms);
2177  if (Res != PI_SUCCESS) {
2178  return Res;
2179  }
2180 
2181  if (NumPlatforms) {
2182  std::vector<pi_platform> Platforms(NumPlatforms);
2183  PI_CALL(piPlatformsGet(NumPlatforms, Platforms.data(), nullptr));
2184 
2185  // The SYCL spec requires that the set of platforms must remain fixed for
2186  // the duration of the application's execution. We assume that we found all
2187  // of the Level Zero drivers when we initialized the platform cache, so the
2188  // "NativeHandle" must already be in the cache. If it is not, this must not
2189  // be a valid Level Zero driver.
2190  for (const pi_platform &CachedPlatform : Platforms) {
2191  if (CachedPlatform->ZeDriver == ZeDriver) {
2192  *Platform = CachedPlatform;
2193  return PI_SUCCESS;
2194  }
2195  }
2196  }
2197 
2198  return PI_ERROR_INVALID_VALUE;
2199 }
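// Example: round-tripping a platform through its native handle. Because
// platforms are cached, the lookup above returns the very same pi_platform
// that produced the handle (hypothetical sketch, error handling omitted):
//
//   pi_native_handle Native;
//   piextPlatformGetNativeHandle(Platform, &Native);
//   pi_platform SamePlatform;
//   piextPlatformCreateWithNativeHandle(Native, &SamePlatform);
//   assert(SamePlatform == Platform);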
2200 
2201 // Get the cached PI device created for the L0 device handle.
2202 // Return NULL if no such PI device is found.
2203 pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
2204 
2205  pi_result Res = populateDeviceCacheIfNeeded();
2206  if (Res != PI_SUCCESS) {
2207  return nullptr;
2208  }
2209 
2210  // TODO: our sub-sub-device representation is currently [Level-Zero device
2211  // handle + Level-Zero compute group/engine index], so there is no longer a
2212  // 1:1 mapping from L0 device handle to PI device as assumed in this
2213  // function. Until Level-Zero adds a unique ze_device_handle_t for
2214  // sub-sub-devices, here we filter out PI sub-sub-devices.
2215  std::shared_lock Lock(PiDevicesCacheMutex);
2216  auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(),
2217  [&](std::unique_ptr<_pi_device> &D) {
2218  return D.get()->ZeDevice == ZeDevice &&
2219  (D.get()->RootDevice == nullptr ||
2220  D.get()->RootDevice->RootDevice == nullptr);
2221  });
2222  if (it != PiDevicesCache.end()) {
2223  return (*it).get();
2224  }
2225  return nullptr;
2226 }
2227 
2228 pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
2229  pi_uint32 NumEntries, pi_device *Devices,
2230  pi_uint32 *NumDevices) {
2231 
2232  PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2233 
2234  pi_result Res = Platform->populateDeviceCacheIfNeeded();
2235  if (Res != PI_SUCCESS) {
2236  return Res;
2237  }
2238 
2239  // Filter available devices based on input DeviceType.
2240  std::vector<pi_device> MatchedDevices;
2241  std::shared_lock Lock(Platform->PiDevicesCacheMutex);
2242  for (auto &D : Platform->PiDevicesCache) {
2243  // Only ever return root-devices from piDevicesGet, but the
2244  // devices cache also keeps sub-devices.
2245  if (D->isSubDevice())
2246  continue;
2247 
2248  bool Matched = false;
2249  switch (DeviceType) {
2250  case PI_DEVICE_TYPE_ALL:
2251  Matched = true;
2252  break;
2253  case PI_DEVICE_TYPE_GPU:
2254  case PI_DEVICE_TYPE_DEFAULT:
2255  Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU);
2256  break;
2257  case PI_DEVICE_TYPE_CPU:
2258  Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_CPU);
2259  break;
2260  case PI_DEVICE_TYPE_ACC:
2261  Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_MCA ||
2262  D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_FPGA);
2263  break;
2264  default:
2265  Matched = false;
2266  zePrint("Unknown device type\n");
2267  break;
2268  }
2269  if (Matched)
2270  MatchedDevices.push_back(D.get());
2271  }
2272 
2273  uint32_t ZeDeviceCount = MatchedDevices.size();
2274 
2275  if (NumDevices)
2276  *NumDevices = ZeDeviceCount;
2277 
2278  if (NumEntries == 0) {
2279  // Devices should be nullptr when querying the number of devices.
2280  PI_ASSERT(Devices == nullptr, PI_ERROR_INVALID_VALUE);
2281  return PI_SUCCESS;
2282  }
2283 
2284  // Return the devices from the cache.
2285  if (Devices) {
2286  PI_ASSERT(NumEntries <= ZeDeviceCount, PI_ERROR_INVALID_DEVICE);
2287  std::copy_n(MatchedDevices.begin(), NumEntries, Devices);
2288  }
2289 
2290  return PI_SUCCESS;
2291 }
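// Example: enumerating only the GPU root-devices of a platform with the
// same two-call idiom (hypothetical sketch, error handling omitted):
//
//   pi_uint32 NumGpus = 0;
//   piDevicesGet(Platform, PI_DEVICE_TYPE_GPU, 0, nullptr, &NumGpus);
//   std::vector<pi_device> Gpus(NumGpus);
//   piDevicesGet(Platform, PI_DEVICE_TYPE_GPU, NumGpus, Gpus.data(), nullptr);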
2292 
2293 // Check the device cache and load it if necessary.
2294 pi_result _pi_platform::populateDeviceCacheIfNeeded() {
2295  std::scoped_lock Lock(PiDevicesCacheMutex);
2296 
2297  if (DeviceCachePopulated) {
2298  return PI_SUCCESS;
2299  }
2300 
2301  uint32_t ZeDeviceCount = 0;
2302  ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, nullptr));
2303 
2304  try {
2305  std::vector<ze_device_handle_t> ZeDevices(ZeDeviceCount);
2306  ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, ZeDevices.data()));
2307 
2308  for (uint32_t I = 0; I < ZeDeviceCount; ++I) {
2309  std::unique_ptr<_pi_device> Device(new _pi_device(ZeDevices[I], this));
2310  pi_result Result = Device->initialize();
2311  if (Result != PI_SUCCESS) {
2312  return Result;
2313  }
2314 
2315  // Additionally we need to cache all sub-devices too, such that they
2316  // are readily visible to the piextDeviceCreateWithNativeHandle.
2317  //
2318  pi_uint32 SubDevicesCount = 0;
2319  ZE_CALL(zeDeviceGetSubDevices,
2320  (Device->ZeDevice, &SubDevicesCount, nullptr));
2321 
2322  auto ZeSubdevices = new ze_device_handle_t[SubDevicesCount];
2323  ZE_CALL(zeDeviceGetSubDevices,
2324  (Device->ZeDevice, &SubDevicesCount, ZeSubdevices));
2325 
2326  // Wrap the Level Zero sub-devices into PI sub-devices, and add them to
2327  // cache.
2328  for (uint32_t I = 0; I < SubDevicesCount; ++I) {
2329  std::unique_ptr<_pi_device> PiSubDevice(
2330  new _pi_device(ZeSubdevices[I], this, Device.get()));
2331  pi_result Result = PiSubDevice->initialize();
2332  if (Result != PI_SUCCESS) {
2333  delete[] ZeSubdevices;
2334  return Result;
2335  }
2336 
2337  // collect all the ordinals for the sub-sub-devices
2338  std::vector<int> Ordinals;
2339 
2340  uint32_t numQueueGroups = 0;
2341  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
2342  (PiSubDevice->ZeDevice, &numQueueGroups, nullptr));
2343  if (numQueueGroups == 0) {
2344  return PI_ERROR_UNKNOWN;
2345  }
2346  std::vector<ze_command_queue_group_properties_t> QueueGroupProperties(
2347  numQueueGroups);
2348  ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
2349  (PiSubDevice->ZeDevice, &numQueueGroups,
2350  QueueGroupProperties.data()));
2351 
2352  for (uint32_t i = 0; i < numQueueGroups; i++) {
2353  if (QueueGroupProperties[i].flags &
2354  ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE &&
2355  QueueGroupProperties[i].numQueues > 1) {
2356  Ordinals.push_back(i);
2357  }
2358  }
2359 
2360  // Create PI sub-sub-devices with the sub-device for all the ordinals.
2361  // Each {ordinal, index} points to a specific CCS which constructs
2362  // a sub-sub-device at this point.
2363  // FIXME: Level Zero creates multiple PiDevices for a single physical
2364  // device when sub-device is partitioned into sub-sub-devices.
2365  // Sub-sub-device is technically a command queue and we should not build a
2366  // program for each command queue. PiDevice is probably not the right
2367  // abstraction for a Level Zero command queue.
2368  for (uint32_t J = 0; J < Ordinals.size(); ++J) {
2369  for (uint32_t K = 0; K < QueueGroupProperties[Ordinals[J]].numQueues;
2370  ++K) {
2371  std::unique_ptr<_pi_device> PiSubSubDevice(
2372  new _pi_device(ZeSubdevices[I], this, PiSubDevice.get()));
2373  pi_result Result = PiSubSubDevice->initialize(Ordinals[J], K);
2374  if (Result != PI_SUCCESS) {
2375  return Result;
2376  }
2377 
2378  // save pointers to sub-sub-devices for quick retrieval in the
2379  // future.
2380  PiSubDevice->SubDevices.push_back(PiSubSubDevice.get());
2381  PiDevicesCache.push_back(std::move(PiSubSubDevice));
2382  }
2383  }
2384 
2385  // save pointers to sub-devices for quick retrieval in the future.
2386  Device->SubDevices.push_back(PiSubDevice.get());
2387  PiDevicesCache.push_back(std::move(PiSubDevice));
2388  }
2389  delete[] ZeSubdevices;
2390 
2391  // Save the root device in the cache for future uses.
2392  PiDevicesCache.push_back(std::move(Device));
2393  }
2394  } catch (const std::bad_alloc &) {
2395  return PI_ERROR_OUT_OF_HOST_MEMORY;
2396  } catch (...) {
2397  return PI_ERROR_UNKNOWN;
2398  }
2399  DeviceCachePopulated = true;
2400  return PI_SUCCESS;
2401 }
2402 
2403 pi_result piDeviceRetain(pi_device Device) {
2404  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2405 
2406  // The root-device ref-count remains unchanged (always 1).
2407  if (Device->isSubDevice()) {
2408  Device->RefCount.increment();
2409  }
2410  return PI_SUCCESS;
2411 }
2412 
2413 pi_result piDeviceRelease(pi_device Device) {
2414  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2415 
2416  // Root devices are destroyed during the piTearDown process.
2417  if (Device->isSubDevice()) {
2418  if (Device->RefCount.decrementAndTest()) {
2419  delete Device;
2420  }
2421  }
2422 
2423  return PI_SUCCESS;
2424 }
2425 
2426 pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
2427  size_t ParamValueSize, void *ParamValue,
2428  size_t *ParamValueSizeRet) {
2429 
2430  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2431 
2432  ze_device_handle_t ZeDevice = Device->ZeDevice;
2433 
2434  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2435 
2436  switch (ParamName) {
2437  case PI_DEVICE_INFO_TYPE: {
2438  switch (Device->ZeDeviceProperties->type) {
2439  case ZE_DEVICE_TYPE_GPU:
2440  return ReturnValue(PI_DEVICE_TYPE_GPU);
2441  case ZE_DEVICE_TYPE_CPU:
2442  return ReturnValue(PI_DEVICE_TYPE_CPU);
2443  case ZE_DEVICE_TYPE_MCA:
2444  case ZE_DEVICE_TYPE_FPGA:
2445  return ReturnValue(PI_DEVICE_TYPE_ACC);
2446  default:
2447  zePrint("This device type is not supported\n");
2448  return PI_ERROR_INVALID_VALUE;
2449  }
2450  }
2451  case PI_DEVICE_INFO_PARENT_DEVICE:
2452  return ReturnValue(Device->RootDevice);
2453  case PI_DEVICE_INFO_PLATFORM:
2454  return ReturnValue(Device->Platform);
2455  case PI_DEVICE_INFO_VENDOR_ID:
2456  return ReturnValue(pi_uint32{Device->ZeDeviceProperties->vendorId});
2457  case PI_DEVICE_INFO_UUID:
2458  // Intel extension for device UUID. This returns the UUID as
2459  // std::array<std::byte, 16>. For details about this extension,
2460  // see sycl/doc/extensions/supported/sycl_ext_intel_device_info.md.
2461  return ReturnValue(Device->ZeDeviceProperties->uuid.id);
2462  case PI_DEVICE_INFO_ATOMIC_64:
2463  return ReturnValue(pi_bool{Device->ZeDeviceModuleProperties->flags &
2464  ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS});
2465  case PI_DEVICE_INFO_EXTENSIONS: {
2466  // Convention adopted from OpenCL:
2467  // "Returns a space separated list of extension names (the extension
2468  // names themselves do not contain any spaces) supported by the device."
2469  //
2470  // TODO: Use proper mechanism to get this information from Level Zero after
2471  // it is added to Level Zero.
2472  // Hardcoding the few we know are supported by the current hardware.
2473  //
2474  //
2475  std::string SupportedExtensions;
2476 
2477  // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIR-V support. Core
2478  // feature in >OpenCL 2.1
2479  // cl_khr_subgroups - Extension adds support for implementation-controlled
2480  // subgroups.
2481  // cl_intel_subgroups - Extension adds subgroup features, defined by Intel.
2482  // cl_intel_subgroups_short - Extension adds subgroup functions described in
2483  // the cl_intel_subgroups extension to support 16-bit integer data types
2484  // for performance.
2485  // cl_intel_required_subgroup_size - Extension to allow programmers to
2486  // optionally specify the required subgroup size for a kernel function.
2487  // cl_khr_fp16 - Optional half floating-point support.
2488  // cl_khr_fp64 - Support for double floating-point precision.
2489  // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional
2490  // extensions that implement atomic operations on 64-bit signed and
2491  // unsigned integers to locations in __global and __local memory.
2492  // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory
2493  // objects.
2494  //
2495  // Hardcoding some extensions we know are supported by all Level Zero
2496  // devices.
2497  SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS);
2498  if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16)
2499  SupportedExtensions += ("cl_khr_fp16 ");
2500  if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64)
2501  SupportedExtensions += ("cl_khr_fp64 ");
2502  if (Device->ZeDeviceModuleProperties->flags &
2503  ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)
2504  // int64AtomicsSupported indicates support for both.
2505  SupportedExtensions +=
2506  ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics ");
2507  if (Device->ZeDeviceImageProperties->maxImageDims3D > 0)
2508  // Supports reading and writing of images.
2509  SupportedExtensions += ("cl_khr_3d_image_writes ");
2510 
2511  return ReturnValue(SupportedExtensions.c_str());
2512  }
2513  case PI_DEVICE_INFO_NAME:
2514  return ReturnValue(Device->ZeDeviceProperties->name);
2515  // zeModuleCreate allows using root device module for sub-devices:
2516  // > The application must only use the module for the device, or its
2517  // > sub-devices, which was provided during creation.
2518  case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE:
2519  return ReturnValue(PI_FALSE);
2520  case PI_DEVICE_INFO_COMPILER_AVAILABLE:
2521  return ReturnValue(pi_bool{1});
2522  case PI_DEVICE_INFO_LINKER_AVAILABLE:
2523  return ReturnValue(pi_bool{1});
2524  case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: {
2525  pi_uint32 MaxComputeUnits =
2526  Device->ZeDeviceProperties->numEUsPerSubslice *
2527  Device->ZeDeviceProperties->numSubslicesPerSlice *
2528  Device->ZeDeviceProperties->numSlices;
2529  return ReturnValue(pi_uint32{MaxComputeUnits});
2530  }
2531  case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS:
2532  // Level Zero spec defines only three dimensions
2533  return ReturnValue(pi_uint32{3});
2534  case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
2535  return ReturnValue(
2536  pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize});
2537  case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: {
2538  struct {
2539  size_t Arr[3];
2540  } MaxGroupSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
2541  Device->ZeDeviceComputeProperties->maxGroupSizeY,
2542  Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
2543  return ReturnValue(MaxGroupSize);
2544  }
2546  struct {
2547  size_t Arr[3];
2548  } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX,
2549  Device->ZeDeviceComputeProperties->maxGroupCountY,
2550  Device->ZeDeviceComputeProperties->maxGroupCountZ}};
2551  return ReturnValue(MaxGroupCounts);
2552  }
2553  case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY:
2554  return ReturnValue(pi_uint32{Device->ZeDeviceProperties->coreClockRate});
2555  case PI_DEVICE_INFO_ADDRESS_BITS: {
2556  // TODO: To confirm with spec.
2557  return ReturnValue(pi_uint32{64});
2558  }
2559  case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
2560  return ReturnValue(pi_uint64{Device->ZeDeviceProperties->maxMemAllocSize});
2561  case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: {
2562  uint64_t GlobalMemSize = 0;
2563  for (uint32_t I = 0; I < Device->ZeDeviceMemoryProperties->size(); I++) {
2564  GlobalMemSize +=
2565  (*Device->ZeDeviceMemoryProperties.operator->())[I].totalSize;
2566  }
2567  return ReturnValue(pi_uint64{GlobalMemSize});
2568  }
2569  case PI_DEVICE_INFO_LOCAL_MEM_SIZE:
2570  return ReturnValue(
2571  pi_uint64{Device->ZeDeviceComputeProperties->maxSharedLocalMemory});
2572  case PI_DEVICE_INFO_IMAGE_SUPPORT:
2573  return ReturnValue(
2574  pi_bool{Device->ZeDeviceImageProperties->maxImageDims1D > 0});
2575  case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY:
2576  return ReturnValue(pi_bool{(Device->ZeDeviceProperties->flags &
2577  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0});
2578  case PI_DEVICE_INFO_AVAILABLE:
2579  return ReturnValue(pi_bool{ZeDevice ? true : false});
2580  case PI_DEVICE_INFO_VENDOR:
2581  // TODO: Level-Zero does not return vendor's name at the moment
2582  // only the ID.
2583  return ReturnValue("Intel(R) Corporation");
2584  case PI_DEVICE_INFO_DRIVER_VERSION:
2585  return ReturnValue(Device->Platform->ZeDriverVersion.c_str());
2586  case PI_DEVICE_INFO_VERSION:
2587  return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str());
2588  case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
2589  pi_result Res = Device->populateDeviceCacheIfNeeded();
2590  if (Res != PI_SUCCESS) {
2591  return Res;
2592  }
2593  return ReturnValue(pi_uint32{(unsigned int)(Device->SubDevices.size())});
2594  }
2595  case PI_DEVICE_INFO_REFERENCE_COUNT:
2596  return ReturnValue(pi_uint32{Device->RefCount.load()});
2597  case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
2598  // SYCL spec says: if this SYCL device cannot be partitioned into at least
2599  // two sub devices then the returned vector must be empty.
2600  pi_result Res = Device->populateDeviceCacheIfNeeded();
2601  if (Res != PI_SUCCESS) {
2602  return Res;
2603  }
2604 
2605  uint32_t ZeSubDeviceCount = Device->SubDevices.size();
2606  if (ZeSubDeviceCount < 2) {
2607  return ReturnValue(pi_device_partition_property{0});
2608  }
2609  // It is debatable if SYCL sub-device and partitioning APIs sufficient to
2610  // expose Level Zero sub-devices? We start with support of
2611  // "partition_by_affinity_domain" and "next_partitionable" but if that
2612  // doesn't seem to be a good fit we could look at adding a more descriptive
2613  // partitioning type.
2614  struct {
2615  pi_device_partition_property Arr[2];
2616  } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0}};
2617  return ReturnValue(PartitionProperties);
2618  }
2619  case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN:
2620  return ReturnValue(pi_device_affinity_domain{
2621  PI_DEVICE_AFFINITY_DOMAIN_NUMA |
2622  PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE});
2623  case PI_DEVICE_INFO_PARTITION_TYPE: {
2624  if (Device->isSubDevice()) {
2625  struct {
2626  pi_device_partition_property Arr[3];
2627  } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
2628  PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE,
2629  0}};
2630  return ReturnValue(PartitionProperties);
2631  }
2632  // For root-device there is no partitioning to report.
2633  return ReturnValue(pi_device_partition_property{0});
2634  }
2635 
2636  // Everything under here is not supported yet
2637 
2639  return ReturnValue("");
2641  return ReturnValue(pi_bool{true});
2642  case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE:
2643  return ReturnValue(
2644  size_t{Device->ZeDeviceModuleProperties->printfBufferSize});
2646  return ReturnValue("FULL_PROFILE");
2648  // TODO: To find out correct value
2649  return ReturnValue("");
2651  return ReturnValue(pi_queue_properties{
2654  return ReturnValue(
2657  return ReturnValue(pi_bool{true});
2658  case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT:
2659  return ReturnValue(pi_bool{Device->ZeDeviceProperties->flags &
2660  ZE_DEVICE_PROPERTY_FLAG_ECC});
2661  case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION:
2662  return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution});
2663  case PI_DEVICE_INFO_LOCAL_MEM_TYPE:
2664  return ReturnValue(PI_DEVICE_LOCAL_MEM_TYPE_LOCAL);
2665  case PI_DEVICE_INFO_MAX_CONSTANT_ARGS:
2666  return ReturnValue(pi_uint32{64});
2667  case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE:
2668  return ReturnValue(
2669  pi_uint64{Device->ZeDeviceImageProperties->maxImageBufferSize});
2670  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE:
2671  return ReturnValue(PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE);
2672  case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE:
2673  return ReturnValue(
2674  // TODO[1.0]: how to query cache line-size?
2675  pi_uint32{1});
2676  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE:
2677  return ReturnValue(pi_uint64{Device->ZeDeviceCacheProperties->cacheSize});
2678  case PI_DEVICE_INFO_MAX_PARAMETER_SIZE:
2679  return ReturnValue(
2680  size_t{Device->ZeDeviceModuleProperties->maxArgumentsSize});
2681  case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN:
2682  // SYCL/OpenCL spec is vague on what this means exactly, but seems to
2683  // be for "alignment requirement (in bits) for sub-buffer offsets."
2684  // An OpenCL implementation returns 8*128, but Level Zero can do just 8,
2685  // meaning unaligned access for values of types larger than 8 bits.
2686  return ReturnValue(pi_uint32{8});
2687  case PI_DEVICE_INFO_MAX_SAMPLERS:
2688  return ReturnValue(pi_uint32{Device->ZeDeviceImageProperties->maxSamplers});
2689  case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS:
2690  return ReturnValue(
2691  pi_uint32{Device->ZeDeviceImageProperties->maxReadImageArgs});
2692  case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS:
2693  return ReturnValue(
2694  pi_uint32{Device->ZeDeviceImageProperties->maxWriteImageArgs});
2695  case PI_DEVICE_INFO_SINGLE_FP_CONFIG: {
2696  uint64_t SingleFPValue = 0;
2697  ze_device_fp_flags_t ZeSingleFPCapabilities =
2698  Device->ZeDeviceModuleProperties->fp32flags;
2699  if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) {
2700  SingleFPValue |= PI_FP_DENORM;
2701  }
2702  if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) {
2703  SingleFPValue |= PI_FP_INF_NAN;
2704  }
2705  if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) {
2706  SingleFPValue |= PI_FP_ROUND_TO_NEAREST;
2707  }
2708  if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) {
2709  SingleFPValue |= PI_FP_ROUND_TO_ZERO;
2710  }
2711  if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) {
2712  SingleFPValue |= PI_FP_ROUND_TO_INF;
2713  }
2714  if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) {
2715  SingleFPValue |= PI_FP_FMA;
2716  }
2717  if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) {
2718  SingleFPValue |= PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
2719  }
2720  return ReturnValue(pi_uint64{SingleFPValue});
2721  }
2722  case PI_DEVICE_INFO_HALF_FP_CONFIG: {
2723  uint64_t HalfFPValue = 0;
2724  ze_device_fp_flags_t ZeHalfFPCapabilities =
2725  Device->ZeDeviceModuleProperties->fp16flags;
2726  if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) {
2727  HalfFPValue |= PI_FP_DENORM;
2728  }
2729  if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) {
2730  HalfFPValue |= PI_FP_INF_NAN;
2731  }
2732  if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) {
2733  HalfFPValue |= PI_FP_ROUND_TO_NEAREST;
2734  }
2735  if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) {
2736  HalfFPValue |= PI_FP_ROUND_TO_ZERO;
2737  }
2738  if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) {
2739  HalfFPValue |= PI_FP_ROUND_TO_INF;
2740  }
2741  if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) {
2742  HalfFPValue |= PI_FP_FMA;
2743  }
2744  if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) {
2745  HalfFPValue |= PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
2746  }
2747  return ReturnValue(pi_uint64{HalfFPValue});
2748  }
2749  case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: {
2750  uint64_t DoubleFPValue = 0;
2751  ze_device_fp_flags_t ZeDoubleFPCapabilities =
2752  Device->ZeDeviceModuleProperties->fp64flags;
2753  if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) {
2754  DoubleFPValue |= PI_FP_DENORM;
2755  }
2756  if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) {
2757  DoubleFPValue |= PI_FP_INF_NAN;
2758  }
2759  if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) {
2760  DoubleFPValue |= PI_FP_ROUND_TO_NEAREST;
2761  }
2762  if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) {
2763  DoubleFPValue |= PI_FP_ROUND_TO_ZERO;
2764  }
2765  if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) {
2766  DoubleFPValue |= PI_FP_ROUND_TO_INF;
2767  }
2768  if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) {
2769  DoubleFPValue |= PI_FP_FMA;
2770  }
2771  if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) {
2772  DoubleFPValue |= PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
2773  }
2774  return ReturnValue(pi_uint64{DoubleFPValue});
2775  }
2776  case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH:
2777  return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D});
2778  case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT:
2779  return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D});
2780  case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH:
2781  return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D});
2782  case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT:
2783  return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D});
2784  case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH:
2785  return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D});
2786  case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE:
2787  return ReturnValue(
2788  size_t{Device->ZeDeviceImageProperties->maxImageBufferSize});
2789  case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE:
2790  return ReturnValue(
2791  size_t{Device->ZeDeviceImageProperties->maxImageArraySlices});
2792  // Handle SIMD widths.
2793  // TODO: can we do better than this?
2794  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
2795  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR:
2796  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 1);
2797  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
2798  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT:
2799  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2);
2800  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
2801  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT:
2802  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4);
2803  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
2804  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG:
2805  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8);
2806  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
2807  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT:
2808  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4);
2809  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
2810  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE:
2811  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8);
2812  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
2813  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF:
2814  return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2);
2815  case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
2816  // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes);
2817  uint32_t MinSubGroupSize =
2818  Device->ZeDeviceComputeProperties->subGroupSizes[0];
2819  for (uint32_t I = 1;
2820  I < Device->ZeDeviceComputeProperties->numSubGroupSizes; I++) {
2821  if (MinSubGroupSize > Device->ZeDeviceComputeProperties->subGroupSizes[I])
2822  MinSubGroupSize = Device->ZeDeviceComputeProperties->subGroupSizes[I];
2823  }
2824  return ReturnValue(Device->ZeDeviceComputeProperties->maxTotalGroupSize /
2825  MinSubGroupSize);
2826  }
2827  case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
2828  // TODO: Not supported yet. Needs to be updated after support is added.
2829  return ReturnValue(pi_bool{false});
2830  }
2831  case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
2832  // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the
2833  // expected return is size_t datatype. size_t can be 8 bytes of data.
2834  return getInfoArray<uint32_t, size_t>(
2835  Device->ZeDeviceComputeProperties->numSubGroupSizes, ParamValueSize,
2836  ParamValue, ParamValueSizeRet,
2837  Device->ZeDeviceComputeProperties->subGroupSizes);
2838  }
2839  case PI_DEVICE_INFO_IL_VERSION: {
2840  // Set to a space separated list of IL version strings of the form
2841  // <IL_Prefix>_<Major_version>.<Minor_version>.
2842  // "SPIR-V" is a required IL prefix when the cl_khr_il_program extension is
2843  // reported.
2844  uint32_t SpirvVersion =
2845  Device->ZeDeviceModuleProperties->spirvVersionSupported;
2846  uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion);
2847  uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion);
2848 
2849  char SpirvVersionString[50];
2850  int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor,
2851  SpirvVersionMinor);
2852  // The returned string contains only Len characters.
2853  std::string ILVersion(SpirvVersionString, Len);
2854  return ReturnValue(ILVersion.c_str());
2855  }
2856  case PI_DEVICE_INFO_USM_HOST_SUPPORT:
2857  case PI_DEVICE_INFO_USM_DEVICE_SUPPORT:
2858  case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
2859  case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
2860  case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
2861  auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) {
2862  pi_uint64 Capabilities = 0;
2863  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW)
2864  Capabilities |= PI_USM_ACCESS;
2865  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC)
2866  Capabilities |= PI_USM_ATOMIC_ACCESS;
2867  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT)
2868  Capabilities |= PI_USM_CONCURRENT_ACCESS;
2869  if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC)
2870  Capabilities |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
2871  return Capabilities;
2872  };
2873  auto &Props = Device->ZeDeviceMemoryAccessProperties;
2874  switch (ParamName) {
2875  case PI_DEVICE_INFO_USM_HOST_SUPPORT:
2876  return ReturnValue(MapCaps(Props->hostAllocCapabilities));
2877  case PI_DEVICE_INFO_USM_DEVICE_SUPPORT:
2878  return ReturnValue(MapCaps(Props->deviceAllocCapabilities));
2879  case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
2880  return ReturnValue(MapCaps(Props->sharedSingleDeviceAllocCapabilities));
2881  case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
2882  return ReturnValue(MapCaps(Props->sharedCrossDeviceAllocCapabilities));
2883  case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
2884  return ReturnValue(MapCaps(Props->sharedSystemAllocCapabilities));
2885  default:
2886  die("piDeviceGetInfo: enexpected ParamName.");
2887  }
2888  }
2889 
2890  // Intel extensions for GPU information
2891  case PI_DEVICE_INFO_PCI_ADDRESS: {
2892  if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
2893  zePrint("Set SYCL_ENABLE_PCI=1 to obtain PCI data.\n");
2894  return PI_ERROR_INVALID_VALUE;
2895  }
2896  ZesStruct<zes_pci_properties_t> ZeDevicePciProperties;
2897  ZE_CALL(zesDevicePciGetProperties, (ZeDevice, &ZeDevicePciProperties));
2898  constexpr size_t AddressBufferSize = 13;
2899  char AddressBuffer[AddressBufferSize];
2900  std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x",
2901  ZeDevicePciProperties.address.domain,
2902  ZeDevicePciProperties.address.bus,
2903  ZeDevicePciProperties.address.device,
2904  ZeDevicePciProperties.address.function);
2905  return ReturnValue(AddressBuffer);
2906  }
2907  case PI_DEVICE_INFO_GPU_EU_COUNT: {
2908  pi_uint32 count = Device->ZeDeviceProperties->numEUsPerSubslice *
2909  Device->ZeDeviceProperties->numSubslicesPerSlice *
2910  Device->ZeDeviceProperties->numSlices;
2911  return ReturnValue(pi_uint32{count});
2912  }
2913  case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
2914  return ReturnValue(
2915  pi_uint32{Device->ZeDeviceProperties->physicalEUSimdWidth});
2916  case PI_DEVICE_INFO_GPU_SLICES:
2917  return ReturnValue(pi_uint32{Device->ZeDeviceProperties->numSlices});
2918  case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
2919  return ReturnValue(
2920  pi_uint32{Device->ZeDeviceProperties->numSubslicesPerSlice});
2921  case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
2922  return ReturnValue(
2923  pi_uint32{Device->ZeDeviceProperties->numEUsPerSubslice});
2924  case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
2925  return ReturnValue(pi_uint32{Device->ZeDeviceProperties->numThreadsPerEU});
2926  case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
2927  // currently not supported in level zero runtime
2928  return PI_ERROR_INVALID_VALUE;
2930  return PI_ERROR_INVALID_VALUE;
2931 
2932  // TODO: Implement.
2934  default:
2935  zePrint("Unsupported ParamName in piGetDeviceInfo\n");
2936  zePrint("ParamName=%d(0x%x)\n", ParamName, ParamName);
2937  return PI_ERROR_INVALID_VALUE;
2938  }
2939 
2940  return PI_SUCCESS;
2941 }
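// Example: querying a string property with piDeviceGetInfo, sizing the
// buffer first (hypothetical sketch, error handling omitted):
//
//   size_t Size = 0;
//   piDeviceGetInfo(Device, PI_DEVICE_INFO_NAME, 0, nullptr, &Size);
//   std::string Name(Size, '\0');
//   piDeviceGetInfo(Device, PI_DEVICE_INFO_NAME, Size, Name.data(), nullptr);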
2942 
2943 pi_result piDevicePartition(pi_device Device,
2944  const pi_device_partition_property *Properties,
2945  pi_uint32 NumDevices, pi_device *OutDevices,
2946  pi_uint32 *OutNumDevices) {
2947  // Other partitioning ways are not supported by Level Zero
2948  if (Properties[0] != PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN ||
2949  (Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE &&
2950  Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NUMA)) {
2951  return PI_ERROR_INVALID_VALUE;
2952  }
2953 
2954  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2955 
2956  // Devices cache is normally created in piDevicesGet but still make
2957  // sure that cache is populated.
2958  //
2959  pi_result Res = Device->populateDeviceCacheIfNeeded();
2960  if (Res != PI_SUCCESS) {
2961  return Res;
2962  }
2963 
2964  if (OutNumDevices) {
2965  *OutNumDevices = Device->SubDevices.size();
2966  }
2967 
2968  if (OutDevices) {
2969  // TODO: Consider support for partitioning to <= total sub-devices.
2970  // Currently supported partitioning (by affinity domain/numa) would always
2971  // partition to all sub-devices.
2972  //
2973  PI_ASSERT(NumDevices == Device->SubDevices.size(), PI_ERROR_INVALID_VALUE);
2974 
2975  for (uint32_t I = 0; I < NumDevices; I++) {
2976  OutDevices[I] = Device->SubDevices[I];
2977  // reusing the same pi_device needs to increment the reference count
2978  PI_CALL(piDeviceRetain(OutDevices[I]));
2979  }
2980  }
2981  return PI_SUCCESS;
2982 }
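// Example: partitioning a root device into its sub-devices by affinity
// domain, matching the properties accepted above (hypothetical sketch,
// error handling omitted):
//
//   pi_device_partition_property Props[] = {
//       PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
//       PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0};
//   pi_uint32 NumSubDevices = 0;
//   piDevicePartition(Device, Props, 0, nullptr, &NumSubDevices);
//   std::vector<pi_device> SubDevices(NumSubDevices);
//   piDevicePartition(Device, Props, NumSubDevices, SubDevices.data(), nullptr);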
2983 
2984 pi_result
2985 piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context?
2986  pi_device_binary *Binaries, pi_uint32 NumBinaries,
2987  pi_uint32 *SelectedBinaryInd) {
2988 
2989  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
2990  PI_ASSERT(SelectedBinaryInd, PI_ERROR_INVALID_VALUE);
2991  PI_ASSERT(NumBinaries == 0 || Binaries, PI_ERROR_INVALID_VALUE);
2992 
2993  // TODO: this is a bare-bones implementation for choosing a device image
2994  // that would be compatible with the targeted device. An AOT-compiled
2995  // image is preferred over SPIR-V for known devices (i.e. Intel devices)
2996  // The implementation makes no effort to differentiate between multiple
2997  // images for the given device, and simply picks the first compatible one.
2998  //
2999  // A real implementation would use the same mechanism the OpenCL ICD
3000  // dispatcher uses. Something like:
3001  // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT);
3002  // return context->dispatch->piextDeviceSelectIR(
3003  // ctx, images, num_images, selected_image);
3004  // where context->dispatch is set to the dispatch table provided by PI
3005  // plugin for platform/device the ctx was created for.
3006 
3007  // Look for a GEN binary, which we know can only be handled by Level-Zero now.
3008  const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN;
3009 
3010  // Find the appropriate device image, fallback to spirv if not found
3011  constexpr pi_uint32 InvalidInd = std::numeric_limits<pi_uint32>::max();
3012  pi_uint32 Spirv = InvalidInd;
3013 
3014  for (pi_uint32 i = 0; i < NumBinaries; ++i) {
3015  if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) {
3016  *SelectedBinaryInd = i;
3017  return PI_SUCCESS;
3018  }
3019  if (strcmp(Binaries[i]->DeviceTargetSpec,
3020  __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0)
3021  Spirv = i;
3022  }
3023  // Point to the SPIR-V image, if one was indeed found
3024  if ((*SelectedBinaryInd = Spirv) != InvalidInd)
3025  return PI_SUCCESS;
3026 
3027  // No image can be loaded for the given device
3028  return PI_ERROR_INVALID_BINARY;
3029 }
3030 
3031 pi_result piextDeviceGetNativeHandle(pi_device Device,
3032  pi_native_handle *NativeHandle) {
3033  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3034  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3035 
3036  auto ZeDevice = pi_cast<ze_device_handle_t *>(NativeHandle);
3037  // Extract the Level Zero device handle from the given PI device
3038  *ZeDevice = Device->ZeDevice;
3039  return PI_SUCCESS;
3040 }
3041 
3042 pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle,
3043  pi_platform Platform,
3044  pi_device *Device) {
3045  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3046  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3047 
3048  auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
3049 
3050  // The SYCL spec requires that the set of devices must remain fixed for the
3051  // duration of the application's execution. We assume that we found all of the
3052  // Level Zero devices when we initialized the platforms/devices cache, so the
3053  // "NativeHandle" must already be in the cache. If it is not, this must not be
3054  // a valid Level Zero device.
3055  //
3056  // TODO: maybe we should populate cache of platforms if it wasn't already.
3057  // For now assert that it was populated.
3058  PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE);
3059  const std::lock_guard<sycl::detail::SpinLock> Lock{*PiPlatformsCacheMutex};
3060 
3061  pi_device Dev = nullptr;
3062  for (auto &ThePlatform : *PiPlatformsCache) {
3063  Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice);
3064  if (Dev) {
3065  // Check that the input Platform, if was given, matches the found one.
3066  PI_ASSERT(!Platform || Platform == ThePlatform,
3067  PI_ERROR_INVALID_PLATFORM);
3068  break;
3069  }
3070  }
3071 
3072  if (Dev == nullptr)
3073  return PI_ERROR_INVALID_VALUE;
3074 
3075  *Device = Dev;
3076  return PI_SUCCESS;
3077 }
3078 
3079 pi_result piContextCreate(const pi_context_properties *Properties,
3080  pi_uint32 NumDevices, const pi_device *Devices,
3081  void (*PFnNotify)(const char *ErrInfo,
3082  const void *PrivateInfo, size_t CB,
3083  void *UserData),
3084  void *UserData, pi_context *RetContext) {
3085  (void)Properties;
3086  (void)PFnNotify;
3087  (void)UserData;
3088  PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
3089  PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
3090  PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
3091 
3092  pi_platform Platform = (*Devices)->Platform;
3093  ZeStruct<ze_context_desc_t> ContextDesc;
3094  ContextDesc.flags = 0;
3095 
3096  ze_context_handle_t ZeContext;
3097  ZE_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext));
3098  try {
3099  *RetContext = new _pi_context(ZeContext, NumDevices, Devices, true);
3100  (*RetContext)->initialize();
3101  if (IndirectAccessTrackingEnabled) {
3102  std::scoped_lock Lock(Platform->ContextsMutex);
3103  Platform->Contexts.push_back(*RetContext);
3104  }
3105  } catch (const std::bad_alloc &) {
3106  return PI_ERROR_OUT_OF_HOST_MEMORY;
3107  } catch (...) {
3108  return PI_ERROR_UNKNOWN;
3109  }
3110 
3111  return PI_SUCCESS;
3112 }
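
// Illustrative usage sketch (hypothetical caller code): creating and
// releasing a context for one device obtained earlier from piDevicesGet.
//
//   pi_context Ctx = nullptr;
//   PI_CALL(piContextCreate(/*Properties=*/nullptr, /*NumDevices=*/1, &Dev,
//                           /*PFnNotify=*/nullptr, /*UserData=*/nullptr,
//                           &Ctx));
//   // ... use Ctx ...
//   PI_CALL(piContextRelease(Ctx));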
3113 
3114 pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName,
3115  size_t ParamValueSize, void *ParamValue,
3116  size_t *ParamValueSizeRet) {
3117 
3118  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3119 
3120  std::shared_lock Lock(Context->Mutex);
3121  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3122  switch (ParamName) {
3123  case PI_CONTEXT_INFO_DEVICES:
3124  return getInfoArray(Context->Devices.size(), ParamValueSize, ParamValue,
3125  ParamValueSizeRet, &Context->Devices[0]);
3126  case PI_CONTEXT_INFO_NUM_DEVICES:
3127  return ReturnValue(pi_uint32(Context->Devices.size()));
3128  case PI_CONTEXT_INFO_REFERENCE_COUNT:
3129  return ReturnValue(pi_uint32{Context->RefCount.load()});
3130  case PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES:
3131  default:
3132  // TODO: implement other parameters
3133  die("piGetContextInfo: unsupported ParamName.");
3134  }
3135 
3136  return PI_SUCCESS;
3137 }
3138 
3139 // FIXME: Dummy implementation to prevent link fail
3140 pi_result piextContextSetExtendedDeleter(pi_context Context,
3141  pi_context_extended_deleter Function,
3142  void *UserData) {
3143  (void)Context;
3144  (void)Function;
3145  (void)UserData;
3146  die("piextContextSetExtendedDeleter: not supported");
3147  return PI_SUCCESS;
3148 }
3149 
3150 pi_result piextContextGetNativeHandle(pi_context Context,
3151  pi_native_handle *NativeHandle) {
3152  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3153  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3154 
3155  auto ZeContext = pi_cast<ze_context_handle_t *>(NativeHandle);
3156  // Extract the Level Zero context handle from the given PI context
3157  *ZeContext = Context->ZeContext;
3158  return PI_SUCCESS;
3159 }
3160 
3161 pi_result piextContextCreateWithNativeHandle(pi_native_handle NativeHandle,
3162  pi_uint32 NumDevices,
3163  const pi_device *Devices,
3164  bool OwnNativeHandle,
3165  pi_context *RetContext) {
3166  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3167  PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
3168  PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
3169  PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
3170 
3171  try {
3172  *RetContext = new _pi_context(pi_cast<ze_context_handle_t>(NativeHandle),
3173  NumDevices, Devices, OwnNativeHandle);
3174  (*RetContext)->initialize();
3175  } catch (const std::bad_alloc &) {
3176  return PI_ERROR_OUT_OF_HOST_MEMORY;
3177  } catch (...) {
3178  return PI_ERROR_UNKNOWN;
3179  }
3180 
3181  return PI_SUCCESS;
3182 }
3183 
3184 pi_result piContextRetain(pi_context Context) {
3185 
3186  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3187 
3188  Context->RefCount.increment();
3189  return PI_SUCCESS;
3190 }
3191 
3192 // Helper function to release the context. A caller must lock the
3193 // platform-level mutex guarding the container with contexts because the
3194 // context can be removed from the list of tracked contexts.
3195 static pi_result ContextReleaseHelper(pi_context Context) {
3196 
3197  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3198 
3199  if (!Context->RefCount.decrementAndTest())
3200  return PI_SUCCESS;
3201 
3202  if (IndirectAccessTrackingEnabled) {
3203  pi_platform Plt = Context->getPlatform();
3204  auto &Contexts = Plt->Contexts;
3205  auto It = std::find(Contexts.begin(), Contexts.end(), Context);
3206  if (It != Contexts.end())
3207  Contexts.erase(It);
3208  }
3209  ze_context_handle_t DestroyZeContext =
3210  Context->OwnZeContext ? Context->ZeContext : nullptr;
3211 
3212  // Clean up any live memory associated with Context
3213  pi_result Result = Context->finalize();
3214 
3215  // We must delete the Context object before destroying the ZeContext
3216  // handle, because destruction of some pi_context members uses the L0
3217  // context, so it must still be valid at that point.
3218  delete Context;
3219 
3220  // Technically, destroying the ZeContext should be placed in the
3221  // destructor of pi_context, but doing it there would make API error
3222  // handling more complex. So instead we destroy it here, after deleting
3223  // the Context object.
3224  if (DestroyZeContext)
3225  ZE_CALL(zeContextDestroy, (DestroyZeContext));
3226 
3227  return Result;
3228 }
3229 
3230 pi_result piContextRelease(pi_context Context) {
3231  pi_platform Plt = Context->getPlatform();
3232  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3233  std::defer_lock);
3234  if (IndirectAccessTrackingEnabled)
3235  ContextsLock.lock();
3236 
3237  return ContextReleaseHelper(Context);
3238 }
3239 
3240 pi_result piQueueCreate(pi_context Context, pi_device Device,
3241  pi_queue_properties Properties, pi_queue *Queue) {
3242 
3243  // Check that unexpected bits are not set.
3244  PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
3245  PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
3246  PI_QUEUE_ON_DEVICE_DEFAULT)),
3247  PI_ERROR_INVALID_VALUE);
3248 
3249  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3250  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3251  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3252 
3253  if (std::find(Context->Devices.begin(), Context->Devices.end(), Device) ==
3254  Context->Devices.end()) {
3255  return PI_ERROR_INVALID_DEVICE;
3256  }
3257 
3258  // Create placeholder queues in the compute queue group.
3259  // Actual L0 queues will be created at first use.
3260  std::vector<ze_command_queue_handle_t> ZeComputeCommandQueues(
3261  Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues,
3262  nullptr);
3263 
3264  // Create placeholder queues in the copy queue group (main and link
3265  // native groups are combined into one group).
3266  // Actual L0 queues will be created at first use.
3267  size_t NumCopyGroups = 0;
3268  if (Device->hasMainCopyEngine()) {
3269  NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::MainCopy]
3270  .ZeProperties.numQueues;
3271  }
3272  if (Device->hasLinkCopyEngine()) {
3273  NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::LinkCopy]
3274  .ZeProperties.numQueues;
3275  }
3276  std::vector<ze_command_queue_handle_t> ZeCopyCommandQueues(NumCopyGroups,
3277  nullptr);
3278 
3279  try {
3280  *Queue = new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues, Context,
3281  Device, true, Properties);
3282  } catch (const std::bad_alloc &) {
3283  return PI_ERROR_OUT_OF_HOST_MEMORY;
3284  } catch (...) {
3285  return PI_ERROR_UNKNOWN;
3286  }
3287  return PI_SUCCESS;
3288 }
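
// Illustrative usage sketch (hypothetical caller code): creating a queue with
// profiling enabled on a device that belongs to `Ctx`. The actual L0 queues
// are created lazily at first use, as noted above.
//
//   pi_queue Q = nullptr;
//   PI_CALL(piQueueCreate(Ctx, Dev, PI_QUEUE_PROFILING_ENABLE, &Q));
//   // ... enqueue work ...
//   PI_CALL(piQueueRelease(Q));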
3289 
3290 pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName,
3291  size_t ParamValueSize, void *ParamValue,
3292  size_t *ParamValueSizeRet) {
3293 
3294  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3295 
3296  std::shared_lock Lock(Queue->Mutex);
3297  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3298  // TODO: consider support for queue properties and size
3299  switch (ParamName) {
3300  case PI_QUEUE_INFO_CONTEXT:
3301  return ReturnValue(Queue->Context);
3302  case PI_QUEUE_INFO_DEVICE:
3303  return ReturnValue(Queue->Device);
3304  case PI_QUEUE_INFO_REFERENCE_COUNT:
3305  return ReturnValue(pi_uint32{Queue->RefCount.load()});
3306  case PI_QUEUE_INFO_PROPERTIES:
3307  die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n");
3308  break;
3309  case PI_QUEUE_INFO_SIZE:
3310  die("PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n");
3311  break;
3313  die("PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo not implemented\n");
3314  break;
3315  default:
3316  zePrint("Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n",
3317  ParamName, ParamName);
3318  return PI_ERROR_INVALID_VALUE;
3319  }
3320 
3321  return PI_SUCCESS;
3322 }
3323 
3324 pi_result piQueueRetain(pi_queue Queue) {
3325  {
3326  std::scoped_lock Lock(Queue->Mutex);
3327  Queue->RefCountExternal++;
3328  }
3329  Queue->RefCount.increment();
3330  return PI_SUCCESS;
3331 }
3332 
3333 pi_result piQueueRelease(pi_queue Queue) {
3334  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3335  std::vector<pi_event> EventListToCleanup;
3336 
3337  {
3338  std::scoped_lock Lock(Queue->Mutex);
3339 
3340  if ((--Queue->RefCountExternal) != 0)
3341  return PI_SUCCESS;
3342 
3343  // When the external reference count goes to zero it is still possible
3344  // that internal references still exist, e.g. command-lists that
3345  // are not yet completed. So do a full queue synchronization here
3346  // and perform proper cleanup.
3347  //
3348  // It is possible to get here and still have an open command list
3349  // if no wait or finish ever occurred for this queue.
3350  if (auto Res = Queue->executeAllOpenCommandLists())
3351  return Res;
3352 
3353  // Make sure all commands get executed.
3354  Queue->synchronize();
3355 
3356  // Destroy all the fences associated with this queue.
3357  for (auto it = Queue->CommandListMap.begin();
3358  it != Queue->CommandListMap.end(); ++it) {
3359  // This fence wasn't yet signalled when we polled it for recycling
3360  // the command-list, so need to release the command-list too.
3361  // For immediate commandlists we don't need to do an L0 reset of the
3362  // commandlist but do need to do event cleanup which is also in the
3363  // resetCommandList function.
3364  if (it->second.ZeFence != nullptr && it->second.ZeFenceInUse) {
3365  Queue->resetCommandList(it, true, EventListToCleanup);
3366  }
3367  // TODO: remove "if" when the problem is fixed in the level zero
3368  // runtime. Destroy only if a queue is healthy. Destroying a fence may
3369  // cause a hang otherwise.
3370  // If the fence is a nullptr we are using immediate commandlists.
3371  if (Queue->Healthy && it->second.ZeFence != nullptr)
3372  ZE_CALL(zeFenceDestroy, (it->second.ZeFence));
3373  }
3374  Queue->CommandListMap.clear();
3375  }
3376 
3377  for (auto Event : EventListToCleanup) {
3378  // We don't need to synchronize the events since the queue
3379  // synchronization above already does that.
3380  {
3381  std::scoped_lock EventLock(Event->Mutex);
3382  Event->Completed = true;
3383  }
3385  // This event was removed from the command list, so decrement its ref count
3386  // (it was incremented when it was added to the command list).
3387  PI_CALL(piEventRelease(Event));
3388  }
3390  return PI_SUCCESS;
3391 }
3392 
3393 static pi_result piQueueReleaseInternal(pi_queue Queue) {
3394  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3395 
3396  if (!Queue->RefCount.decrementAndTest())
3397  return PI_SUCCESS;
3398 
3399  if (Queue->OwnZeCommandQueue) {
3400  for (auto &ZeQueue : Queue->ComputeQueueGroup.ZeQueues) {
3401  if (ZeQueue)
3402  ZE_CALL(zeCommandQueueDestroy, (ZeQueue));
3403  }
3404  for (auto &ZeQueue : Queue->CopyQueueGroup.ZeQueues) {
3405  if (ZeQueue)
3406  ZE_CALL(zeCommandQueueDestroy, (ZeQueue));
3407  }
3408  }
3409 
3410  zePrint("piQueueRelease(compute) NumTimesClosedFull %d, "
3411  "NumTimesClosedEarly %d\n",
3414  zePrint("piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly "
3415  "%d\n",
3418 
3419  delete Queue;
3420 
3421  return PI_SUCCESS;
3422 }
3423 
3424 pi_result piQueueFinish(pi_queue Queue) {
3425  // Wait until command lists attached to the command queue are executed.
3426  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3427 
3428  if (UseImmediateCommandLists) {
3429  // Lock automatically releases when this goes out of scope.
3430  std::scoped_lock Lock(Queue->Mutex);
3431 
3432  Queue->synchronize();
3433  return PI_SUCCESS;
3434  } else {
3435  std::unique_lock Lock(Queue->Mutex);
3436  std::vector<ze_command_queue_handle_t> ZeQueues;
3437 
3438  // Execute any command list that may still be open.
3439  if (auto Res = Queue->executeAllOpenCommandLists())
3440  return Res;
3441 
3442  // Make a copy of queues to sync and release the lock.
3443  ZeQueues = Queue->CopyQueueGroup.ZeQueues;
3444  std::copy(Queue->ComputeQueueGroup.ZeQueues.begin(),
3445  Queue->ComputeQueueGroup.ZeQueues.end(),
3446  std::back_inserter(ZeQueues));
3447 
3448  // Remember the last command's event.
3449  auto LastCommandEvent = Queue->LastCommandEvent;
3450 
3451  // Don't hold a lock to the queue's mutex while waiting.
3452  // This allows other threads to continue working with the queue.
3453  // TODO: this currently exhibits some issues in the driver, so
3454  // we control this with an env var. Remove this control when
3455  // we settle one way or the other.
3456  static bool HoldLock =
3457  std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr;
3458  if (!HoldLock) {
3459  Lock.unlock();
3460  }
3461 
3462  for (auto ZeQueue : ZeQueues) {
3463  if (ZeQueue)
3464  ZE_CALL(zeHostSynchronize, (ZeQueue));
3465  }
3466 
3467  // Prevent unneeded, already-finished events from showing up in the wait list.
3468  // We can only do so if nothing else was submitted to the queue
3469  // while we were synchronizing it.
3470  if (!HoldLock) {
3471  std::scoped_lock Lock(Queue->Mutex);
3472  if (LastCommandEvent == Queue->LastCommandEvent) {
3473  Queue->LastCommandEvent = nullptr;
3474  }
3475  } else {
3476  Queue->LastCommandEvent = nullptr;
3477  }
3478  }
3479  // Reset signalled command lists and return them to the cache of
3480  // available command lists.
3481  resetCommandLists(Queue);
3482  return PI_SUCCESS;
3483 }
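
// Simplified sketch of the synchronization pattern used above: snapshot the
// native queue handles under the lock, then drop the lock before blocking so
// other threads can keep submitting to the queue.
//
//   std::vector<ze_command_queue_handle_t> Handles;
//   {
//     std::scoped_lock Lock(Queue->Mutex);
//     Handles = Queue->ComputeQueueGroup.ZeQueues; // copy under lock
//   }
//   for (auto Handle : Handles) // block without holding the lock
//     if (Handle)
//       ZE_CALL(zeHostSynchronize, (Handle));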
3484 
3485 // Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
3486 // so this can be left as a no-op.
3487 pi_result piQueueFlush(pi_queue Queue) {
3488  (void)Queue;
3489  return PI_SUCCESS;
3490 }
3491 
3492 pi_result piextQueueGetNativeHandle(pi_queue Queue,
3493  pi_native_handle *NativeHandle) {
3494  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3495  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3496 
3497  // Lock automatically releases when this goes out of scope.
3498  std::shared_lock lock(Queue->Mutex);
3499 
3500  auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
3501  // Extract the Level Zero compute queue handle from the given PI queue
3502  *ZeQueue = Queue->ComputeQueueGroup.ZeQueues[0];
3503  return PI_SUCCESS;
3504 }
3505 
3506 pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
3507  pi_context Context, pi_device Device,
3508  bool OwnNativeHandle,
3509  pi_queue *Queue) {
3510  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3511  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3512  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
3513 
3514  auto ZeQueue = pi_cast<ze_command_queue_handle_t>(NativeHandle);
3515  // Assume this is the "0" index queue in the compute command-group.
3516  std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
3517 
3518  // For compatibility with older implementations we allow the device to be
3519  // optional for now. Once the deprecated interop API is removed this can be
3520  // changed to an assert(Device).
3521  if (!Device)
3522  Device = Context->Devices[0];
3523  // TODO: see what we can do to correctly initialize PI queue for
3524  // compute vs. copy Level-Zero queue. Currently we will send
3525  // all commands to the "ZeQueue".
3526  std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
3527  *Queue =
3528  new _pi_queue(ZeQueues, ZeroCopyQueues, Context, Device, OwnNativeHandle);
3529  return PI_SUCCESS;
3530 }
3531 
3532 // If indirect access tracking is enabled then this helper performs reference
3533 // counting, otherwise it just calls zeMemAllocDevice.
3534 static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context,
3535  pi_device Device, size_t Size) {
3536  pi_platform Plt = Device->Platform;
3537  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3538  std::defer_lock);
3539  if (IndirectAccessTrackingEnabled) {
3540  // Lock the mutex which is guarding contexts container in the platform.
3541  // This prevents new kernels from being submitted in any context while
3542  // we are in the process of allocating memory; this is needed to
3543  // properly capture allocations by kernels with indirect access.
3544  ContextsLock.lock();
3545  // We are going to defer memory release if there are kernels with
3546  // indirect access; that is why we explicitly retain the context, to be
3547  // sure that it is released only after all memory allocations in this
3548  // context are released.
3549  PI_CALL(piContextRetain(Context));
3550  }
3551 
3552  ze_device_mem_alloc_desc_t ZeDesc = {};
3553  ZeDesc.flags = 0;
3554  ZeDesc.ordinal = 0;
3555  ZE_CALL(zeMemAllocDevice,
3556  (Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr));
3557 
3558  if (IndirectAccessTrackingEnabled) {
3559  // Keep track of all memory allocations in the context
3560  Context->MemAllocs.emplace(std::piecewise_construct,
3561  std::forward_as_tuple(*ResultPtr),
3562  std::forward_as_tuple(Context));
3563  }
3564  return PI_SUCCESS;
3565 }
3566 
3567 // If indirect access tracking is enabled then this helper performs reference
3568 // counting, otherwise it just calls zeMemAllocHost.
3569 static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context,
3570  size_t Size) {
3571  pi_platform Plt = Context->getPlatform();
3572  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3573  std::defer_lock);
3574  if (IndirectAccessTrackingEnabled) {
3575  // Lock the mutex which is guarding contexts container in the platform.
3576  // This prevents new kernels from being submitted in any context while
3577  // we are in the process of allocating memory; this is needed to
3578  // properly capture allocations by kernels with indirect access.
3579  ContextsLock.lock();
3580  // We are going to defer memory release if there are kernels with
3581  // indirect access; that is why we explicitly retain the context, to be
3582  // sure that it is released only after all memory allocations in this
3583  // context are released.
3584  PI_CALL(piContextRetain(Context));
3585  }
3586 
3587  ZeStruct<ze_host_mem_alloc_desc_t> ZeDesc;
3588  ZeDesc.flags = 0;
3589  ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr));
3590 
3591  if (IndirectAccessTrackingEnabled) {
3592  // Keep track of all memory allocations in the context
3593  Context->MemAllocs.emplace(std::piecewise_construct,
3594  std::forward_as_tuple(*ResultPtr),
3595  std::forward_as_tuple(Context));
3596  }
3597  return PI_SUCCESS;
3598 }
3599 
3600 pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
3601  void *HostPtr, pi_mem *RetMem,
3602  const pi_mem_properties *properties) {
3603 
3604  // TODO: implement support for more access modes
3605  if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) ||
3606  (Flags & PI_MEM_ACCESS_READ_ONLY))) {
3607  die("piMemBufferCreate: Level-Zero supports read-write and read-only "
3608  "buffer,"
3609  "but not other accesses (such as write-only) yet.");
3610  }
3611 
3612  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3613  PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE);
3614 
3615  if (properties != nullptr) {
3616  die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet");
3617  }
3618 
3619  if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
3620  // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of
3621  // pinned host memory, see:
3622  // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc
3623  // We are however missing such functionality in Level Zero, so we just
3624  // ignore the flag for now.
3625  //
3626  }
3627 
3628  // If USM Import feature is enabled and hostptr is supplied,
3629  // import the hostptr if not already imported into USM.
3630  // Data transfer rate is maximized when both source and destination
3631  // are USM pointers. Promotion of the host pointer to USM thus
3632  // optimizes data transfer performance.
3633  bool HostPtrImported = false;
3634  if (ZeUSMImport.Enabled && HostPtr != nullptr &&
3635  (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) {
3636  // Query memory type of the host pointer
3637  ze_device_handle_t ZeDeviceHandle;
3638  ZeStruct<ze_memory_allocation_properties_t> ZeMemoryAllocationProperties;
3639  ZE_CALL(zeMemGetAllocProperties,
3640  (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties,
3641  &ZeDeviceHandle));
3642 
3643  // If the pointer is not a USM allocation of any type, we can import it
3644  if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
3645  // Promote the host ptr to USM host memory
3646  ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver;
3647  ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size);
3648  HostPtrImported = true;
3649  }
3650  }
3651 
3652  pi_buffer Buffer = nullptr;
3653  auto HostPtrOrNull =
3654  (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr;
3655  try {
3656  Buffer = new _pi_buffer(Context, Size, HostPtrOrNull, HostPtrImported);
3657  } catch (const std::bad_alloc &) {
3658  return PI_ERROR_OUT_OF_HOST_MEMORY;
3659  } catch (...) {
3660  return PI_ERROR_UNKNOWN;
3661  }
3662 
3663  // Initialize the buffer with user data
3664  if (HostPtr) {
3665  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
3666  (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
3667 
3668  char *ZeHandleDst;
3669  PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only));
3670  if (Buffer->OnHost) {
3671  // Do a host to host copy.
3672  // For an imported HostPtr the copy is unneeded.
3673  if (!HostPtrImported)
3674  memcpy(ZeHandleDst, HostPtr, Size);
3675  } else {
3676  // Initialize the buffer synchronously with immediate offload
3677  // zeCommandListAppendMemoryCopy must not be called from simultaneous
3678  // threads with the same command list handle, so we need exclusive lock.
3679  std::scoped_lock Lock(Context->ImmediateCommandListMutex);
3680  ZE_CALL(zeCommandListAppendMemoryCopy,
3681  (Context->ZeCommandListInit, ZeHandleDst, HostPtr, Size,
3682  nullptr, 0, nullptr));
3683  }
3684  } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
3685  // Nothing more to do.
3686  } else {
3687  die("piMemBufferCreate: not implemented");
3688  }
3689  }
3690 
3691  *RetMem = Buffer;
3692  return PI_SUCCESS;
3693 }
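
// Illustrative usage sketch (hypothetical caller code): a buffer initialized
// from application memory. With PI_MEM_FLAGS_HOST_PTR_COPY the runtime copies
// `Data` into the new allocation, as implemented above.
//
//   std::vector<float> Data(1024, 1.0f);
//   pi_mem Buf = nullptr;
//   PI_CALL(piMemBufferCreate(
//       Ctx, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY,
//       Data.size() * sizeof(float), Data.data(), &Buf,
//       /*properties=*/nullptr));
//   // ... use Buf ...
//   PI_CALL(piMemRelease(Buf));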
3694 
3695 pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, size_t ParamValueSize,
3696  void *ParamValue, size_t *ParamValueSizeRet) {
3697  PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE);
3698  // piMemImageGetInfo must be used for images
3699  PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_VALUE);
3700 
3701  std::shared_lock Lock(Mem->Mutex);
3702  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3703 
3704  switch (ParamName) {
3705  case PI_MEM_CONTEXT:
3706  return ReturnValue(Mem->Context);
3707  case PI_MEM_SIZE: {
3708  // Get size of the allocation
3709  auto Buffer = pi_cast<pi_buffer>(Mem);
3710  return ReturnValue(size_t{Buffer->Size});
3711  }
3712  default:
3713  die("piMemGetInfo: Parameter is not implemented");
3714  }
3715 
3716  return {};
3717 }
3718 
3719 pi_result piMemRetain(pi_mem Mem) {
3720  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3721 
3722  Mem->RefCount.increment();
3723  return PI_SUCCESS;
3724 }
3725 
3726 // If indirect access tracking is not enabled then this function just performs
3727 // zeMemFree. If indirect access tracking is enabled then reference counting is
3728 // performed.
3729 static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr,
3730  bool OwnZeMemHandle = true) {
3731  pi_platform Plt = Context->getPlatform();
3732  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
3733  std::defer_lock);
3734  if (IndirectAccessTrackingEnabled) {
3735  ContextsLock.lock();
3736  auto It = Context->MemAllocs.find(Ptr);
3737  if (It == std::end(Context->MemAllocs)) {
3738  die("All memory allocations must be tracked!");
3739  }
3740  if (!It->second.RefCount.decrementAndTest()) {
3741  // Memory can't be deallocated yet.
3742  return PI_SUCCESS;
3743  }
3744 
3745  // Reference count is zero, it is ok to free memory.
3746  // We don't need to track this allocation anymore.
3747  Context->MemAllocs.erase(It);
3748  }
3749 
3750  if (OwnZeMemHandle)
3751  ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
3752 
3753  if (IndirectAccessTrackingEnabled)
3754  PI_CALL(ContextReleaseHelper(Context));
3755 
3756  return PI_SUCCESS;
3757 }
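
// Simplified sketch of how the helpers above pair up when indirect access
// tracking is enabled: the allocation is recorded in Context->MemAllocs and
// pins the context; the free helper releases the memory (and the context
// reference) only once the allocation's own reference count drops to zero.
//
//   void *Ptr = nullptr;
//   PI_CALL(ZeDeviceMemAllocHelper(&Ptr, Context, Device, Size));
//   // ... kernels with indirect access may still keep Ptr alive ...
//   PI_CALL(ZeMemFreeHelper(Context, Ptr));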
3758 
3759 static pi_result USMFreeHelper(pi_context Context, void *Ptr,
3760  bool OwnZeMemHandle);
3761 
3762 pi_result piMemRelease(pi_mem Mem) {
3763  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3764 
3765  if (!Mem->RefCount.decrementAndTest())
3766  return PI_SUCCESS;
3767 
3768  if (Mem->isImage()) {
3769  char *ZeHandleImage;
3770  PI_CALL(Mem->getZeHandle(ZeHandleImage, _pi_mem::write_only));
3771  ZE_CALL(zeImageDestroy, (pi_cast<ze_image_handle_t>(ZeHandleImage)));
3772  } else {
3773  auto Buffer = static_cast<pi_buffer>(Mem);
3774  Buffer->free();
3775  }
3776  delete Mem;
3777 
3778  return PI_SUCCESS;
3779 }
3780 
3781 pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
3782  const pi_image_format *ImageFormat,
3783  const pi_image_desc *ImageDesc, void *HostPtr,
3784  pi_mem *RetImage) {
3785 
3786  // TODO: implement read-only, write-only
3787  if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) {
3788  die("piMemImageCreate: Level-Zero implements only read-write buffer,"
3789  "no read-only or write-only yet.");
3790  }
3791  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3792  PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE);
3793  PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
3794 
3795  ze_image_format_type_t ZeImageFormatType;
3796  size_t ZeImageFormatTypeSize;
3797  switch (ImageFormat->image_channel_data_type) {
3798  case PI_IMAGE_CHANNEL_TYPE_FLOAT:
3799  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
3800  ZeImageFormatTypeSize = 32;
3801  break;
3802  case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
3803  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
3804  ZeImageFormatTypeSize = 16;
3805  break;
3806  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
3807  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3808  ZeImageFormatTypeSize = 32;
3809  break;
3810  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
3811  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3812  ZeImageFormatTypeSize = 16;
3813  break;
3814  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
3815  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3816  ZeImageFormatTypeSize = 8;
3817  break;
3818  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16:
3819  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
3820  ZeImageFormatTypeSize = 16;
3821  break;
3822  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8:
3823  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
3824  ZeImageFormatTypeSize = 8;
3825  break;
3826  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
3827  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3828  ZeImageFormatTypeSize = 32;
3829  break;
3830  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
3831  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3832  ZeImageFormatTypeSize = 16;
3833  break;
3834  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
3835  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3836  ZeImageFormatTypeSize = 8;
3837  break;
3838  case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16:
3839  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
3840  ZeImageFormatTypeSize = 16;
3841  break;
3842  case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8:
3843  ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
3844  ZeImageFormatTypeSize = 8;
3845  break;
3846  default:
3847  zePrint("piMemImageCreate: unsupported image data type: data type = %d\n",
3848  ImageFormat->image_channel_data_type);
3849  return PI_ERROR_INVALID_VALUE;
3850  }
3851 
3852  // TODO: populate the layout mapping
3853  ze_image_format_layout_t ZeImageFormatLayout;
3854  switch (ImageFormat->image_channel_order) {
3855  case PI_IMAGE_CHANNEL_ORDER_RGBA:
3856  switch (ZeImageFormatTypeSize) {
3857  case 8:
3858  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8;
3859  break;
3860  case 16:
3861  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16;
3862  break;
3863  case 32:
3864  ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
3865  break;
3866  default:
3867  zePrint("piMemImageCreate: unexpected data type Size\n");
3868  return PI_ERROR_INVALID_VALUE;
3869  }
3870  break;
3871  default:
3872  zePrint("format layout = %d\n", ImageFormat->image_channel_order);
3873  die("piMemImageCreate: unsupported image format layout\n");
3874  break;
3875  }
3876 
3877  ze_image_format_t ZeFormatDesc = {
3878  ZeImageFormatLayout, ZeImageFormatType,
3879  // TODO: are swizzles deduced from image_format->image_channel_order?
3880  ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G,
3881  ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A};
3882 
3883  ze_image_type_t ZeImageType;
3884  switch (ImageDesc->image_type) {
3885  case PI_MEM_TYPE_IMAGE1D:
3886  ZeImageType = ZE_IMAGE_TYPE_1D;
3887  break;
3888  case PI_MEM_TYPE_IMAGE2D:
3889  ZeImageType = ZE_IMAGE_TYPE_2D;
3890  break;
3891  case PI_MEM_TYPE_IMAGE3D:
3892  ZeImageType = ZE_IMAGE_TYPE_3D;
3893  break;
3894  case PI_MEM_TYPE_IMAGE1D_ARRAY:
3895  ZeImageType = ZE_IMAGE_TYPE_1DARRAY;
3896  break;
3897  case PI_MEM_TYPE_IMAGE2D_ARRAY:
3898  ZeImageType = ZE_IMAGE_TYPE_2DARRAY;
3899  break;
3900  default:
3901  zePrint("piMemImageCreate: unsupported image type\n");
3902  return PI_ERROR_INVALID_VALUE;
3903  }
3904 
3905  ZeStruct<ze_image_desc_t> ZeImageDesc;
3906  ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
3907  ZeImageDesc.type = ZeImageType;
3908  ZeImageDesc.format = ZeFormatDesc;
3909  ZeImageDesc.width = pi_cast<uint32_t>(ImageDesc->image_width);
3910  ZeImageDesc.height = pi_cast<uint32_t>(ImageDesc->image_height);
3911  ZeImageDesc.depth = pi_cast<uint32_t>(ImageDesc->image_depth);
3912  ZeImageDesc.arraylevels = pi_cast<uint32_t>(ImageDesc->image_array_size);
3913  ZeImageDesc.miplevels = ImageDesc->num_mip_levels;
3914 
3915  std::shared_lock Lock(Context->Mutex);
3916 
3917  // Currently we have the "0" device in context with multiple root devices to
3918  // own the image.
3919  // TODO: Implement explicit copying for accessing the image from other devices
3920  // in the context.
3921  pi_device Device = Context->SingleRootDevice ? Context->SingleRootDevice
3922  : Context->Devices[0];
3923  ze_image_handle_t ZeHImage;
3924  ZE_CALL(zeImageCreate,
3925  (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeHImage));
3926 
3927  try {
3928  auto ZePIImage = new _pi_image(Context, ZeHImage);
3929 
3930 #ifndef NDEBUG
3931  ZePIImage->ZeImageDesc = ZeImageDesc;
3932 #endif // !NDEBUG
3933 
3934  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
3935  (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
3936  // Initialize image synchronously with immediate offload.
3937  // zeCommandListAppendImageCopyFromMemory must not be called from
3938  // simultaneous threads with the same command list handle, so we need
3939  // exclusive lock.
3940  std::scoped_lock Lock(Context->ImmediateCommandListMutex);
3941  ZE_CALL(zeCommandListAppendImageCopyFromMemory,
3942  (Context->ZeCommandListInit, ZeHImage, HostPtr, nullptr, nullptr,
3943  0, nullptr));
3944  }
3945 
3946  *RetImage = ZePIImage;
3947  } catch (const std::bad_alloc &) {
3948  return PI_ERROR_OUT_OF_HOST_MEMORY;
3949  } catch (...) {
3950  return PI_ERROR_UNKNOWN;
3951  }
3952  return PI_SUCCESS;
3953 }
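
// Illustrative usage sketch (hypothetical caller code): a 2D RGBA float
// image, which the switches above map to ZE_IMAGE_FORMAT_TYPE_FLOAT with
// layout ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32.
//
//   pi_image_format Format{PI_IMAGE_CHANNEL_ORDER_RGBA,
//                          PI_IMAGE_CHANNEL_TYPE_FLOAT};
//   pi_image_desc Desc{};
//   Desc.image_type = PI_MEM_TYPE_IMAGE2D;
//   Desc.image_width = 512;
//   Desc.image_height = 512;
//   pi_mem Img = nullptr;
//   PI_CALL(piMemImageCreate(Ctx, PI_MEM_FLAGS_ACCESS_RW, &Format, &Desc,
//                            /*HostPtr=*/nullptr, &Img));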
3954 
3955 pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
3956  PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3957  std::shared_lock Guard(Mem->Mutex);
3958  char *ZeHandle;
3959  PI_CALL(Mem->getZeHandle(ZeHandle, _pi_mem::read_write));
3960  *NativeHandle = pi_cast<pi_native_handle>(ZeHandle);
3961  return PI_SUCCESS;
3962 }
3963 
3964 pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
3965  pi_context Context,
3966  bool ownNativeHandle, pi_mem *Mem) {
3967  PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE);
3968  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3969  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
3970 
3971  std::shared_lock Lock(Context->Mutex);
3972 
3973  // Get base of the allocation
3974  void *Base;
3975  size_t Size;
3976  void *Ptr = pi_cast<void *>(NativeHandle);
3977  ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size));
3978  PI_ASSERT(Ptr == Base, PI_ERROR_INVALID_VALUE);
3979 
3980  ZeStruct<ze_memory_allocation_properties_t> ZeMemProps;
3981  ze_device_handle_t ZeDevice = nullptr;
3982  ZE_CALL(zeMemGetAllocProperties,
3983  (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice));
3984 
3985  // Check type of the allocation
3986  switch (ZeMemProps.type) {
3987  case ZE_MEMORY_TYPE_HOST:
3988  case ZE_MEMORY_TYPE_SHARED:
3989  case ZE_MEMORY_TYPE_DEVICE:
3990  break;
3991  case ZE_MEMORY_TYPE_UNKNOWN:
3992  // Memory allocation is unrelated to the context
3993  return PI_ERROR_INVALID_CONTEXT;
3994  default:
3995  die("Unexpected memory type");
3996  }
3997 
3998  pi_device Device = nullptr;
3999  if (ZeDevice) {
4000  Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice);
4001  // Check that the device is present in this context.
4002  if (std::find(Context->Devices.begin(), Context->Devices.end(), Device) ==
4003  Context->Devices.end()) {
4004  return PI_ERROR_INVALID_CONTEXT;
4005  }
4006  }
4007 
4008  try {
4009  *Mem = new _pi_buffer(Context, Size, Device, pi_cast<char *>(NativeHandle),
4010  ownNativeHandle);
4011 
4012  pi_platform Plt = Context->getPlatform();
4013  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
4014  std::defer_lock);
4015  if (IndirectAccessTrackingEnabled) {
4016  // We need to keep track of all memory allocations in the context
4017  ContextsLock.lock();
4018  // Retain context to be sure that it is released after all memory
4019  // allocations in this context are released.
4020  PI_CALL(piContextRetain(Context));
4021 
4022  Context->MemAllocs.emplace(
4023  std::piecewise_construct, std::forward_as_tuple(Ptr),
4024  std::forward_as_tuple(Context, ownNativeHandle));
4025  }
4026  } catch (const std::bad_alloc &) {
4027  return PI_ERROR_OUT_OF_HOST_MEMORY;
4028  } catch (...) {
4029  return PI_ERROR_UNKNOWN;
4030  }
4031 
4032  // Initialize the buffer as necessary
4033  auto Buffer = pi_cast<pi_buffer>(*Mem);
4034  if (Device) {
4035  // If this allocation is on a device, then we re-use it for the buffer.
4036  // Nothing to do.
4037  } else if (Buffer->OnHost) {
4038  // If this is host allocation and buffer always stays on host there
4039  // nothing more to do.
4040  } else {
4041  // In all other cases (shared allocation, or host allocation that cannot
4042  // represent the buffer in this context) copy the data to a newly
4043  // created device allocation.
4044  char *ZeHandleDst;
4045  PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only));
4046 
4047  // zeCommandListAppendMemoryCopy must not be called from simultaneous
4048  // threads with the same command list handle, so we need exclusive lock.
4049  std::scoped_lock Lock(Context->ImmediateCommandListMutex);
4050  ZE_CALL(zeCommandListAppendMemoryCopy,
4051  (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0,
4052  nullptr));
4053  }
4054 
4055  return PI_SUCCESS;
4056 }
4057 
4058 pi_result piProgramCreate(pi_context Context, const void *ILBytes,
4059  size_t Length, pi_program *Program) {
4060 
4061  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4062  PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE);
4063  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4064 
4065  // NOTE: the Level Zero module creation is also building the program, so we
4066  // are deferring it until the program is ready to be built.
4067 
4068  try {
4069  *Program = new _pi_program(_pi_program::IL, Context, ILBytes, Length);
4070  } catch (const std::bad_alloc &) {
4071  return PI_ERROR_OUT_OF_HOST_MEMORY;
4072  } catch (...) {
4073  return PI_ERROR_UNKNOWN;
4074  }
4075  return PI_SUCCESS;
4076 }
4077 
4078 pi_result piProgramCreateWithBinary(
4079  pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList,
4080  const size_t *Lengths, const unsigned char **Binaries,
4081  size_t NumMetadataEntries, const pi_device_binary_property *Metadata,
4082  pi_int32 *BinaryStatus, pi_program *Program) {
4083  (void)Metadata;
4084  (void)NumMetadataEntries;
4085 
4086  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4087  PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE);
4088  PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE);
4089  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4090 
4091  // For now we support only one device.
4092  if (NumDevices != 1) {
4093  zePrint("piProgramCreateWithBinary: level_zero supports only one device.");
4094  return PI_ERROR_INVALID_VALUE;
4095  }
4096  if (!Binaries[0] || !Lengths[0]) {
4097  if (BinaryStatus)
4098  *BinaryStatus = PI_ERROR_INVALID_VALUE;
4099  return PI_ERROR_INVALID_VALUE;
4100  }
4101 
4102  size_t Length = Lengths[0];
4103  auto Binary = Binaries[0];
4104 
4105  // In OpenCL, clCreateProgramWithBinary() can be used to load any of the
4106  // following: "program executable", "compiled program", or "library of
4107  // compiled programs". In addition, the loaded program can be either
4108  // IL (SPIR-V) or native device code. For now, we assume that
4109  // piProgramCreateWithBinary() is only used to load a "program executable"
4110  // as native device code.
4111  // If we wanted to support all the same cases as OpenCL, we would need to
4112  // somehow examine the binary image to distinguish the cases. Alternatively,
4113  // we could change the PI interface and have the caller pass additional
4114  // information to distinguish the cases.
4115 
4116  try {
4117  *Program = new _pi_program(_pi_program::Native, Context, Binary, Length);
4118  } catch (const std::bad_alloc &) {
4119  return PI_ERROR_OUT_OF_HOST_MEMORY;
4120  } catch (...) {
4121  return PI_ERROR_UNKNOWN;
4122  }
4123 
4124  if (BinaryStatus)
4125  *BinaryStatus = PI_SUCCESS;
4126  return PI_SUCCESS;
4127 }
4128 
4129 pi_result piclProgramCreateWithSource(pi_context Context, pi_uint32 Count,
4130  const char **Strings,
4131  const size_t *Lengths,
4132  pi_program *RetProgram) {
4133 
4134  (void)Context;
4135  (void)Count;
4136  (void)Strings;
4137  (void)Lengths;
4138  (void)RetProgram;
4139  zePrint("piclProgramCreateWithSource: not supported in Level Zero\n");
4140  return PI_ERROR_INVALID_OPERATION;
4141 }
4142 
4143 pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName,
4144  size_t ParamValueSize, void *ParamValue,
4145  size_t *ParamValueSizeRet) {
4146 
4147  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4148 
4149  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4150  switch (ParamName) {
4151  case PI_PROGRAM_INFO_REFERENCE_COUNT:
4152  return ReturnValue(pi_uint32{Program->RefCount.load()});
4153  case PI_PROGRAM_INFO_NUM_DEVICES:
4154  // TODO: return true number of devices this program exists for.
4155  return ReturnValue(pi_uint32{1});
4156  case PI_PROGRAM_INFO_DEVICES:
4157  // TODO: return all devices this program exists for.
4158  return ReturnValue(Program->Context->Devices[0]);
4159  case PI_PROGRAM_INFO_BINARY_SIZES: {
4160  std::shared_lock Guard(Program->Mutex);
4161  size_t SzBinary;
4162  if (Program->State == _pi_program::IL ||
4163  Program->State == _pi_program::Native ||
4164  Program->State == _pi_program::Object) {
4165  SzBinary = Program->CodeLength;
4166  } else if (Program->State == _pi_program::Exe) {
4167  ZE_CALL(zeModuleGetNativeBinary, (Program->ZeModule, &SzBinary, nullptr));
4168  } else {
4169  return PI_ERROR_INVALID_PROGRAM;
4170  }
4171  // This is an array of 1 element, initialized as if it were scalar.
4172  return ReturnValue(size_t{SzBinary});
4173  }
4174  case PI_PROGRAM_INFO_BINARIES: {
4175  // The caller sets "ParamValue" to an array of pointers, one for each
4176  // device. Since Level Zero supports only one device, there is only one
4177  // pointer. If the pointer is NULL, we don't do anything. Otherwise, we
4178  // copy the program's binary image to the buffer at that pointer.
4179  uint8_t **PBinary = pi_cast<uint8_t **>(ParamValue);
4180  if (!PBinary[0])
4181  break;
4182 
4183  std::shared_lock Guard(Program->Mutex);
4184  if (Program->State == _pi_program::IL ||
4185  Program->State == _pi_program::Native ||
4186  Program->State == _pi_program::Object) {
4187  std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength);
4188  } else if (Program->State == _pi_program::Exe) {
4189  size_t SzBinary = 0;
4190  ZE_CALL(zeModuleGetNativeBinary,
4191  (Program->ZeModule, &SzBinary, PBinary[0]));
4192  } else {
4193  return PI_ERROR_INVALID_PROGRAM;
4194  }
4195  break;
4196  }
4197  case PI_PROGRAM_INFO_NUM_KERNELS: {
4198  std::shared_lock Guard(Program->Mutex);
4199  uint32_t NumKernels;
4200  if (Program->State == _pi_program::IL ||
4201  Program->State == _pi_program::Native ||
4202  Program->State == _pi_program::Object) {
4203  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4204  } else if (Program->State == _pi_program::Exe) {
4205  NumKernels = 0;
4206  ZE_CALL(zeModuleGetKernelNames,
4207  (Program->ZeModule, &NumKernels, nullptr));
4208  } else {
4209  return PI_ERROR_INVALID_PROGRAM;
4210  }
4211  return ReturnValue(size_t{NumKernels});
4212  }
4213  case PI_PROGRAM_INFO_KERNEL_NAMES:
4214  try {
4215  std::shared_lock Guard(Program->Mutex);
4216  std::string PINames{""};
4217  if (Program->State == _pi_program::IL ||
4218  Program->State == _pi_program::Native ||
4219  Program->State == _pi_program::Object) {
4220  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4221  } else if (Program->State == _pi_program::Exe) {
4222  uint32_t Count = 0;
4223  ZE_CALL(zeModuleGetKernelNames, (Program->ZeModule, &Count, nullptr));
4224  std::unique_ptr<const char *[]> PNames(new const char *[Count]);
4225  ZE_CALL(zeModuleGetKernelNames,
4226  (Program->ZeModule, &Count, PNames.get()));
4227  for (uint32_t I = 0; I < Count; ++I) {
4228  PINames += (I > 0 ? ";" : "");
4229  PINames += PNames[I];
4230  }
4231  } else {
4232  return PI_ERROR_INVALID_PROGRAM;
4233  }
4234  return ReturnValue(PINames.c_str());
4235  } catch (const std::bad_alloc &) {
4236  return PI_ERROR_OUT_OF_HOST_MEMORY;
4237  } catch (...) {
4238  return PI_ERROR_UNKNOWN;
4239  }
4240  default:
4241  die("piProgramGetInfo: not implemented");
4242  }
4243 
4244  return PI_SUCCESS;
4245 }
4246 
4247 pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices,
4248  const pi_device *DeviceList, const char *Options,
4249  pi_uint32 NumInputPrograms,
4250  const pi_program *InputPrograms,
4251  void (*PFnNotify)(pi_program Program, void *UserData),
4252  void *UserData, pi_program *RetProgram) {
4253  // We only support one device with Level Zero currently.
4254  if (NumDevices != 1) {
4255  zePrint("piProgramLink: level_zero supports only one device.");
4256  return PI_ERROR_INVALID_VALUE;
4257  }
4258 
4259  // We do not support any link flags at this time because the Level Zero API
4260  // does not have any way to pass flags that are specific to linking.
4261  if (Options && *Options != '\0') {
4262  std::string ErrorMessage(
4263  "Level Zero does not support kernel link flags: \"");
4264  ErrorMessage.append(Options);
4265  ErrorMessage.push_back('\"');
4266  pi_program Program =
4267  new _pi_program(_pi_program::Invalid, Context, ErrorMessage);
4268  *RetProgram = Program;
4269  return PI_ERROR_LINK_PROGRAM_FAILURE;
4270  }
4271 
4272  // Validate input parameters.
4273  PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE);
4274  {
4275  auto DeviceEntry =
4276  find(Context->Devices.begin(), Context->Devices.end(), DeviceList[0]);
4277  if (DeviceEntry == Context->Devices.end())
4278  return PI_ERROR_INVALID_DEVICE;
4279  }
4280  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
4281  if (NumInputPrograms == 0 || InputPrograms == nullptr)
4282  return PI_ERROR_INVALID_VALUE;
4283 
4284  pi_result PiResult = PI_SUCCESS;
4285  try {
4286  // Acquire a "shared" lock on each of the input programs, and also validate
4287  // that they are all in Object state.
4288  //
4289  // There is no danger of deadlock here even if two threads call
4290  // piProgramLink simultaneously with the same input programs in a different
4291  // order. If we were acquiring these with "exclusive" access, this could
4292  // lead to a classic lock ordering deadlock. However, there is no such
4293  // deadlock potential with "shared" access. There could also be a deadlock
4294  // potential if there was some other code that holds more than one of these
4295  // locks simultaneously with "exclusive" access. However, there is no such
4296  // code like that, so this is also not a danger.
4297  std::vector<std::shared_lock<pi_shared_mutex>> Guards(NumInputPrograms);
4298  for (pi_uint32 I = 0; I < NumInputPrograms; I++) {
4299  std::shared_lock Guard(InputPrograms[I]->Mutex);
4300  Guards[I].swap(Guard);
4301  if (InputPrograms[I]->State != _pi_program::Object) {
4302  return PI_ERROR_INVALID_OPERATION;
4303  }
4304  }
4305 
4306  // Previous calls to piProgramCompile did not actually compile the SPIR-V.
4307  // Instead, we postpone compilation until this point, when all the modules
4308  // are linked together. By doing compilation and linking together, the JIT
4309  // compiler is able to see all modules and do cross-module optimizations.
4310  //
4311  // Construct a ze_module_program_exp_desc_t which contains information about
4312  // all of the modules that will be linked together.
4313  ZeStruct<ze_module_program_exp_desc_t> ZeExtModuleDesc;
4314  std::vector<size_t> CodeSizes(NumInputPrograms);
4315  std::vector<const uint8_t *> CodeBufs(NumInputPrograms);
4316  std::vector<const char *> BuildFlagPtrs(NumInputPrograms);
4317  std::vector<const ze_module_constants_t *> SpecConstPtrs(NumInputPrograms);
4318  std::vector<_pi_program::SpecConstantShim> SpecConstShims;
4319  SpecConstShims.reserve(NumInputPrograms);
4320 
4321  for (pi_uint32 I = 0; I < NumInputPrograms; I++) {
4322  pi_program Program = InputPrograms[I];
4323  CodeSizes[I] = Program->CodeLength;
4324  CodeBufs[I] = Program->Code.get();
4325  BuildFlagPtrs[I] = Program->BuildFlags.c_str();
4326  SpecConstShims.emplace_back(Program);
4327  SpecConstPtrs[I] = SpecConstShims[I].ze();
4328  }
4329 
4330  ZeExtModuleDesc.count = NumInputPrograms;
4331  ZeExtModuleDesc.inputSizes = CodeSizes.data();
4332  ZeExtModuleDesc.pInputModules = CodeBufs.data();
4333  ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data();
4334  ZeExtModuleDesc.pConstants = SpecConstPtrs.data();
4335 
4336  ZeStruct<ze_module_desc_t> ZeModuleDesc;
4337  ZeModuleDesc.pNext = &ZeExtModuleDesc;
4338  ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
4339 
4340  // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1",
4341  // the driver does validation of the API calls, and it expects
4342  // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This
4343  // validation is wrong when using the "ze_module_program_exp_desc_t"
4344  // extension because those fields are supposed to be ignored. As a
4345  // workaround, set both fields to 1.
4346  //
4347  // TODO: Remove this workaround when the driver is fixed.
4348  ZeModuleDesc.pInputModule = reinterpret_cast<const uint8_t *>(1);
4349  ZeModuleDesc.inputSize = 1;
4350 
4351  // We need a Level Zero extension to compile multiple programs together into
4352  // a single Level Zero module. However, we don't need that extension if
4353  // there happens to be only one input program.
4354  //
4355  // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the
4356  // Level Zero driver. The driver's "ze_module_program_exp_desc_t"
4357  // extension should work even in the case when there is just one input
4358  // module. However, there is currently a bug in the driver that leads to a
4359  // crash. As a workaround, do not use the extension when there is one
4360  // input module.
4361  //
4362  // TODO: Remove this workaround when the driver is fixed.
4363  if (!PiDriverModuleProgramExtensionFound || (NumInputPrograms == 1)) {
4364  if (NumInputPrograms == 1) {
4365  ZeModuleDesc.pNext = nullptr;
4366  ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0];
4367  ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0];
4368  ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0];
4369  ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0];
4370  } else {
4371  zePrint("piProgramLink: level_zero driver does not have static linking "
4372  "support.");
4373  return PI_ERROR_INVALID_VALUE;
4374  }
4375  }
4376 
4377  // Call the Level Zero API to compile, link, and create the module.
4378  ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
4379  ze_context_handle_t ZeContext = Context->ZeContext;
4380  ze_module_handle_t ZeModule = nullptr;
4381  ze_module_build_log_handle_t ZeBuildLog = nullptr;
4382  ze_result_t ZeResult =
4383  ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc,
4384  &ZeModule, &ZeBuildLog));
4385 
4386  // We still create a _pi_program object even if there is a BUILD_FAILURE
4387  // because we need the object to hold the ZeBuildLog. There is no build
4388  // log created for other errors, so we don't create an object.
4389  PiResult = mapError(ZeResult);
4390  if (ZeResult != ZE_RESULT_SUCCESS &&
4391  ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) {
4392  return PiResult;
4393  }
4394 
4395  // The call to zeModuleCreate does not report an error if there are
4396  // unresolved symbols because it thinks these could be resolved later via a
4397  // call to zeModuleDynamicLink. However, modules created with piProgramLink
4398  // are supposed to be fully linked and ready to use. Therefore, do an extra
4399  // check now for unresolved symbols. Note that we still create a
4400  // _pi_program if there are unresolved symbols because the ZeBuildLog tells
4401  // which symbols are unresolved.
4402  if (ZeResult == ZE_RESULT_SUCCESS) {
4403  ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog);
4404  if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) {
4405  PiResult = PI_ERROR_LINK_PROGRAM_FAILURE;
4406  } else if (ZeResult != ZE_RESULT_SUCCESS) {
4407  return mapError(ZeResult);
4408  }
4409  }
4410 
4411  _pi_program::state State =
4412  (PiResult == PI_SUCCESS) ? _pi_program::Exe : _pi_program::Invalid;
4413  *RetProgram = new _pi_program(State, Context, ZeModule, ZeBuildLog);
4414  } catch (const std::bad_alloc &) {
4415  return PI_ERROR_OUT_OF_HOST_MEMORY;
4416  } catch (...) {
4417  return PI_ERROR_UNKNOWN;
4418  }
4419  return PiResult;
4420 }
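
// Illustrative usage sketch (hypothetical caller code): the deferred
// compile-then-link flow described above. piProgramCompile only records the
// options and flips the state to Object; the actual JIT happens here in
// piProgramLink.
//
//   pi_program Prog = nullptr;
//   PI_CALL(piProgramCreate(Ctx, SpirvBlob, SpirvSize, &Prog));
//   PI_CALL(piProgramCompile(Prog, 1, &Dev, /*Options=*/"", 0, nullptr,
//                            nullptr, nullptr, nullptr));
//   pi_program Exe = nullptr;
//   PI_CALL(piProgramLink(Ctx, 1, &Dev, /*Options=*/nullptr, 1, &Prog,
//                         nullptr, nullptr, &Exe));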
4421 
4422 pi_result piProgramCompile(
4423  pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList,
4424  const char *Options, pi_uint32 NumInputHeaders,
4425  const pi_program *InputHeaders, const char **HeaderIncludeNames,
4426  void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) {
4427  (void)NumInputHeaders;
4428  (void)InputHeaders;
4429  (void)HeaderIncludeNames;
4430 
4431  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4432 
4433  if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
4434  return PI_ERROR_INVALID_VALUE;
4435 
4436  // These aren't supported.
4437  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
4438 
4439  std::scoped_lock Guard(Program->Mutex);
4440 
4441  // It's only valid to compile a program created from IL (we don't support
4442  // programs created from source code).
4443  //
4444  // The OpenCL spec says that the header parameters are ignored when compiling
4445  // IL programs, so we don't validate them.
4446  if (Program->State != _pi_program::IL)
4447  return PI_ERROR_INVALID_OPERATION;
4448 
4449  // We don't compile anything now. Instead, we delay compilation until
4450  // piProgramLink, where we do both compilation and linking as a single step.
4451  // This produces better code because the driver can do cross-module
4452  // optimizations. Therefore, we just remember the compilation flags, so we
4453  // can use them later.
4454  if (Options)
4455  Program->BuildFlags = Options;
4456  Program->State = _pi_program::Object;
4457 
4458  return PI_SUCCESS;
4459 }
4460 
4461 pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices,
4462  const pi_device *DeviceList, const char *Options,
4463  void (*PFnNotify)(pi_program Program, void *UserData),
4464  void *UserData) {
4465 
4466  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4467  if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
4468  return PI_ERROR_INVALID_VALUE;
4469 
4470  // We only support build to one device with Level Zero now.
4471  // TODO: we should eventually build to the possibly multiple root
4472  // devices in the context.
4473  if (NumDevices != 1) {
4474  zePrint("piProgramBuild: level_zero supports only one device.");
4475  return PI_ERROR_INVALID_VALUE;
4476  }
4477 
4478  // These aren't supported.
4479  PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
4480 
4481  std::scoped_lock Guard(Program->Mutex);
4482  // Check if device belongs to associated context.
4483  PI_ASSERT(Program->Context, PI_ERROR_INVALID_PROGRAM);
4484  {
4485  auto DeviceEntry = find(Program->Context->Devices.begin(),
4486  Program->Context->Devices.end(), DeviceList[0]);
4487  if (DeviceEntry == Program->Context->Devices.end())
4488  return PI_ERROR_INVALID_VALUE;
4489  }
4490  // It is legal to build a program created from either IL or from native
4491  // device code.
4492  if (Program->State != _pi_program::IL &&
4493  Program->State != _pi_program::Native)
4494  return PI_ERROR_INVALID_OPERATION;
4495 
4496  // We should have either IL or native device code.
4497  PI_ASSERT(Program->Code, PI_ERROR_INVALID_PROGRAM);
4498 
4499  // Ask Level Zero to build and load the native code onto the device.
4500  ZeStruct<ze_module_desc_t> ZeModuleDesc;
4501  _pi_program::SpecConstantShim Shim(Program);
4502  ZeModuleDesc.format = (Program->State == _pi_program::IL)
4503  ? ZE_MODULE_FORMAT_IL_SPIRV
4504  : ZE_MODULE_FORMAT_NATIVE;
4505  ZeModuleDesc.inputSize = Program->CodeLength;
4506  ZeModuleDesc.pInputModule = Program->Code.get();
4507  ZeModuleDesc.pBuildFlags = Options;
4508  ZeModuleDesc.pConstants = Shim.ze();
4509 
4510  ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
4511  ze_context_handle_t ZeContext = Program->Context->ZeContext;
4512  ze_module_handle_t ZeModule = nullptr;
4513 
4514  pi_result Result = PI_SUCCESS;
4515  Program->State = _pi_program::Exe;
4516  ze_result_t ZeResult =
4517  ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc,
4518  &ZeModule, &Program->ZeBuildLog));
4519  if (ZeResult != ZE_RESULT_SUCCESS) {
4520  // We adjust pi_program below to avoid attempting to release zeModule when
4521  // RT calls piProgramRelease().
4522  ZeModule = nullptr;
4523  Program->State = _pi_program::Invalid;
4524  Result = mapError(ZeResult);
4525  } else {
4526  // The call to zeModuleCreate does not report an error if there are
4527  // unresolved symbols because it thinks these could be resolved later via a
4528  // call to zeModuleDynamicLink. However, modules created with
4529  // piProgramBuild are supposed to be fully linked and ready to use.
4530  // Therefore, do an extra check now for unresolved symbols.
4531  ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog);
4532  if (ZeResult != ZE_RESULT_SUCCESS) {
4533  Program->State = _pi_program::Invalid;
4534  Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE)
4535  ? PI_ERROR_BUILD_PROGRAM_FAILURE
4536  : mapError(ZeResult);
4537  }
4538  }
4539 
4540  // We no longer need the IL / native code.
4541  Program->Code.reset();
4542  Program->ZeModule = ZeModule;
4543  return Result;
4544 }
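
// Illustrative usage sketch (hypothetical caller code): the single-step build
// path for a program created from IL or from a native binary.
//
//   PI_CALL(piProgramBuild(Prog, 1, &Dev, /*Options=*/nullptr,
//                          /*PFnNotify=*/nullptr, /*UserData=*/nullptr));
//   // On failure, the build log can be read back with
//   // piProgramGetBuildInfo(Prog, Dev, PI_PROGRAM_BUILD_INFO_LOG, ...).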
4545 
4546 pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device,
4547  pi_program_build_info ParamName,
4548  size_t ParamValueSize, void *ParamValue,
4549  size_t *ParamValueSizeRet) {
4550  (void)Device;
4551 
4552  std::shared_lock Guard(Program->Mutex);
4553  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4554  if (ParamName == PI_PROGRAM_BUILD_INFO_BINARY_TYPE) {
4555  pi_program_binary_type Type = PI_PROGRAM_BINARY_TYPE_NONE;
4556  if (Program->State == _pi_program::Object) {
4557  Type = PI_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
4558  } else if (Program->State == _pi_program::Exe) {
4559  Type = PI_PROGRAM_BINARY_TYPE_EXECUTABLE;
4560  }
4561  return ReturnValue(pi_program_binary_type{Type});
4562  }
4563  if (ParamName == PI_PROGRAM_BUILD_INFO_OPTIONS) {
4564  // TODO: how to get module build options out of Level Zero?
4565  // For the programs that we compiled we can remember the options
4566  // passed with piProgramCompile/piProgramBuild, but what can we
4567  // return for programs that were built outside and registered
4568  // with piProgramRegister?
4569  return ReturnValue("");
4570  } else if (ParamName == PI_PROGRAM_BUILD_INFO_LOG) {
4571  // Check first to see if the plugin code recorded an error message.
4572  if (!Program->ErrorMessage.empty()) {
4573  return ReturnValue(Program->ErrorMessage.c_str());
4574  }
4575 
4576  // Next check if there is a Level Zero build log.
4577  if (Program->ZeBuildLog) {
4578  size_t LogSize = ParamValueSize;
4579  ZE_CALL(zeModuleBuildLogGetString,
4580  (Program->ZeBuildLog, &LogSize, pi_cast<char *>(ParamValue)));
4581  if (ParamValueSizeRet) {
4582  *ParamValueSizeRet = LogSize;
4583  }
4584  return PI_SUCCESS;
4585  }
4586 
4587  // Otherwise, there is no error. The OpenCL spec says to return an empty
4588  // string if there was no previous attempt to compile, build, or link the
4589  // program.
4590  return ReturnValue("");
4591  } else {
4592  zePrint("piProgramGetBuildInfo: unsupported ParamName\n");
4593  return PI_ERROR_INVALID_VALUE;
4594  }
4595  return PI_SUCCESS;
4596 }
4597 
4598 pi_result piProgramRetain(pi_program Program) {
4599  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4600  Program->RefCount.increment();
4601  return PI_SUCCESS;
4602 }
4603 
4604 pi_result piProgramRelease(pi_program Program) {
4605  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4606 
4607  if (!Program->RefCount.decrementAndTest())
4608  return PI_SUCCESS;
4609 
4610  delete Program;
4611 
4612  return PI_SUCCESS;
4613 }
4614 
4615 pi_result piextProgramGetNativeHandle(pi_program Program,
4616  pi_native_handle *NativeHandle) {
4617  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4618  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4619 
4620  auto ZeModule = pi_cast<ze_module_handle_t *>(NativeHandle);
4621 
4622  std::shared_lock Guard(Program->Mutex);
4623  switch (Program->State) {
4624  case _pi_program::Exe: {
4625  *ZeModule = Program->ZeModule;
4626  break;
4627  }
4628 
4629  default:
4630  return PI_ERROR_INVALID_OPERATION;
4631  }
4632 
4633  return PI_SUCCESS;
4634 }
4635 
4636 pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle,
4637  pi_context Context,
4638  bool ownNativeHandle,
4639  pi_program *Program) {
4640  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4641  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4642  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
4643 
4644  auto ZeModule = pi_cast<ze_module_handle_t>(NativeHandle);
4645 
4646  // We assume here that programs created from a native handle always
4647  // represent a fully linked executable (state Exe) and not an unlinked
4648  // executable (state Object).
4649 
4650  try {
4651  *Program =
4652  new _pi_program(_pi_program::Exe, Context, ZeModule, ownNativeHandle);
4653  } catch (const std::bad_alloc &) {
4654  return PI_ERROR_OUT_OF_HOST_MEMORY;
4655  } catch (...) {
4656  return PI_ERROR_UNKNOWN;
4657  }
4658  return PI_SUCCESS;
4659 }
4660 
4661 _pi_program::~_pi_program() {
4662  // According to Level Zero Specification, all kernels and build logs
4663  // must be destroyed before the Module can be destroyed. So, be sure
4664  // to destroy build log before destroying the module.
4665  if (ZeBuildLog) {
4666  ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog));
4667  }
4668 
4669  if (ZeModule && OwnZeModule) {
4670  ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule));
4671  }
4672 }
4673 
4674 // Check to see if a Level Zero module has any unresolved symbols.
4675 //
4676 // @param ZeModule The module handle to check.
4677 // @param ZeBuildLog If there are unresolved symbols, this build log handle is
4678 // modified to receive information telling which symbols
4679 // are unresolved.
4680 //
4681 // @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved
4682 // symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other
4683 // value indicates there was an error and we cannot tell if symbols are
4684 // resolved.
4685 static ze_result_t
4686 checkUnresolvedSymbols(ze_module_handle_t ZeModule,
4687  ze_module_build_log_handle_t *ZeBuildLog) {
4688 
4689  // First check to see if the module has any imported symbols. If there are
4690  // no imported symbols, it's not possible to have any unresolved symbols. We
4691  // do this check first because we assume it's faster than the call to
4692  // zeModuleDynamicLink below.
4693  ZeStruct<ze_module_properties_t> ZeModuleProps;
4694  ze_result_t ZeResult =
4695  ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps));
4696  if (ZeResult != ZE_RESULT_SUCCESS)
4697  return ZeResult;
4698 
4699  // If there are imported symbols, attempt to "link" the module with itself.
4700  // As a side effect, this will return the error
4701  // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols.
4702  if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) {
4703  return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog));
4704  }
4705  return ZE_RESULT_SUCCESS;
4706 }
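// Example (sketch; validateModule is a hypothetical caller): using
// checkUnresolvedSymbols() after module creation, and reading the build log
// it fills in on ZE_RESULT_ERROR_MODULE_LINK_FAILURE.
static ze_result_t validateModule(ze_module_handle_t ZeModule) {
  ze_module_build_log_handle_t ZeBuildLog = nullptr;
  ze_result_t ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog);
  if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) {
    size_t LogSize = 0;
    zeModuleBuildLogGetString(ZeBuildLog, &LogSize, nullptr); // query size
    std::string Log(LogSize, '\0');
    zeModuleBuildLogGetString(ZeBuildLog, &LogSize, &Log[0]); // read text
    zeModuleBuildLogDestroy(ZeBuildLog);
  }
  return ZeResult;
}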
4707 
4708 pi_result piKernelCreate(pi_program Program, const char *KernelName,
4709  pi_kernel *RetKernel) {
4710 
4711  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4712  PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE);
4713  PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE);
4714 
4715  std::shared_lock Guard(Program->Mutex);
4716  if (Program->State != _pi_program::Exe) {
4717  return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4718  }
4719 
4720  ZeStruct<ze_kernel_desc_t> ZeKernelDesc;
4721  ZeKernelDesc.flags = 0;
4722  ZeKernelDesc.pKernelName = KernelName;
4723 
4724  ze_kernel_handle_t ZeKernel;
4725  ZE_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel));
4726 
4727  try {
4728  *RetKernel = new _pi_kernel(ZeKernel, true, Program);
4729  } catch (const std::bad_alloc &) {
4730  return PI_ERROR_OUT_OF_HOST_MEMORY;
4731  } catch (...) {
4732  return PI_ERROR_UNKNOWN;
4733  }
4734 
4735  PI_CALL((*RetKernel)->initialize());
4736  return PI_SUCCESS;
4737 }
4738 
4739 pi_result _pi_kernel::initialize() {
4740  // Retain the program and context to show it's used by this kernel.
4741  PI_CALL(piProgramRetain(Program));
4742  if (IndirectAccessTrackingEnabled)
4743  // TODO: do piContextRetain without the guard
4744  PI_CALL(piContextRetain(Program->Context));
4745 
4746  // Set up how to obtain kernel properties when needed.
4747  ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) {
4748  ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties));
4749  };
4750 
4751  // Cache kernel name.
4752  ZeKernelName.Compute = [this](std::string &Name) {
4753  size_t Size = 0;
4754  ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr));
4755  char *KernelName = new char[Size];
4756  ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName));
4757  Name = KernelName;
4758  delete[] KernelName;
4759  };
4760 
4761  return PI_SUCCESS;
4762 }
4763 
4764 pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize,
4765  const void *ArgValue) {
4766 
4767  // OpenCL: "the arg_value pointer can be NULL or point to a NULL value
4768  // in which case a NULL value will be used as the value for the argument
4769  // declared as a pointer to global or constant memory in the kernel"
4770  //
4771  // We don't know the type of the argument but it seems that the only time
4772  // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument
4773  // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL.
4774  if (ArgSize == sizeof(void *) && ArgValue &&
4775  *(void **)(const_cast<void *>(ArgValue)) == nullptr) {
4776  ArgValue = nullptr;
4777  }
4778 
4779  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4780 
4781  std::scoped_lock Guard(Kernel->Mutex);
4782  ZE_CALL(zeKernelSetArgumentValue,
4783  (pi_cast<ze_kernel_handle_t>(Kernel->ZeKernel),
4784  pi_cast<uint32_t>(ArgIndex), pi_cast<size_t>(ArgSize),
4785  pi_cast<const void *>(ArgValue)));
4786 
4787  return PI_SUCCESS;
4788 }
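// Example (sketch; setNullPointerArg is hypothetical): passing a null
// pointer argument. Since ArgValue points to a pointer-sized NULL value,
// piKernelSetArg above normalizes it to a NULL Level Zero argument.
static pi_result setNullPointerArg(pi_kernel Kernel, pi_uint32 ArgIndex) {
  void *NullPtr = nullptr;
  return piKernelSetArg(Kernel, ArgIndex, sizeof(void *), &NullPtr);
}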
4789 
4790 // Special version of piKernelSetArg to accept pi_mem.
4791 pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex,
4792  const pi_mem *ArgValue) {
4793  // TODO: the better way would probably be to add a new PI API for
4794  // extracting native PI object from PI handle, and have SYCL
4795  // RT pass that directly to the regular piKernelSetArg (and
4796  // then remove this piextKernelSetArgMemObj).
4797 
4798  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4799 
4800  // We don't yet know the device on which this kernel will next be run.
4801  // Thus we can't know the actual memory allocation that needs to be used.
4802  // Remember the memory object being used as an argument for this kernel
4803  // to process it later when the device is known (at the kernel enqueue).
4804  //
4805  // TODO: for now we have to conservatively assume the access as read-write.
4806  // Improve that by passing SYCL buffer accessor type into
4807  // piextKernelSetArgMemObj.
4808  //
4809  std::scoped_lock Guard(Kernel->Mutex);
4810  Kernel->PendingArguments.push_back(
4811  {ArgIndex, sizeof(void *), *ArgValue, _pi_mem::read_write});
4812 
4813  return PI_SUCCESS;
4814 }
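// Example (sketch): setting a buffer argument. The concrete device pointer
// is resolved later, at enqueue time, from the PendingArguments list above:
//   pi_mem Buf = /* created elsewhere, e.g. via piMemBufferCreate */;
//   piextKernelSetArgMemObj(Kernel, /*ArgIndex=*/0, &Buf);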
4815 
4816 // Special version of piKernelSetArg to accept pi_sampler.
4817 pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex,
4818  const pi_sampler *ArgValue) {
4819  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4820 
4821  std::scoped_lock Guard(Kernel->Mutex);
4822  ZE_CALL(zeKernelSetArgumentValue,
4823  (pi_cast<ze_kernel_handle_t>(Kernel->ZeKernel),
4824  pi_cast<uint32_t>(ArgIndex), sizeof(void *),
4825  &(*ArgValue)->ZeSampler));
4826 
4827  return PI_SUCCESS;
4828 }
4829 
4830 pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName,
4831  size_t ParamValueSize, void *ParamValue,
4832  size_t *ParamValueSizeRet) {
4833  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4834 
4835  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4836 
4837  std::shared_lock Guard(Kernel->Mutex);
4838  switch (ParamName) {
4839  case PI_KERNEL_INFO_CONTEXT:
4840  return ReturnValue(pi_context{Kernel->Program->Context});
4841  case PI_KERNEL_INFO_PROGRAM:
4842  return ReturnValue(pi_program{Kernel->Program});
4843  case PI_KERNEL_INFO_FUNCTION_NAME:
4844  try {
4845  std::string &KernelName = *Kernel->ZeKernelName.operator->();
4846  return ReturnValue(static_cast<const char *>(KernelName.c_str()));
4847  } catch (const std::bad_alloc &) {
4848  return PI_ERROR_OUT_OF_HOST_MEMORY;
4849  } catch (...) {
4850  return PI_ERROR_UNKNOWN;
4851  }
4852  case PI_KERNEL_INFO_NUM_ARGS:
4853  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->numKernelArgs});
4854  case PI_KERNEL_INFO_REFERENCE_COUNT:
4855  return ReturnValue(pi_uint32{Kernel->RefCount.load()});
4856  case PI_KERNEL_INFO_ATTRIBUTES:
4857  try {
4858  uint32_t Size;
4859  ZE_CALL(zeKernelGetSourceAttributes, (Kernel->ZeKernel, &Size, nullptr));
4860  char *attributes = new char[Size];
4861  ZE_CALL(zeKernelGetSourceAttributes,
4862  (Kernel->ZeKernel, &Size, &attributes));
4863  auto Res = ReturnValue(attributes);
4864  delete[] attributes;
4865  return Res;
4866  } catch (const std::bad_alloc &) {
4867  return PI_ERROR_OUT_OF_HOST_MEMORY;
4868  } catch (...) {
4869  return PI_ERROR_UNKNOWN;
4870  }
4871  default:
4872  zePrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n",
4873  ParamName, ParamName);
4874  return PI_ERROR_INVALID_VALUE;
4875  }
4876 
4877  return PI_SUCCESS;
4878 }
4879 
4880 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
4881  pi_kernel_group_info ParamName,
4882  size_t ParamValueSize, void *ParamValue,
4883  size_t *ParamValueSizeRet) {
4884  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4885  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
4886 
4887  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4888 
4889  std::shared_lock Guard(Kernel->Mutex);
4890  switch (ParamName) {
4891  case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
4892  // TODO: To revisit after level_zero/issues/262 is resolved
4893  struct {
4894  size_t Arr[3];
4895  } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
4896  Device->ZeDeviceComputeProperties->maxGroupSizeY,
4897  Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
4898  return ReturnValue(WorkSize);
4899  }
4900  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
4901  uint32_t X, Y, Z;
4902  ZE_CALL(zeKernelSuggestGroupSize,
4903  (Kernel->ZeKernel, 10000, 10000, 10000, &X, &Y, &Z));
4904  return ReturnValue(size_t{X * Y * Z});
4905  }
4906  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
4907  struct {
4908  size_t Arr[3];
4909  } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX,
4910  Kernel->ZeKernelProperties->requiredGroupSizeY,
4911  Kernel->ZeKernelProperties->requiredGroupSizeZ}};
4912  return ReturnValue(WgSize);
4913  }
4914  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE:
4915  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->localMemSize});
4916  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
4917  return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth});
4918  }
4919  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE:
4920  return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->privateMemSize});
4921  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
4922  die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not "
4923  "implemented\n");
4924  break;
4925  }
4926  default:
4927  zePrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n",
4928  ParamName, ParamName);
4929  return PI_ERROR_INVALID_VALUE;
4930  }
4931  return PI_SUCCESS;
4932 }
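// Example (sketch; queryCompileWGSize is hypothetical): querying the
// work-group size fixed at compile time via the
// PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE query above.
static pi_result queryCompileWGSize(pi_kernel Kernel, pi_device Device,
                                    size_t (&WGSize)[3]) {
  return piKernelGetGroupInfo(Kernel, Device,
                              PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
                              sizeof(WGSize), WGSize, nullptr);
}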
4933 
4934 pi_result piKernelGetSubGroupInfo(pi_kernel Kernel, pi_device Device,
4935  pi_kernel_sub_group_info ParamName,
4936  size_t InputValueSize, const void *InputValue,
4937  size_t ParamValueSize, void *ParamValue,
4938  size_t *ParamValueSizeRet) {
4939  (void)Device;
4940  (void)InputValueSize;
4941  (void)InputValue;
4942 
4943  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4944 
4945  std::shared_lock Guard(Kernel->Mutex);
4946  if (ParamName == PI_KERNEL_MAX_SUB_GROUP_SIZE) {
4947  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize});
4948  } else if (ParamName == PI_KERNEL_MAX_NUM_SUB_GROUPS) {
4949  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups});
4950  } else if (ParamName == PI_KERNEL_COMPILE_NUM_SUB_GROUPS) {
4951  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups});
4952  } else if (ParamName == PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL) {
4953  return ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize});
4954  } else {
4955  die("piKernelGetSubGroupInfo: parameter not implemented");
4956  return {};
4957  }
4958  return PI_SUCCESS;
4959 }
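// Example (sketch; maxSubGroupSize is hypothetical): querying the maximum
// sub-group size of a kernel; this query takes no input value.
static pi_result maxSubGroupSize(pi_kernel Kernel, pi_device Device,
                                 uint32_t &Size) {
  return piKernelGetSubGroupInfo(Kernel, Device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
                                 0, nullptr, sizeof(Size), &Size, nullptr);
}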
4960 
4961 pi_result piKernelRetain(pi_kernel Kernel) {
4962 
4963  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4964 
4965  Kernel->RefCount.increment();
4966  return PI_SUCCESS;
4967 }
4968 
4969 pi_result piKernelRelease(pi_kernel Kernel) {
4970  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4971 
4972  if (IndirectAccessTrackingEnabled) {
4973  // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as
4974  // kernel execution has finished. This is the place where we need to release
4975  // memory allocations. If kernel is not in use (not submitted by some
4976  // other thread) then release referenced memory allocations. As a result,
4977  // memory can be deallocated and context can be removed from container in
4978  // the platform. That's why we need to lock a mutex here.
4979  pi_platform Plt = Kernel->Program->Context->getPlatform();
4980  std::scoped_lock ContextsLock(Plt->ContextsMutex);
4981 
4982  if (--Kernel->SubmissionsCount == 0) {
4983  // Kernel is not submitted for execution, release referenced memory
4984  // allocations.
4985  for (auto &MemAlloc : Kernel->MemAllocs) {
4986  USMFreeHelper(MemAlloc->second.Context, MemAlloc->first,
4987  MemAlloc->second.OwnZeMemHandle);
4988  }
4989  Kernel->MemAllocs.clear();
4990  }
4991  }
4992 
4993  if (!Kernel->RefCount.decrementAndTest())
4994  return PI_SUCCESS;
4995 
4996  auto KernelProgram = Kernel->Program;
4997  if (Kernel->OwnZeKernel)
4998  ZE_CALL(zeKernelDestroy, (Kernel->ZeKernel));
4999  if (IndirectAccessTrackingEnabled) {
5000  PI_CALL(piContextRelease(KernelProgram->Context));
5001  }
5002  // do a release on the program this kernel was part of
5003  PI_CALL(piProgramRelease(KernelProgram));
5004  delete Kernel;
5005 
5006  return PI_SUCCESS;
5007 }
5008 
5009 pi_result
5010 piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
5011  const size_t *GlobalWorkOffset,
5012  const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
5013  pi_uint32 NumEventsInWaitList,
5014  const pi_event *EventWaitList, pi_event *Event) {
5015  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5016  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5017  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5018  PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);
5019 
5020  // Lock automatically releases when this goes out of scope.
5021  std::scoped_lock Lock(Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
5022  if (GlobalWorkOffset != NULL) {
5023  if (!PiDriverGlobalOffsetExtensionFound) {
5024  zePrint("No global offset extension found on this driver\n");
5025  return PI_ERROR_INVALID_VALUE;
5026  }
5027 
5028  ZE_CALL(zeKernelSetGlobalOffsetExp,
5029  (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
5030  GlobalWorkOffset[2]));
5031  }
5032 
5033  // If there are any pending arguments set them now.
5034  for (auto &Arg : Kernel->PendingArguments) {
5035  char **ZeHandlePtr;
5036  PI_CALL(
5037  Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, Queue->Device));
5038  ZE_CALL(zeKernelSetArgumentValue,
5039  (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
5040  }
5041  Kernel->PendingArguments.clear();
5042 
5043  ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
5044  uint32_t WG[3];
5045 
5046  // global_work_size of unused dimensions must be set to 1
5047  PI_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, PI_ERROR_INVALID_VALUE);
5048  PI_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, PI_ERROR_INVALID_VALUE);
5049 
5050  if (LocalWorkSize) {
5051  WG[0] = pi_cast<uint32_t>(LocalWorkSize[0]);
5052  WG[1] = pi_cast<uint32_t>(LocalWorkSize[1]);
5053  WG[2] = pi_cast<uint32_t>(LocalWorkSize[2]);
5054  } else {
5055  ZE_CALL(zeKernelSuggestGroupSize,
5056  (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
5057  GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
5058  }
5059 
5060  // TODO: assert if sizes do not fit into 32-bit?
5061  switch (WorkDim) {
5062  case 3:
5063  ZeThreadGroupDimensions.groupCountX =
5064  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
5065  ZeThreadGroupDimensions.groupCountY =
5066  pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
5067  ZeThreadGroupDimensions.groupCountZ =
5068  pi_cast<uint32_t>(GlobalWorkSize[2] / WG[2]);
5069  break;
5070  case 2:
5071  ZeThreadGroupDimensions.groupCountX =
5072  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
5073  ZeThreadGroupDimensions.groupCountY =
5074  pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
5075  WG[2] = 1;
5076  break;
5077  case 1:
5078  ZeThreadGroupDimensions.groupCountX =
5079  pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
5080  WG[1] = WG[2] = 1;
5081  break;
5082 
5083  default:
5084  zePrint("piEnqueueKernelLaunch: unsupported work_dim\n");
5085  return PI_ERROR_INVALID_VALUE;
5086  }
5087 
5088  // Error handling for non-uniform group size case
5089  if (GlobalWorkSize[0] != (ZeThreadGroupDimensions.groupCountX * WG[0])) {
5090  zePrint("piEnqueueKernelLaunch: invalid work size. The range is not a "
5091  "multiple of the group size in the 1st dimension\n");
5092  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5093  }
5094  if (GlobalWorkSize[1] != (ZeThreadGroupDimensions.groupCountY * WG[1])) {
5095  zePrint("piEnqueueKernelLaunch: invalid work size. The range is not a "
5096  "multiple of the group size in the 2nd dimension\n");
5097  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5098  }
5099  if (GlobalWorkSize[2] != (ZeThreadGroupDimensions.groupCountZ * WG[2])) {
5100  zePrint("piEnqueueKernelLaunch: invalid work size. The range is not a "
5101  "multiple of the group size in the 3rd dimension\n");
5102  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5103  }
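  // For example, GlobalWorkSize[0] == 1000 with WG[0] == 64 yields
  // groupCountX == 15, and 15 * 64 == 960 != 1000, so the launch is rejected;
  // GlobalWorkSize[0] == 1024 divides evenly into 16 groups and passes.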
5104 
5105  ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
5106 
5107  bool UseCopyEngine = false;
5108  _pi_ze_event_list_t TmpWaitList;
5109  if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
5110  NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
5111  return Res;
5112 
5113  // Get a new command list to be used on this call
5114  pi_command_list_ptr_t CommandList{};
5115  if (auto Res = Queue->Context->getAvailableCommandList(
5116  Queue, CommandList, UseCopyEngine, true /* AllowBatching */))
5117  return Res;
5118 
5119  ze_event_handle_t ZeEvent = nullptr;
5120  auto Res = createEventAndAssociateQueue(
5121  Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList);
5122  if (Res != PI_SUCCESS)
5123  return Res;
5124  ZeEvent = (*Event)->ZeEvent;
5125  (*Event)->WaitList = TmpWaitList;
5126 
5127  // Save the kernel in the event, so that when the event is signalled
5128  // the code can do a piKernelRelease on this kernel.
5129  (*Event)->CommandData = (void *)Kernel;
5130 
5131  // Increment the reference count of the Kernel and indicate that the Kernel is
5132  // in use. Once the event has been signalled, the code in
5133  // CleanupCompletedEvent(Event) will do a piKernelRelease to update the
5134  // reference count on the kernel, using the kernel saved in CommandData.
5135  PI_CALL(piKernelRetain(Kernel));
5136 
5137  // Add to list of kernels to be submitted
5138  if (IndirectAccessTrackingEnabled)
5139  Queue->KernelsToBeSubmitted.push_back(Kernel);
5140 
5141  if (UseImmediateCommandLists && IndirectAccessTrackingEnabled) {
5142  // If using immediate commandlists then gathering of indirect
5143  // references and appending to the queue (which means submission)
5144  // must be done together.
5145  std::unique_lock<pi_shared_mutex> ContextsLock(
5146  Queue->Device->Platform->ContextsMutex, std::defer_lock);
5147  // We are going to submit kernels for execution. If indirect access flag is
5148  // set for a kernel then we need to make a snapshot of existing memory
5149  // allocations in all contexts in the platform. We need to lock the mutex
5150  // guarding the list of contexts in the platform to prevent creation of new
5151  // memory allocations in any context before we submit the kernel for
5152  // execution.
5153  ContextsLock.lock();
5154  Queue->CaptureIndirectAccesses();
5155  // Add the command to the command list, which implies submission.
5156  ZE_CALL(zeCommandListAppendLaunchKernel,
5157  (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
5158  ZeEvent, (*Event)->WaitList.Length,
5159  (*Event)->WaitList.ZeEventList));
5160  } else {
5161  // Add the command to the command list for later submission.
5162  // No lock is needed here, unlike the immediate commandlist case above,
5163  // because the kernels are not actually submitted yet. Kernels will be
5164  // submitted only when the commandlist is closed. Then, a lock is held.
5165  ZE_CALL(zeCommandListAppendLaunchKernel,
5166  (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
5167  ZeEvent, (*Event)->WaitList.Length,
5168  (*Event)->WaitList.ZeEventList));
5169  }
5170 
5171  zePrint("calling zeCommandListAppendLaunchKernel() with"
5172  " ZeEvent %#lx\n",
5173  pi_cast<std::uintptr_t>(ZeEvent));
5174  printZeEventList((*Event)->WaitList);
5175 
5176  // Execute command list asynchronously, as the event will be used
5177  // to track down its completion.
5178  if (auto Res = Queue->executeCommandList(CommandList, false, true))
5179  return Res;
5180 
5181  return PI_SUCCESS;
5182 }
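// Example (sketch; launch1D is hypothetical): a minimal 1-D launch through
// the entry point above. Unused dimensions of the global range must be 1.
static pi_result launch1D(pi_queue Queue, pi_kernel Kernel, size_t N) {
  const size_t GlobalSize[3] = {N, 1, 1};
  pi_event Event = nullptr;
  return piEnqueueKernelLaunch(Queue, Kernel, /*WorkDim=*/1,
                               /*GlobalWorkOffset=*/nullptr, GlobalSize,
                               /*LocalWorkSize=*/nullptr, 0, nullptr, &Event);
}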
5183 
5184 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
5185  pi_context Context,
5186  pi_program Program,
5187  bool OwnNativeHandle,
5188  pi_kernel *Kernel) {
5189  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
5190  PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
5191  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5192  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5193 
5194  auto ZeKernel = pi_cast<ze_kernel_handle_t>(NativeHandle);
5195  *Kernel = new _pi_kernel(ZeKernel, OwnNativeHandle, Program);
5196  PI_CALL((*Kernel)->initialize());
5197  return PI_SUCCESS;
5198 }
5199 
5200 pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
5201  pi_native_handle *NativeHandle) {
5202  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
5203  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5204 
5205  std::shared_lock Guard(Kernel->Mutex);
5206  auto *ZeKernel = pi_cast<ze_kernel_handle_t *>(NativeHandle);
5207  *ZeKernel = Kernel->ZeKernel;
5208  return PI_SUCCESS;
5209 }
5210 
5211 //
5212 // Events
5213 //
5214 pi_result
5215 _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) {
5216  PI_ASSERT(Queue, PI_ERROR_INVALID_EVENT);
5217 
5218  std::scoped_lock Lock(Queue->Mutex, this->Mutex);
5219 
5220  if (!HostVisibleEvent) {
5221  if (EventsScope != OnDemandHostVisibleProxy)
5222  die("getOrCreateHostVisibleEvent: missing host-visible event");
5223 
5224  // Submit the command(s) signalling the proxy event to the queue.
5225  // We have to first submit a wait for the device-only event for which this
5226  // proxy is created.
5227  //
5228  // Get a new command list to be used on this call
5229 
5230  // We want to batch these commands to avoid extra submissions (costly)
5231  bool OkToBatch = true;
5232 
5233  pi_command_list_ptr_t CommandList{};
5234  if (auto Res = Queue->Context->getAvailableCommandList(
5235  Queue, CommandList, false /* UseCopyEngine */, OkToBatch))
5236  return Res;
5237 
5238  // Create a "proxy" host-visible event.
5239  auto Res = createEventAndAssociateQueue(
5240  Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, true);
5241  // HostVisibleEvent->CleanedUp = true;
5242  if (Res != PI_SUCCESS)
5243  return Res;
5244 
5245  ZE_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent));
5246  ZE_CALL(zeCommandListAppendSignalEvent,
5247  (CommandList->first, HostVisibleEvent->ZeEvent));
5248 
5249  if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch))
5250  return Res;
5251  }
5252 
5253  ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
5254  return PI_SUCCESS;
5255 }
5256 
5257 // Helper function for creating a PI event.
5258 // The "Queue" argument specifies the PI queue where a command is submitted.
5259 // The "HostVisible" argument specifies if event needs to be allocated from
5260 // a host-visible pool.
5261 //
5262 static pi_result EventCreate(pi_context Context, pi_queue Queue,
5263  bool HostVisible, pi_event *RetEvent) {
5264 
5265  bool ProfilingEnabled =
5266  !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0;
5267 
5268  size_t Index = 0;
5269  ze_event_pool_handle_t ZeEventPool = {};
5270  if (auto Res = Context->getFreeSlotInExistingOrNewPool(
5271  ZeEventPool, Index, HostVisible, ProfilingEnabled))
5272  return Res;
5273 
5274  ze_event_handle_t ZeEvent;
5275  ZeStruct<ze_event_desc_t> ZeEventDesc;
5276  ZeEventDesc.index = Index;
5277  ZeEventDesc.wait = 0;
5278 
5279  if (HostVisible) {
5280  ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
5281  } else {
5282  //
5283  // Set the scope to "device" for every event. This is sufficient for global
5284  // device access and peer device access. If needed to be seen on the host
5285  // we are doing special handling, see EventsScope options.
5286  //
5287  // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
5288  // used in some circumstances.
5289  //
5290  ZeEventDesc.signal = 0;
5291  }
5292 
5293  ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
5294 
5295  try {
5296  PI_ASSERT(RetEvent, PI_ERROR_INVALID_VALUE);
5297 
5298  *RetEvent = new _pi_event(ZeEvent, ZeEventPool, Context,
5299  PI_COMMAND_TYPE_USER, true);
5300  } catch (const std::bad_alloc &) {
5301  return PI_ERROR_OUT_OF_HOST_MEMORY;
5302  } catch (...) {
5303  return PI_ERROR_UNKNOWN;
5304  }
5305 
5306  if (HostVisible)
5307  (*RetEvent)->HostVisibleEvent = *RetEvent;
5308 
5309  return PI_SUCCESS;
5310 }
5311 
5312 // External PI API entry
5313 pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
5314  return EventCreate(Context, nullptr, EventsScope == AllHostVisible, RetEvent);
5315 }
5316 
5317 pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
5318  size_t ParamValueSize, void *ParamValue,
5319  size_t *ParamValueSizeRet) {
5320 
5321  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5322 
5323  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
5324  switch (ParamName) {
5325  case PI_EVENT_INFO_COMMAND_QUEUE: {
5326  std::shared_lock EventLock(Event->Mutex);
5327  return ReturnValue(pi_queue{Event->Queue});
5328  }
5329  case PI_EVENT_INFO_CONTEXT: {
5330  std::shared_lock EventLock(Event->Mutex);
5331  return ReturnValue(pi_context{Event->Context});
5332  }
5333  case PI_EVENT_INFO_COMMAND_TYPE: {
5334  std::shared_lock EventLock(Event->Mutex);
5335  return ReturnValue(pi_cast<pi_uint64>(Event->CommandType));
5336  }
5337  case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
5338  // Check to see if the event's Queue has an open command list due to
5339  // batching. If so, go ahead and close and submit it, because it is
5340  // possible that this is trying to query some event's status that
5341  // is part of the batch. This isn't strictly required, but it seems
5342  // like a reasonable thing to do.
5343  auto Queue = Event->Queue;
5344  if (Queue) {
5345  // Lock automatically releases when this goes out of scope.
5346  std::scoped_lock lock(Queue->Mutex);
5347  const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
5348  if (OpenCommandList != Queue->CommandListMap.end()) {
5349  if (auto Res = Queue->executeOpenCommandList(
5350  OpenCommandList->second.isCopy(Queue)))
5351  return Res;
5352  }
5353  }
5354 
5355  // Level Zero has a much more explicit notion of command submission than
5356  // OpenCL. It doesn't happen unless the user submits a command list. We've
5357  // done it just above so the status is at least PI_EVENT_RUNNING.
5358  pi_int32 Result = PI_EVENT_RUNNING;
5359 
5360  // Make sure that we query a host-visible event only.
5361  // If one hasn't been created yet, don't create it here either, and
5362  // just conservatively report that the event is not yet completed.
5363  std::shared_lock EventLock(Event->Mutex);
5364  auto HostVisibleEvent = Event->HostVisibleEvent;
5365  if (Event->Completed) {
5366  Result = PI_EVENT_COMPLETE;
5367  } else if (HostVisibleEvent) {
5368  ze_result_t ZeResult;
5369  ZeResult =
5370  ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent));
5371  if (ZeResult == ZE_RESULT_SUCCESS) {
5372  Result = PI_EVENT_COMPLETE;
5373  }
5374  }
5375  return ReturnValue(pi_cast<pi_int32>(Result));
5376  }
5377  case PI_EVENT_INFO_REFERENCE_COUNT:
5378  return ReturnValue(pi_uint32{Event->RefCount.load()});
5379  default:
5380  zePrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n",
5381  ParamName, ParamName);
5382  return PI_ERROR_INVALID_VALUE;
5383  }
5384 
5385  return PI_SUCCESS;
5386 }
5387 
5388 pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
5389  size_t ParamValueSize, void *ParamValue,
5390  size_t *ParamValueSizeRet) {
5391 
5392  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5393 
5394  std::shared_lock EventLock(Event->Mutex);
5395  if (Event->Queue &&
5396  (Event->Queue->Properties & PI_QUEUE_PROFILING_ENABLE) == 0) {
5397  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
5398  }
5399 
5400  pi_device Device =
5401  Event->Queue ? Event->Queue->Device : Event->Context->Devices[0];
5402 
5403  uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution;
5404  const uint64_t TimestampMaxValue =
5405  ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL);
5406 
5407  ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
5408 
5409  ze_kernel_timestamp_result_t tsResult;
5410 
5411  switch (ParamName) {
5412  case PI_PROFILING_INFO_COMMAND_START: {
5413  ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
5414  uint64_t ContextStartTime =
5415  (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution;
5416  return ReturnValue(ContextStartTime);
5417  }
5418  case PI_PROFILING_INFO_COMMAND_END: {
5419  ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
5420 
5421  uint64_t ContextStartTime =
5422  (tsResult.global.kernelStart & TimestampMaxValue);
5423  uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue);
5424 
5425  //
5426  // Handle a possible wrap-around (the underlying HW counter is < 64-bit).
5427  // Note, it will not report correct time if there were multiple wrap
5428  // arounds, and the longer term plan is to enlarge the capacity of the
5429  // HW timestamps.
5430  //
5431  if (ContextEndTime <= ContextStartTime) {
5432  ContextEndTime += TimestampMaxValue;
5433  }
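  // Worked example (sketch), assuming kernelTimestampValidBits == 32, i.e.
  // TimestampMaxValue == 2^32 - 1: if kernelStart masks to 0xFFFFF000 and
  // kernelEnd masks to 0x200, then End <= Start, and End becomes
  // 0x200 + (2^32 - 1) == 0x1000001FF, so End - Start == 0x11FF ticks
  // before scaling by the timer resolution.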
5434  ContextEndTime *= ZeTimerResolution;
5435  return ReturnValue(ContextEndTime);
5436  }
5437  case PI_PROFILING_INFO_COMMAND_QUEUED:
5438  case PI_PROFILING_INFO_COMMAND_SUBMIT:
5439  // TODO: Support these when Level Zero support is added.
5440  return ReturnValue(uint64_t{0});
5441  default:
5442  zePrint("piEventGetProfilingInfo: not supported ParamName\n");
5443  return PI_ERROR_INVALID_VALUE;
5444  }
5445 
5446  return PI_SUCCESS;
5447 }
5448 
5449 } // extern "C"
5450 
5451 // Perform any necessary cleanup after an event has been signalled.
5452 // This currently makes sure to release any kernel that may have been used by
5453 // the event, updates the last command event in the queue, and cleans up all
5454 // dependent events of the event.
5455 // The caller must not lock any mutexes.
5456 static pi_result CleanupCompletedEvent(pi_event Event) {
5457  pi_kernel AssociatedKernel = nullptr;
5458  // List of dependent events.
5459  std::list<pi_event> EventsToBeReleased;
5460  pi_queue AssociatedQueue = nullptr;
5461  {
5462  std::scoped_lock EventLock(Event->Mutex);
5463  // Exit early if the event was already cleaned up.
5464  if (Event->CleanedUp)
5465  return PI_SUCCESS;
5466 
5467  AssociatedQueue = Event->Queue;
5468 
5469  // Remember the kernel associated with this event if there is one. We are
5470  // going to release it later.
5471  if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
5472  Event->CommandData) {
5473  AssociatedKernel = pi_cast<pi_kernel>(Event->CommandData);
5474  Event->CommandData = nullptr;
5475  }
5476 
5477  // Make a list of all the dependent events that must have signalled
5478  // because this event was dependent on them.
5479  Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList(
5480  EventsToBeReleased);
5481 
5482  Event->CleanedUp = true;
5483  }
5484 
5485  // We've reset event data members above, now cleanup resources.
5486  if (AssociatedKernel)
5487  PI_CALL(piKernelRelease(AssociatedKernel));
5488 
5489  if (AssociatedQueue) {
5490  {
5491  // Lock automatically releases when this goes out of scope.
5492  std::scoped_lock Lock(AssociatedQueue->Mutex);
5493 
5494  // If this event was the LastCommandEvent in the queue, being used
5495  // to make sure that commands were executed in-order, remove this.
5496  // If we don't do this, the event can get released and freed leaving
5497  // a dangling pointer to this event. It could also cause unneeded
5498  // already finished events to show up in the wait list.
5499  if (AssociatedQueue->LastCommandEvent == Event) {
5500  AssociatedQueue->LastCommandEvent = nullptr;
5501  }
5502  }
5503 
5504  // Release this event since we explicitly retained it on creation and
5505  // association with the queue. Events that have no associated queue don't
5506  // require this release because they were not created through
5507  // createEventAndAssociateQueue, i.e. no additional retain was made.
5508  PI_CALL(piEventRelease(Event));
5509  }
5510 
5511  // The list of dependent events is appended to as we walk it, keeping this
5512  // algorithm iterative. Dependent events may themselves depend on other
5513  // events, forming a potentially very deep tree, and deep recursion
5514  // turned out to be a significant problem with the recursive code that
5515  // preceded this implementation.
5516  while (!EventsToBeReleased.empty()) {
5517  pi_event DepEvent = EventsToBeReleased.front();
5518  EventsToBeReleased.pop_front();
5519 
5520  pi_kernel DepEventKernel = nullptr;
5521  {
5522  std::scoped_lock DepEventLock(DepEvent->Mutex);
5523  DepEvent->WaitList.collectEventsForReleaseAndDestroyPiZeEventList(
5524  EventsToBeReleased);
5525  if (IndirectAccessTrackingEnabled) {
5526  // DepEvent has finished, we can release the associated kernel if there
5527  // is one. This is the earliest place we can do this and it can't be
5528  // done twice, so it is safe. Lock automatically releases when this goes
5529  // out of scope.
5530  // TODO: this code needs to be moved out of the guard.
5531  if (DepEvent->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
5532  DepEvent->CommandData) {
5533  DepEventKernel = pi_cast<pi_kernel>(DepEvent->CommandData);
5534  DepEvent->CommandData = nullptr;
5535  }
5536  }
5537  }
5538  if (DepEventKernel)
5539  PI_CALL(piKernelRelease(DepEventKernel));
5540  PI_CALL(piEventRelease(DepEvent));
5541  }
5542 
5543  return PI_SUCCESS;
5544 }
5545 
5546 extern "C" {
5547 
5548 pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
5549 
5550  if (NumEvents && !EventList) {
5551  return PI_ERROR_INVALID_EVENT;
5552  }
5553  if (EventsScope == OnDemandHostVisibleProxy) {
5554  // Make sure to add all host-visible "proxy" event signals if needed.
5555  // This ensures that all signalling commands are submitted below and
5556  // thus proxy events can be waited on without a deadlock.
5557  //
5558  for (uint32_t I = 0; I < NumEvents; I++) {
5559  ze_event_handle_t ZeHostVisibleEvent;
5560  if (auto Res =
5561  EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
5562  return Res;
5563  }
5564  }
5565  // Submit dependent open command lists for execution, if any
5566  for (uint32_t I = 0; I < NumEvents; I++) {
5567  auto Queue = EventList[I]->Queue;
5568  if (Queue) {
5569  // Lock automatically releases when this goes out of scope.
5570  std::scoped_lock lock(Queue->Mutex);
5571 
5572  if (auto Res = Queue->executeAllOpenCommandLists())
5573  return Res;
5574  }
5575  }
5576  std::unordered_set<pi_queue> Queues;
5577  for (uint32_t I = 0; I < NumEvents; I++) {
5578  {
5579  std::shared_lock EventLock(EventList[I]->Mutex);
5580  if (!EventList[I]->Completed) {
5581  auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
5582  if (!HostVisibleEvent)
5583  die("The host-visible proxy event missing");
5584 
5585  ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
5586  zePrint("ZeEvent = %#lx\n", pi_cast<std::uintptr_t>(ZeEvent));
5587  ZE_CALL(zeHostSynchronize, (ZeEvent));
5588  }
5589  if (auto Q = EventList[I]->Queue)
5590  Queues.insert(Q);
5591  }
5592  // NOTE: we are cleaning up after the event here to free resources
5593  // sooner in case run-time is not calling piEventRelease soon enough.
5594  CleanupCompletedEvent(EventList[I]);
5595  }
5596 
5597  // We waited some events above, check queue for signaled command lists and
5598  // reset them.
5599  for (auto Q : Queues)
5600  resetCommandLists(Q);
5601 
5602  return PI_SUCCESS;
5603 }
5604 
5605 pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType,
5606  void (*PFnNotify)(pi_event Event,
5607  pi_int32 EventCommandStatus,
5608  void *UserData),
5609  void *UserData) {
5610  (void)Event;
5611  (void)CommandExecCallbackType;
5612  (void)PFnNotify;
5613  (void)UserData;
5614  die("piEventSetCallback: deprecated, to be removed");
5615  return PI_SUCCESS;
5616 }
5617 
5618 pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) {
5619  (void)Event;
5620  (void)ExecutionStatus;
5621  die("piEventSetStatus: deprecated, to be removed");
5622  return PI_SUCCESS;
5623 }
5624 
5625 pi_result piEventRetain(pi_event Event) {
5626  Event->RefCount.increment();
5627  return PI_SUCCESS;
5628 }
5629 
5630 pi_result piEventRelease(pi_event Event) {
5631  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5632 
5633  if (!Event->RefCount.decrementAndTest())
5634  return PI_SUCCESS;
5635 
5636  if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_MAP &&
5637  Event->CommandData) {
5638  // Free the memory allocated in the piEnqueueMemBufferMap.
5639  if (auto Res = ZeMemFreeHelper(Event->Context, Event->CommandData))
5640  return Res;
5641  Event->CommandData = nullptr;
5642  }
5643  if (Event->OwnZeEvent) {
5644  ZE_CALL(zeEventDestroy, (Event->ZeEvent));
5645  }
5646  // It is possible that the host-visible event was never created.
5647  // In case it was, check whether it is different from this event
5648  // and release a reference to it.
5649  if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) {
5650  // Decrement ref-count of the host-visible proxy event.
5651  PI_CALL(piEventRelease(Event->HostVisibleEvent));
5652  }
5653 
5654  auto Context = Event->Context;
5655  if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
5656  return Res;
5657 
5658  // We intentionally incremented the reference counter when an event is
5659  // created so that we can avoid pi_queue is released before the associated
5660  // pi_event is released. Here we have to decrement it so pi_queue
5661  // can be released successfully.
5662  if (Event->Queue) {
5663  PI_CALL(piQueueReleaseInternal(Event->Queue));
5664  }
5665  delete Event;
5666 
5667  return PI_SUCCESS;
5668 }
5669 
5670 pi_result piextEventGetNativeHandle(pi_event Event,
5671  pi_native_handle *NativeHandle) {
5672  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5673  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5674 
5675  {
5676  std::shared_lock Lock(Event->Mutex);
5677  auto *ZeEvent = pi_cast<ze_event_handle_t *>(NativeHandle);
5678  *ZeEvent = Event->ZeEvent;
5679  }
5680  // Event can potentially be in an open command-list, make sure that
5681  // it is submitted for execution to avoid potential deadlock if
5682  // interop app is going to wait for it.
5683  auto Queue = Event->Queue;
5684  if (Queue) {
5685  std::scoped_lock lock(Queue->Mutex);
5686  const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
5687  if (OpenCommandList != Queue->CommandListMap.end()) {
5688  if (auto Res = Queue->executeOpenCommandList(
5689  OpenCommandList->second.isCopy(Queue)))
5690  return Res;
5691  }
5692  }
5693  return PI_SUCCESS;
5694 }
5695 
5696 pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle,
5697  pi_context Context,
5698  bool OwnNativeHandle,
5699  pi_event *Event) {
5700  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
5701  PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5702  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5703 
5704  auto ZeEvent = pi_cast<ze_event_handle_t>(NativeHandle);
5705  *Event = new _pi_event(ZeEvent, nullptr /* ZeEventPool */, Context,
5706  PI_COMMAND_TYPE_USER, OwnNativeHandle);
5707 
5708  // Assume native event is host-visible, or otherwise we'd
5709  // need to create a host-visible proxy for it.
5710  (*Event)->HostVisibleEvent = *Event;
5711 
5712  // Unlike regular events managed by SYCL RT we don't have to wait for interop
5713  // event completion, and don't need to do their `cleanup()`. This in
5714  // particular guarantees that the extra `piEventRelease` is not called on
5715  // them. That release is needed to match the `piEventRetain` made for regular
5716  // events to wait for their completion, but not for this interop event.
5717  (*Event)->CleanedUp = true;
5718 
5719  return PI_SUCCESS;
5720 }
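// Example (sketch; wrapNativeEvent is hypothetical): wrapping an existing
// Level Zero event, e.g. one created by zeEventCreate elsewhere, without
// transferring ownership to the PI runtime.
static pi_result wrapNativeEvent(pi_context Context, ze_event_handle_t Ze,
                                 pi_event *Event) {
  return piextEventCreateWithNativeHandle(pi_cast<pi_native_handle>(Ze),
                                          Context, false, Event);
}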
5721 
5722 //
5723 // Sampler
5724 //
5725 pi_result piSamplerCreate(pi_context Context,
5726  const pi_sampler_properties *SamplerProperties,
5727  pi_sampler *RetSampler) {
5728 
5729  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
5730  PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE);
5731 
5732  std::shared_lock Lock(Context->Mutex);
5733 
5734  // Have the "0" device in context to own the sampler. Rely on Level-Zero
5735  // drivers to perform migration as necessary for sharing it across multiple
5736  // devices in the context.
5737  //
5738  // TODO: figure out if we instead need explicit copying for accessing
5739  // the sampler from other devices in the context.
5740  //
5741  pi_device Device = Context->Devices[0];
5742 
5743  ze_sampler_handle_t ZeSampler;
5744  ZeStruct<ze_sampler_desc_t> ZeSamplerDesc;
5745 
5746  // Set the default values for the ZeSamplerDesc.
5747  ZeSamplerDesc.isNormalized = PI_TRUE;
5748  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP;
5749  ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
5750 
5751  // Update the values of the ZeSamplerDesc from the pi_sampler_properties list.
5752  // Default values will be used if any of the following is true:
5753  // a) SamplerProperties list is NULL
5754  // b) SamplerProperties list is missing any properties
5755 
5756  if (SamplerProperties) {
5757  const pi_sampler_properties *CurProperty = SamplerProperties;
5758 
5759  while (*CurProperty != 0) {
5760  switch (*CurProperty) {
5761  case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: {
5762  pi_bool CurValueBool = pi_cast<pi_bool>(*(++CurProperty));
5763 
5764  if (CurValueBool == PI_TRUE)
5765  ZeSamplerDesc.isNormalized = PI_TRUE;
5766  else if (CurValueBool == PI_FALSE)
5767  ZeSamplerDesc.isNormalized = PI_FALSE;
5768  else {
5769  zePrint("piSamplerCreate: unsupported "
5770  "PI_SAMPLER_NORMALIZED_COORDS value\n");
5771  return PI_ERROR_INVALID_VALUE;
5772  }
5773  } break;
5774 
5775  case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: {
5776  pi_sampler_addressing_mode CurValueAddressingMode =
5777  pi_cast<pi_sampler_addressing_mode>(
5778  pi_cast<pi_uint32>(*(++CurProperty)));
5779 
5780  // Level Zero runtime with API version 1.2 and lower has a bug:
5781  // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to
5782