DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_level_zero.hpp
Go to the documentation of this file.
1 //===--------- pi_level_zero.hpp - Level Zero Plugin ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===-----------------------------------------------------------------===//
8 
11 
17 
18 #ifndef PI_LEVEL_ZERO_HPP
19 #define PI_LEVEL_ZERO_HPP
20 
21 // This version should be incremented for any change made to this file or its
22 // corresponding .cpp file.
23 #define _PI_LEVEL_ZERO_PLUGIN_VERSION 1
24 
25 #define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \
26  _PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION)
27 
28 #include <CL/sycl/detail/pi.h>
29 #include <atomic>
30 #include <cassert>
31 #include <cstring>
32 #include <functional>
33 #include <iostream>
34 #include <list>
35 #include <map>
36 #include <memory>
37 #include <mutex>
38 #include <shared_mutex>
39 #include <string>
40 #include <unordered_map>
41 #include <unordered_set>
42 #include <vector>
43 
44 #include <level_zero/ze_api.h>
45 #include <level_zero/zes_api.h>
46 
47 #include "usm_allocator.hpp"
48 
49 template <class To, class From> To pi_cast(From Value) {
50  // TODO: see if more sanity checks are possible.
51  assert(sizeof(From) == sizeof(To));
52  return (To)(Value);
53 }
54 
55 template <> uint32_t pi_cast(uint64_t Value) {
56  // Cast value and check that we don't lose any information.
57  uint32_t CastedValue = (uint32_t)(Value);
58  assert((uint64_t)CastedValue == Value);
59  return CastedValue;
60 }
61 
62 // TODO: Currently die is defined in each plugin. Probably some
63 // common header file with utilities should be created.
64 [[noreturn]] void die(const char *Message) {
65  std::cerr << "die: " << Message << std::endl;
66  std::terminate();
67 }
68 
69 // Returns the ze_structure_type_t to use in .stype of a structured descriptor.
70 // Intentionally not defined; will give an error if no proper specialization
71 template <class T> ze_structure_type_t getZeStructureType();
72 template <class T> zes_structure_type_t getZesStructureType();
73 
74 template <> ze_structure_type_t getZeStructureType<ze_event_pool_desc_t>() {
75  return ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
76 }
77 template <> ze_structure_type_t getZeStructureType<ze_fence_desc_t>() {
78  return ZE_STRUCTURE_TYPE_FENCE_DESC;
79 }
80 template <> ze_structure_type_t getZeStructureType<ze_command_list_desc_t>() {
81  return ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC;
82 }
83 template <> ze_structure_type_t getZeStructureType<ze_context_desc_t>() {
84  return ZE_STRUCTURE_TYPE_CONTEXT_DESC;
85 }
86 template <>
87 ze_structure_type_t
89  return ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC;
90 }
91 template <> ze_structure_type_t getZeStructureType<ze_host_mem_alloc_desc_t>() {
92  return ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
93 }
94 template <>
96  return ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
97 }
98 template <> ze_structure_type_t getZeStructureType<ze_command_queue_desc_t>() {
99  return ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
100 }
101 template <> ze_structure_type_t getZeStructureType<ze_image_desc_t>() {
102  return ZE_STRUCTURE_TYPE_IMAGE_DESC;
103 }
104 template <> ze_structure_type_t getZeStructureType<ze_module_desc_t>() {
105  return ZE_STRUCTURE_TYPE_MODULE_DESC;
106 }
107 template <>
109  return ZE_STRUCTURE_TYPE_MODULE_PROGRAM_EXP_DESC;
110 }
111 template <> ze_structure_type_t getZeStructureType<ze_kernel_desc_t>() {
112  return ZE_STRUCTURE_TYPE_KERNEL_DESC;
113 }
114 template <> ze_structure_type_t getZeStructureType<ze_event_desc_t>() {
115  return ZE_STRUCTURE_TYPE_EVENT_DESC;
116 }
117 template <> ze_structure_type_t getZeStructureType<ze_sampler_desc_t>() {
118  return ZE_STRUCTURE_TYPE_SAMPLER_DESC;
119 }
120 template <> ze_structure_type_t getZeStructureType<ze_driver_properties_t>() {
121  return ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
122 }
123 template <> ze_structure_type_t getZeStructureType<ze_device_properties_t>() {
124  return ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
125 }
126 template <>
128  return ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
129 }
130 template <>
132  return ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
133 }
134 template <>
136  return ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
137 }
138 template <>
140  return ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
141 }
142 template <>
144  return ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
145 }
146 template <>
148  return ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
149 }
150 template <>
152  return ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
153 }
154 template <> ze_structure_type_t getZeStructureType<ze_module_properties_t>() {
155  return ZE_STRUCTURE_TYPE_MODULE_PROPERTIES;
156 }
157 template <> ze_structure_type_t getZeStructureType<ze_kernel_properties_t>() {
158  return ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
159 }
160 template <>
162  return ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
163 }
164 
165 template <> zes_structure_type_t getZesStructureType<zes_pci_properties_t>() {
166  return ZES_STRUCTURE_TYPE_PCI_PROPERTIES;
167 }
168 
169 // The helpers to properly default initialize Level-Zero descriptor and
170 // properties structures.
171 template <class T> struct ZeStruct : public T {
172  ZeStruct() : T{} { // zero initializes base struct
173  this->stype = getZeStructureType<T>();
174  this->pNext = nullptr;
175  }
176 };
177 template <class T> struct ZesStruct : public T {
178  ZesStruct() : T{} { // zero initializes base struct
179  this->stype = getZesStructureType<T>();
180  this->pNext = nullptr;
181  }
182 };
183 
184 // A single-threaded app has an opportunity to enable this mode to avoid
185 // overhead from mutex locking. Default value is 0 which means that single
186 // thread mode is disabled.
187 static const bool SingleThreadMode = [] {
188  const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE");
189  const bool RetVal = Ret ? std::stoi(Ret) : 0;
190  return RetVal;
191 }();
192 
193 // Class which acts like shared_mutex if SingleThreadMode variable is not set.
194 // If SingleThreadMode variable is set then mutex operations are turned into
195 // nop.
196 class pi_shared_mutex : public std::shared_mutex {
197 public:
198  void lock() {
199  if (!SingleThreadMode)
200  std::shared_mutex::lock();
201  }
202  bool try_lock() {
203  return SingleThreadMode ? true : std::shared_mutex::try_lock();
204  }
205  void unlock() {
206  if (!SingleThreadMode)
207  std::shared_mutex::unlock();
208  }
209 
210  void lock_shared() {
211  if (!SingleThreadMode)
212  std::shared_mutex::lock_shared();
213  }
215  return SingleThreadMode ? true : std::shared_mutex::try_lock_shared();
216  }
217  void unlock_shared() {
218  if (!SingleThreadMode)
219  std::shared_mutex::unlock_shared();
220  }
221 };
222 
223 // Class which acts like std::mutex if SingleThreadMode variable is not set.
224 // If SingleThreadMode variable is set then mutex operations are turned into
225 // nop.
226 class pi_mutex : public std::mutex {
227 public:
228  void lock() {
229  if (!SingleThreadMode)
230  std::mutex::lock();
231  }
232  bool try_lock() { return SingleThreadMode ? true : std::mutex::try_lock(); }
233  void unlock() {
234  if (!SingleThreadMode)
235  std::mutex::unlock();
236  }
237 };
238 
239 // The wrapper for immutable Level-Zero data.
240 // The data is initialized only once at first access (via ->) with the
241 // initialization function provided in Init. All subsequent access to
242 // the data just returns the already stored data.
243 //
244 template <class T> struct ZeCache : private T {
245  // The initialization function takes a reference to the data
246  // it is going to initialize, since it is private here in
247  // order to disallow access other than through "->".
248  //
249  typedef std::function<void(T &)> InitFunctionType;
251  bool Computed{false};
253 
254  ZeCache() : T{} {}
255 
256  // Access to the fields of the original T data structure.
257  T *operator->() {
258  std::unique_lock<pi_mutex> Lock(ZeCacheMutex);
259  if (!Computed) {
260  Compute(*this);
261  Computed = true;
262  }
263  return this;
264  }
265 };
266 
267 // This wrapper around std::atomic is created to limit operations with reference
268 // counter and to make allowed operations more transparent in terms of
269 // thread-safety in the plugin. increment() and load() operations do not need a
270 // mutex guard around them since the underlying data is already atomic.
271 // decrementAndTest() method is used to guard a code which needs to be
272 // executed when object's ref count becomes zero after release. This method also
273 // doesn't need a mutex guard because decrement operation is atomic and only one
274 // thread can reach ref count equal to zero, i.e. only a single thread can pass
275 // through this check.
277  ReferenceCounter(pi_uint32 InitVal) : RefCount{InitVal} {}
278 
279  // Used when retaining an object.
280  void increment() { RefCount++; }
281 
282  // Supposed to be used in pi*GetInfo* methods where ref count value is
283  // requested.
284  pi_uint32 load() { return RefCount.load(); }
285 
286  // This method allows to guard a code which needs to be executed when object's
287  // ref count becomes zero after release. It is important to notice that only a
288  // single thread can pass through this check. This is true because of several
289  // reasons:
290  // 1. Decrement operation is executed atomically.
291  // 2. It is not allowed to retain an object after its refcount reaches zero.
292  // 3. It is not allowed to release an object more times than the value of
293  // the ref count.
294  // 2. and 3. basically means that we can't use an object at all as soon as its
295  // refcount reaches zero. Using this check guarantees that code for deleting
296  // an object and releasing its resources is executed once by a single thread
297  // and we don't need to use any mutexes to guard access to this object in the
298  // scope after this check. Of course if we access another objects in this code
299  // (not the one which is being deleted) then access to these objects must be
300  // guarded, for example with a mutex.
301  bool decrementAndTest() { return --RefCount == 0; }
302 
303 private:
304  std::atomic<pi_uint32> RefCount;
305 };
306 
307 // Base class to store common data
308 struct _pi_object {
310 
311  // Level Zero doesn't do the reference counting, so we have to do.
312  // Must be atomic to prevent data race when incrementing/decrementing.
314 
315  // This mutex protects accesses to all the non-const member variables.
316  // Exclusive access is required to modify any of these members.
317  //
318  // To get shared access to the object in a scope use std::shared_lock:
319  // std::shared_lock Lock(Obj->Mutex);
320  // To get exclusive access to the object in a scope use std::scoped_lock:
321  // std::scoped_lock Lock(Obj->Mutex);
322  //
323  // If several pi objects are accessed in a scope then each object's mutex must
324  // be locked. For example, to get write access to Obj1 and Obj2 and read
325  // access to Obj3 in a scope use the following approach:
326  // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock);
327  // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock);
329 };
330 
331 // Record for a memory allocation. This structure is used to keep information
332 // for each memory allocation.
336  // Currently kernel can reference memory allocations from different contexts
337  // and we need to know the context of a memory allocation when we release it
338  // in piKernelRelease.
339  // TODO: this should go away when memory isolation issue is fixed in the Level
340  // Zero runtime.
342 
343  // Indicates if we own the native memory handle or it came from interop that
344  // asked to not transfer the ownership to SYCL RT.
346 };
347 
348 // Define the types that are opaque in pi.h in a manner suitabale for Level Zero
349 // plugin
350 
351 struct _pi_platform {
353  // Performs initialization of a newly constructed PI platform.
355 
356  // Level Zero lacks the notion of a platform, but there is a driver, which is
357  // a pretty good fit to keep here.
359 
360  // Cache versions info from zeDriverGetProperties.
361  std::string ZeDriverVersion;
362  std::string ZeDriverApiVersion;
363  ze_api_version_t ZeApiVersion;
364 
365  // Cache driver extensions
366  std::unordered_map<std::string, uint32_t> zeDriverExtensionMap;
367 
368  // Cache pi_devices for reuse
369  std::vector<std::unique_ptr<_pi_device>> PiDevicesCache;
371  bool DeviceCachePopulated = false;
372 
373  // Check the device cache and load it if necessary.
375 
376  // Return the PI device from cache that represents given native device.
377  // If not found, then nullptr is returned.
379 
380  // Keep track of all contexts in the platform. This is needed to manage
381  // a lifetime of memory allocations in each context when there are kernels
382  // with indirect access.
383  // TODO: should be deleted when memory isolation in the context is implemented
384  // in the driver.
385  std::list<pi_context> Contexts;
387 };
388 
389 // Implements memory allocation via L0 RT for USM allocator interface.
391 protected:
394  // Internal allocation routine which must be implemented for each allocation
395  // type
396  virtual pi_result allocateImpl(void **ResultPtr, size_t Size,
397  pi_uint32 Alignment) = 0;
398  virtual MemType getMemTypeImpl() = 0;
399 
400 public:
402  : Context{Ctx}, Device{Dev} {}
403  void *allocate(size_t Size) override final;
404  void *allocate(size_t Size, size_t Alignment) override final;
405  void deallocate(void *Ptr, bool OwnZeMemHandle) override final;
406  MemType getMemType() override final;
407 };
408 
409 // Allocation routines for shared memory type
411 protected:
412  pi_result allocateImpl(void **ResultPtr, size_t Size,
413  pi_uint32 Alignment) override;
414  MemType getMemTypeImpl() override;
415 
416 public:
418  : USMMemoryAllocBase(Ctx, Dev) {}
419 };
420 
421 // Allocation routines for shared memory type that is only modified from host.
423 protected:
424  pi_result allocateImpl(void **ResultPtr, size_t Size,
425  pi_uint32 Alignment) override;
427 
428 public:
430  : USMMemoryAllocBase(Ctx, Dev) {}
431 };
432 
433 // Allocation routines for device memory type
435 protected:
436  pi_result allocateImpl(void **ResultPtr, size_t Size,
437  pi_uint32 Alignment) override;
438  MemType getMemTypeImpl() override;
439 
440 public:
442  : USMMemoryAllocBase(Ctx, Dev) {}
443 };
444 
445 // Allocation routines for host memory type
447 protected:
448  pi_result allocateImpl(void **ResultPtr, size_t Size,
449  pi_uint32 Alignment) override;
450  MemType getMemTypeImpl() override;
451 
452 public:
454 };
455 
456 struct _pi_device : _pi_object {
458  pi_device ParentDevice = nullptr)
459  : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice},
460  ZeDeviceProperties{}, ZeDeviceComputeProperties{} {
461  // NOTE: one must additionally call initialize() to complete
462  // PI device creation.
463  }
464 
465  // The helper structure that keeps info about a command queue groups of the
466  // device. It is not changed after it is initialized.
468  typedef enum {
472  Size // must be last
473  } type;
474 
475  // Keep the ordinal of the commands group as returned by
476  // zeDeviceGetCommandQueueGroupProperties. A value of "-1" means that
477  // there is no such queue group available in the Level Zero runtime.
478  int32_t ZeOrdinal{-1};
479 
480  // Keep the index of the specific queue in this queue group where
481  // all the command enqueues of the corresponding type should go to.
482  // The value of "-1" means that no hard binding is defined and
483  // implementation can choose specific queue index on its own.
484  int32_t ZeIndex{-1};
485 
486  // Keeps the queue group properties.
488  };
489 
490  std::vector<queue_group_info_t> QueueGroup =
491  std::vector<queue_group_info_t>(queue_group_info_t::Size);
492 
493  // This returns "true" if a main copy engine is available for use.
494  bool hasMainCopyEngine() const {
495  return QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal >= 0;
496  }
497 
498  // This returns "true" if a link copy engine is available for use.
499  bool hasLinkCopyEngine() const {
500  return QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal >= 0;
501  }
502 
503  // This returns "true" if a main or link copy engine is available for use.
504  bool hasCopyEngine() const {
505  return hasMainCopyEngine() || hasLinkCopyEngine();
506  }
507 
508  // Initialize the entire PI device.
509  // Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute
510  // command queue ordinal and index respectively, used to initialize
511  // sub-sub-devices.
512  pi_result initialize(int SubSubDeviceOrdinal = -1,
513  int SubSubDeviceIndex = -1);
514 
515  // Level Zero device handle.
516  // This field is only set at _pi_device creation time, and cannot change.
517  // Therefore it can be accessed without holding a lock on this _pi_device.
519 
520  // Keep the subdevices that are partitioned from this pi_device for reuse
521  // The order of sub-devices in this vector is repeated from the
522  // ze_device_handle_t array that are returned from zeDeviceGetSubDevices()
523  // call, which will always return sub-devices in the fixed same order.
524  std::vector<pi_device> SubDevices;
525 
526  // PI platform to which this device belongs.
527  // This field is only set at _pi_device creation time, and cannot change.
528  // Therefore it can be accessed without holding a lock on this _pi_device.
530 
531  // Root-device of a sub-device, null if this is not a sub-device.
532  // This field is only set at _pi_device creation time, and cannot change.
533  // Therefore it can be accessed without holding a lock on this _pi_device.
535 
536  bool isSubDevice() { return RootDevice != nullptr; }
537 
538  // Cache of the immutable device properties.
548 };
549 
550 // Structure describing the specific use of a command-list in a queue.
551 // This is because command-lists are re-used across multiple queues
552 // in the same context.
554  // The Level-Zero fence that will be signalled at completion.
555  // Immediate commandlists do not have an associated fence.
556  // A nullptr for the fence indicates that this is an immediate commandlist.
557  ze_fence_handle_t ZeFence{nullptr};
558  // Record if the fence is in use.
559  // This is needed to avoid leak of the tracked command-list if the fence
560  // was not yet signaled at the time all events in that list were already
561  // completed (we are polling the fence at events completion). The fence
562  // may be still "in-use" due to sporadic delay in HW.
563  bool ZeFenceInUse{false};
564 
565  // Record the queue to which the command list will be submitted.
566  ze_command_queue_handle_t ZeQueue{nullptr};
567  // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr
568  uint32_t ZeQueueGroupOrdinal{0};
569  // Helper functions to tell if this is a copy command-list.
570  bool isCopy(pi_queue Queue) const;
571 
572  // Keeps events created by commands submitted into this command-list.
573  // TODO: use this for explicit wait/cleanup of events at command-list
574  // completion.
575  // TODO: use this for optimizing events in the same command-list, e.g.
576  // only have last one visible to the host.
577  std::vector<pi_event> EventList{};
578  size_t size() const { return EventList.size(); }
579  void append(pi_event Event) { EventList.push_back(Event); }
580 };
581 
582 // The map type that would track all command-lists in a queue.
583 typedef std::unordered_map<ze_command_list_handle_t, pi_command_list_info_t>
585 // The iterator pointing to a specific command-list in use.
586 typedef pi_command_list_map_t::iterator pi_command_list_ptr_t;
587 
588 struct _pi_context : _pi_object {
590  const pi_device *Devs, bool OwnZeContext)
591  : ZeContext{ZeContext},
592  OwnZeContext{OwnZeContext}, Devices{Devs, Devs + NumDevices},
593  SingleRootDevice(getRootDevice()), ZeCommandListInit{nullptr} {
594  // NOTE: one must additionally call initialize() to complete
595  // PI context creation.
596 
597  // Create USM allocator context for each pair (device, context).
598  for (uint32_t I = 0; I < NumDevices; I++) {
599  pi_device Device = Devs[I];
600  SharedMemAllocContexts.emplace(
601  std::piecewise_construct, std::make_tuple(Device),
602  std::make_tuple(std::unique_ptr<SystemMemory>(
603  new USMSharedMemoryAlloc(this, Device))));
604  SharedReadOnlyMemAllocContexts.emplace(
605  std::piecewise_construct, std::make_tuple(Device),
606  std::make_tuple(std::unique_ptr<SystemMemory>(
607  new USMSharedReadOnlyMemoryAlloc(this, Device))));
608  DeviceMemAllocContexts.emplace(
609  std::piecewise_construct, std::make_tuple(Device),
610  std::make_tuple(std::unique_ptr<SystemMemory>(
611  new USMDeviceMemoryAlloc(this, Device))));
612  }
613  // Create USM allocator context for host. Device and Shared USM allocations
614  // are device-specific. Host allocations are not device-dependent therefore
615  // we don't need a map with device as key.
616  HostMemAllocContext = std::make_unique<USMAllocContext>(
617  std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
618 
619  // We may allocate memory to this root device so create allocators.
620  if (SingleRootDevice && DeviceMemAllocContexts.find(SingleRootDevice) ==
621  DeviceMemAllocContexts.end()) {
622  SharedMemAllocContexts.emplace(
623  std::piecewise_construct, std::make_tuple(SingleRootDevice),
624  std::make_tuple(std::unique_ptr<SystemMemory>(
625  new USMSharedMemoryAlloc(this, SingleRootDevice))));
626  DeviceMemAllocContexts.emplace(
627  std::piecewise_construct, std::make_tuple(SingleRootDevice),
628  std::make_tuple(std::unique_ptr<SystemMemory>(
629  new USMDeviceMemoryAlloc(this, SingleRootDevice))));
630  }
631  }
632 
633  // Initialize the PI context.
635 
636  // Finalize the PI context
637  pi_result finalize();
638 
639  // Return the Platform, which is the same for all devices in the context
640  pi_platform getPlatform() const { return Devices[0]->Platform; }
641 
642  // A L0 context handle is primarily used during creation and management of
643  // resources that may be used by multiple devices.
644  // This field is only set at _pi_context creation time, and cannot change.
645  // Therefore it can be accessed without holding a lock on this _pi_context.
647 
648  // Indicates if we own the ZeContext or it came from interop that
649  // asked to not transfer the ownership to SYCL RT.
651 
652  // Keep the PI devices this PI context was created for.
653  // This field is only set at _pi_context creation time, and cannot change.
654  // Therefore it can be accessed without holding a lock on this _pi_context.
655  const std::vector<pi_device> Devices;
656 
657  // If context contains one device or sub-devices of the same device, we want
658  // to save this device.
659  // This field is only set at _pi_context creation time, and cannot change.
660  // Therefore it can be accessed without holding a lock on this _pi_context.
661  const pi_device SingleRootDevice = nullptr;
662 
663  // Immediate Level Zero command list for the device in this context, to be
664  // used for initializations. To be created as:
665  // - Immediate command list: So any command appended to it is immediately
666  // offloaded to the device.
667  // - Synchronous: So implicit synchronization is made inside the level-zero
668  // driver.
669  // There will be a list of immediate command lists (for each device) when
670  // support of the multiple devices per context will be added.
671  ze_command_list_handle_t ZeCommandListInit;
672 
673  // Mutex for the immediate command list. Per the Level Zero spec memory copy
674  // operations submitted to an immediate command list are not allowed to be
675  // called from simultaneous threads.
677 
678  // Mutex Lock for the Command List Cache. This lock is used to control both
679  // compute and copy command list caches.
681  // Cache of all currently available/completed command/copy lists.
682  // Note that command-list can only be re-used on the same device.
683  //
684  // TODO: explore if we should use root-device for creating command-lists
685  // as spec says that in that case any sub-device can re-use it: "The
686  // application must only use the command list for the device, or its
687  // sub-devices, which was provided during creation."
688  //
689  std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
691  std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
693 
694  // Retrieves a command list for executing on this device along with
695  // a fence to be used in tracking the execution of this command list.
696  // If a command list has been created on this device which has
697  // completed its commands, then that command list and its associated fence
698  // will be reused. Otherwise, a new command list and fence will be created for
699  // running on this device. L0 fences are created on a L0 command queue so the
700  // caller must pass a command queue to create a new fence for the new command
701  // list if a command list/fence pair is not available. All Command Lists &
702  // associated fences are destroyed at Device Release.
703  // If UseCopyEngine is true, the command will eventually be executed in a
704  // copy engine. Otherwise, the command will be executed in a compute engine.
705  // If AllowBatching is true, then the command list returned may already have
706  // command in it, if AllowBatching is false, any open command lists that
707  // already exist in Queue will be closed and executed.
708  // When using immediate commandlists, retrieves an immediate command list
709  // for executing on this device. Immediate commandlists are created only
710  // once for each SYCL Queue and after that they are reused.
711  pi_result getAvailableCommandList(pi_queue Queue,
712  pi_command_list_ptr_t &CommandList,
713  bool UseCopyEngine,
714  bool AllowBatching = false);
715 
716  // Get index of the free slot in the available pool. If there is no available
717  // pool then create new one. The HostVisible parameter tells if we need a
718  // slot for a host-visible event. The ProfilingEnabled tells is we need a
719  // slot for an event with profiling capabilities.
720  pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &,
721  bool HostVisible,
722  bool ProfilingEnabled);
723 
724  // Decrement number of events living in the pool upon event destroy
725  // and return the pool to the cache if there are no unreleased events.
726  pi_result decrementUnreleasedEventsInPool(pi_event Event);
727 
728  // Store USM allocator context(internal allocator structures)
729  // for USM shared and device allocations. There is 1 allocator context
730  // per each pair of (context, device) per each memory type.
731  std::unordered_map<pi_device, USMAllocContext> DeviceMemAllocContexts;
732  std::unordered_map<pi_device, USMAllocContext> SharedMemAllocContexts;
733  std::unordered_map<pi_device, USMAllocContext> SharedReadOnlyMemAllocContexts;
734 
735  // Since L0 native runtime does not distinguisg "shared device_read_only"
736  // vs regular "shared" allocations, we have keep track of it to use
737  // proper USMAllocContext when freeing allocations.
738  std::unordered_set<void *> SharedReadOnlyAllocs;
739 
740  // Store the host allocator context. It does not depend on any device.
741  std::unique_ptr<USMAllocContext> HostMemAllocContext;
742 
743  // We need to store all memory allocations in the context because there could
744  // be kernels with indirect access. Kernels with indirect access start to
745  // reference all existing memory allocations at the time when they are
746  // submitted to the device. Referenced memory allocations can be released only
747  // when kernel has finished execution.
748  std::unordered_map<void *, MemAllocRecord> MemAllocs;
749 
750 private:
751  // If context contains one device then return this device.
752  // If context contains sub-devices of the same device, then return this parent
753  // device. Return nullptr if context consists of several devices which are not
754  // sub-devices of the same device. We call returned device the root device of
755  // a context.
756  // TODO: get rid of this when contexts with multiple devices are supported for
757  // images.
758  pi_device getRootDevice() const;
759 
760  // Following member variables are used to manage assignment of events
761  // to event pools.
762  //
763  // TODO: Create pi_event_pool class to encapsulate working with pools.
764  // This will avoid needing the use of maps below, and cleanup the
765  // pi_context overall.
766  //
767 
768  // The cache of event pools from where new events are allocated from.
769  // The head event pool is where the next event would be added to if there
770  // is still some room there. If there is no room in the head then
771  // the following event pool is taken (guranteed to be empty) and made the
772  // head. In case there is no next pool, a new pool is created and made the
773  // head.
774  //
775  // Cache of event pools to which host-visible events are added to.
776  std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{4};
777  auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) {
778  if (HostVisible)
779  return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1];
780  else
781  return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3];
782  }
783 
784  // This map will be used to determine if a pool is full or not
785  // by storing number of empty slots available in the pool.
786  std::unordered_map<ze_event_pool_handle_t, pi_uint32>
787  NumEventsAvailableInEventPool;
788  // This map will be used to determine number of unreleased events in the pool.
789  // We use separate maps for number of event slots available in the pool from
790  // the number of events unreleased in the pool.
791  // This will help when we try to make the code thread-safe.
792  std::unordered_map<ze_event_pool_handle_t, pi_uint32>
793  NumEventsUnreleasedInEventPool;
794 
795  // Mutex to control operations on event pool caches and the helper maps
796  // holding the current pool usage counts.
797  pi_mutex ZeEventPoolCacheMutex;
798 };
799 
800 struct _pi_queue : _pi_object {
801  _pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
802  std::vector<ze_command_queue_handle_t> &CopyQueues,
803  pi_context Context, pi_device Device, bool OwnZeCommandQueue,
804  pi_queue_properties Properties = 0);
805 
807 
808  // PI queue is in general a one to many mapping to L0 native queues.
811  pi_queue_group_t() = delete;
812 
813  // The Queue argument captures the enclosing PI queue.
814  // The Type argument specifies the type of this queue group.
815  // The actual ZeQueues are populated at PI queue construction.
817  : Queue(Queue), Type(Type) {}
818 
819  // The type of the queue group.
821  bool isCopy() const { return Type != queue_type::Compute; }
822 
823  // Level Zero command queue handles.
824  std::vector<ze_command_queue_handle_t> ZeQueues;
825 
826  // Immediate commandlist handles, one per Level Zero command queue handle.
827  // These are created only once, along with the L0 queues (see above)
828  // and reused thereafter.
829  std::vector<pi_command_list_ptr_t> ImmCmdLists;
830 
831  // Return the index of the next queue to use based on a
832  // round robin strategy and the queue group ordinal.
833  uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex);
834 
835  // This function will return one of possibly multiple available native
836  // queues and the value of the queue group ordinal.
837  ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal);
838 
839  // This function returns the next immediate commandlist to use.
840  pi_command_list_ptr_t &getImmCmdList();
841 
842  // These indices are to filter specific range of the queues to use,
843  // and to organize round-robin across them.
844  uint32_t UpperIndex{0};
845  uint32_t LowerIndex{0};
846  uint32_t NextIndex{0};
847  };
848 
849  pi_queue_group_t ComputeQueueGroup{this, queue_type::Compute};
850 
851  // Vector of Level Zero copy command command queue handles.
852  // In this vector, main copy engine, if available, come first followed by
853  // link copy engines, if available.
854  pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy};
855 
856  // Wait for all commandlists associated with this Queue to finish operations.
857  pi_result synchronize();
858 
859  pi_queue_group_t &getQueueGroup(bool UseCopyEngine) {
860  return UseCopyEngine ? CopyQueueGroup : ComputeQueueGroup;
861  }
862 
863  // This function considers multiple factors including copy engine
864  // availability and user preference and returns a boolean that is used to
865  // specify if copy engine will eventually be used for a particular command.
866  bool useCopyEngine(bool PreferCopyEngine = true) const;
867 
868  // Keeps the PI context to which this queue belongs.
869  // This field is only set at _pi_queue creation time, and cannot change.
870  // Therefore it can be accessed without holding a lock on this _pi_queue.
872 
873  // Keeps the PI device to which this queue belongs.
874  // This field is only set at _pi_queue creation time, and cannot change.
875  // Therefore it can be accessed without holding a lock on this _pi_queue.
877 
878  // Keeps track of the event associated with the last enqueued command into
879  // this queue. this is used to add dependency with the last command to add
880  // in-order semantics and updated with the latest event each time a new
881  // command is enqueued.
882  pi_event LastCommandEvent = nullptr;
883 
884  // Kernel is not necessarily submitted for execution during
885  // piEnqueueKernelLaunch, it may be batched. That's why we need to save the
886  // list of kernels which is going to be submitted but have not been submitted
887  // yet. This is needed to capture memory allocations for each kernel with
888  // indirect access in the list at the moment when kernel is really submitted
889  // for execution.
890  std::vector<pi_kernel> KernelsToBeSubmitted;
891 
892  // Update map of memory references made by the kernels about to be submitted
893  void CaptureIndirectAccesses();
894 
895  // Indicates if we own the ZeCommandQueue or it came from interop that
896  // asked to not transfer the ownership to SYCL RT.
898 
899  // Map of all command lists used in this queue.
901 
902  // Helper data structure to hold all variables related to batching
903  typedef struct CommandBatch {
904  // These two members are used to keep track of how often the
905  // batching closes and executes a command list before reaching the
906  // QueueComputeBatchSize limit, versus how often we reach the limit.
907  // This info might be used to vary the QueueComputeBatchSize value.
908  pi_uint32 NumTimesClosedEarly = {0};
909  pi_uint32 NumTimesClosedFull = {0};
910 
911  // Open command list fields for batching commands into this queue.
912  pi_command_list_ptr_t OpenCommandList{};
913 
914  // Approximate number of commands that are allowed to be batched for
915  // this queue.
916  // Added this member to the queue rather than using a global variable
917  // so that future implementation could use heuristics to change this on
918  // a queue specific basis. And by putting it in the queue itself, this
919  // is thread safe because of the locking of the queue that occurs.
920  pi_uint32 QueueBatchSize = {0};
921  } command_batch;
922 
923  // ComputeCommandBatch holds data related to batching of non-copy commands.
924  // CopyCommandBatch holds data related to batching of copy commands.
925  command_batch ComputeCommandBatch, CopyCommandBatch;
926 
927  // Returns true if any commands for this queue are allowed to
928  // be batched together.
929  // For copy commands, IsCopy is set to 'true'.
930  // For non-copy commands, IsCopy is set to 'false'.
931  bool isBatchingAllowed(bool IsCopy) const;
932 
933  // Keeps the properties of this queue.
935 
936  // Returns true if the queue is a in-order queue.
937  bool isInOrderQueue() const;
938 
939  // adjust the queue's batch size, knowing that the current command list
940  // is being closed with a full batch.
941  // For copy commands, IsCopy is set to 'true'.
942  // For non-copy commands, IsCopy is set to 'false'.
943  void adjustBatchSizeForFullBatch(bool IsCopy);
944 
945  // adjust the queue's batch size, knowing that the current command list
946  // is being closed with only a partial batch of commands.
947  // For copy commands, IsCopy is set to 'true'.
948  // For non-copy commands, IsCopy is set to 'false'.
949  void adjustBatchSizeForPartialBatch(bool IsCopy);
950 
951  // Resets the Command List and Associated fence in the ZeCommandListFenceMap.
952  // If the reset command list should be made available, then MakeAvailable
953  // needs to be set to true. The caller must verify that this command list and
954  // fence have been signalled. The EventListToCleanup contains a list of events
955  // from the command list which need to be cleaned up.
956  pi_result resetCommandList(pi_command_list_ptr_t CommandList,
957  bool MakeAvailable,
958  std::vector<_pi_event *> &EventListToCleanup);
959 
960  // Returns true if an OpenCommandList has commands that need to be submitted.
961  // If IsCopy is 'true', then the OpenCommandList containing copy commands is
962  // checked. Otherwise, the OpenCommandList containing compute commands is
963  // checked.
964  bool hasOpenCommandList(bool IsCopy) const {
965  auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch;
966  return CommandBatch.OpenCommandList != CommandListMap.end();
967  }
968  // Attach a command list to this queue.
969  // For non-immediate commandlist also close and execute it.
970  // Note that this command list cannot be appended to after this.
971  // The "IsBlocking" tells if the wait for completion is required.
972  // If OKToBatchCommand is true, then this command list may be executed
973  // immediately, or it may be left open for other future command to be
974  // batched into.
975  // If IsBlocking is true, then batching will not be allowed regardless
976  // of the value of OKToBatchCommand
977  //
978  // For immediate commandlists, no close and execute is necessary.
979  pi_result executeCommandList(pi_command_list_ptr_t CommandList,
980  bool IsBlocking = false,
981  bool OKToBatchCommand = false);
982 
983  // If there is an open command list associated with this queue,
984  // close it, execute it, and reset the corresponding OpenCommandList.
985  // If IsCopy is 'true', then the OpenCommandList containing copy commands is
986  // executed. Otherwise OpenCommandList containing compute commands is
987  // executed.
988  pi_result executeOpenCommandList(bool IsCopy);
989 
990  // Gets the open command containing the event, or CommandListMap.end()
991  pi_command_list_ptr_t eventOpenCommandList(pi_event Event);
992 
993  // Wrapper function to execute both OpenCommandLists (Copy and Compute).
994  // This wrapper is helpful when all 'open' commands need to be executed.
995  // Call-sites instances: piQuueueFinish, piQueueRelease, etc.
997  using IsCopy = bool;
998  if (auto Res = executeOpenCommandList(IsCopy{false}))
999  return Res;
1000  if (auto Res = executeOpenCommandList(IsCopy{true}))
1001  return Res;
1002  return PI_SUCCESS;
1003  }
1004 
1005  // Besides each PI object keeping a total reference count in
1006  // _pi_object::RefCount we keep special track of the queue *external*
1007  // references. This way we are able to tell when the queue is being finished
1008  // externally, and can wait for internal references to complete, and do proper
1009  // cleanup of the queue.
1010  // This counter doesn't track the lifetime of a queue object, it only tracks
1011  // the number of external references. I.e. even if it reaches zero a queue
1012  // object may not be destroyed and can be used internally in the plugin.
1013  // That's why we intentionally don't use atomic type for this counter to
1014  // enforce guarding with a mutex all the work involving this counter.
1015  pi_uint32 RefCountExternal{1};
1016 
1017  // Indicates that the queue is healthy and all operations on it are OK.
1018  bool Healthy{true};
1019 };
1020 
1021 struct _pi_mem : _pi_object {
1022  // Keeps the PI context of this memory handle.
1024 
1025  // Enumerates all possible types of accesses.
1026  enum access_mode_t { unknown, read_write, read_only, write_only };
1027 
1028  // Interface of the _pi_mem object
1029 
1030  // Get the Level Zero handle of the current memory object
1031  virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t,
1032  pi_device Device = nullptr) = 0;
1033 
1034  // Get a pointer to the Level Zero handle of the current memory object
1035  virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
1036  pi_device Device = nullptr) = 0;
1037 
1038  // Method to get type of the derived object (image or buffer)
1039  virtual bool isImage() const = 0;
1040 
1041  virtual ~_pi_mem() = default;
1042 
1043 protected:
1044  _pi_mem(pi_context Ctx) : Context{Ctx} {}
1045 };
1046 
1047 struct _pi_buffer;
1049 
1050 struct _pi_buffer final : _pi_mem {
1051  // Buffer constructor
1052  _pi_buffer(pi_context Context, size_t Size, char *HostPtr,
1053  bool ImportedHostPtr = false)
1054  : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} {
1055 
1056  // We treat integrated devices (physical memory shared with the CPU)
1057  // differently from discrete devices (those with distinct memories).
1058  // For integrated devices, allocating the buffer in the host memory
1059  // enables automatic access from the device, and makes copying
1060  // unnecessary in the map/unmap operations. This improves performance.
1061  OnHost = Context->Devices.size() == 1 &&
1062  Context->Devices[0]->ZeDeviceProperties->flags &
1063  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1064 
1065  // Fill the host allocation data.
1066  if (HostPtr) {
1067  MapHostPtr = HostPtr;
1068  // If this host ptr is imported to USM then use this as a host
1069  // allocation for this buffer.
1070  if (ImportedHostPtr) {
1071  Allocations[nullptr].ZeHandle = HostPtr;
1072  Allocations[nullptr].Valid = true;
1073  Allocations[nullptr].ReleaseAction = _pi_buffer::allocation_t::unimport;
1074  }
1075  }
1076 
1077  // Make first device in the context be the master. Mark that
1078  // allocation (yet to be made) having "valid" data. And real
1079  // allocation and initialization should follow the buffer
1080  // construction with a "write_only" access copy.
1081  LastDeviceWithValidAllocation = Context->Devices[0];
1082  Allocations[LastDeviceWithValidAllocation].Valid = true;
1083  }
1084 
1085  // Sub-buffer constructor
1086  _pi_buffer(pi_buffer Parent, size_t Origin, size_t Size)
1087  : _pi_mem(Parent->Context), Size(Size), SubBuffer{Parent, Origin} {}
1088 
1089  // Interop-buffer constructor
1091  char *ZeMemHandle, bool OwnZeMemHandle)
1092  : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} {
1093 
1094  // Device == nullptr means host allocation
1095  Allocations[Device].ZeHandle = ZeMemHandle;
1096  Allocations[Device].Valid = true;
1097  Allocations[Device].ReleaseAction =
1098  OwnZeMemHandle ? allocation_t::free_native : allocation_t::keep;
1099 
1100  // Check if this buffer can always stay on host
1101  OnHost = false;
1102  if (!Device) { // Host allocation
1103  if (Context->Devices.size() == 1 &&
1104  Context->Devices[0]->ZeDeviceProperties->flags &
1105  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) {
1106  OnHost = true;
1107  MapHostPtr = ZeMemHandle; // map to this allocation
1108  }
1109  }
1110  LastDeviceWithValidAllocation = Device;
1111  }
1112 
1113  // Returns a pointer to the USM allocation representing this PI buffer
1114  // on the specified Device. If Device is nullptr then the returned
1115  // USM allocation is on the device where this buffer was used the latest.
1116  // The returned allocation is always valid, i.e. its contents is
1117  // up-to-date and any data copies needed for that are performed under
1118  // the hood.
1119  //
1120  virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t,
1121  pi_device Device = nullptr) override;
1122  virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
1123  pi_device Device = nullptr) override;
1124 
1125  bool isImage() const override { return false; }
1126 
1127  bool isSubBuffer() const { return SubBuffer.Parent != nullptr; }
1128 
1129  // Frees all allocations made for the buffer.
1130  pi_result free();
1131 
1132  // Information about a single allocation representing this buffer.
1133  struct allocation_t {
1134  // Level Zero memory handle is really just a naked pointer.
1135  // It is just convenient to have it char * to simplify offset arithmetics.
1136  char *ZeHandle{nullptr};
1137  // Indicates if this allocation's data is valid.
1138  bool Valid{false};
1139  // Specifies the action that needs to be taken for this
1140  // allocation at buffer destruction.
1141  enum {
1142  keep, // do nothing, the allocation is not owned by us
1143  unimport, // release of the imported allocation
1144  free, // free from the pooling context (default)
1145  free_native // free with a native call
1146  } ReleaseAction{free};
1147  };
1148 
1149  // We maintain multiple allocations on possibly all devices in the context.
1150  // The "nullptr" device identifies a host allocation representing buffer.
1151  // Sub-buffers don't maintain own allocations but rely on parent buffer.
1152  std::unordered_map<pi_device, allocation_t> Allocations;
1153  pi_device LastDeviceWithValidAllocation{nullptr};
1154 
1155  // Flag to indicate that this memory is allocated in host memory.
1156  // Integrated device accesses this memory.
1157  bool OnHost{false};
1158 
1159  // Tells the host allocation to use for buffer map operations.
1160  char *MapHostPtr{nullptr};
1161 
1162  // Supplementary data to keep track of the mappings of this buffer
1163  // created with piEnqueueMemBufferMap.
1164  struct Mapping {
1165  // The offset in the buffer giving the start of the mapped region.
1166  size_t Offset;
1167  // The size of the mapped region.
1168  size_t Size;
1169  };
1170 
1171  // The key is the host pointer representing an active mapping.
1172  // The value is the information needed to maintain/undo the mapping.
1173  std::unordered_map<void *, Mapping> Mappings;
1174 
1175  // The size and alignment of the buffer
1176  size_t Size;
1177  size_t getAlignment() const;
1178 
1179  struct {
1181  size_t Origin; // only valid if Parent != nullptr
1182  } SubBuffer;
1183 };
1184 
1185 // TODO: add proper support for images on context with multiple devices.
1186 struct _pi_image final : _pi_mem {
1187  // Image constructor
1189  : _pi_mem(Ctx), ZeImage{Image} {}
1190 
1191  virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t,
1192  pi_device = nullptr) override {
1193  ZeHandle = pi_cast<char *>(ZeImage);
1194  return PI_SUCCESS;
1195  }
1196  virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
1197  pi_device = nullptr) override {
1198  ZeHandlePtr = pi_cast<char **>(&ZeImage);
1199  return PI_SUCCESS;
1200  }
1201 
1202  bool isImage() const override { return true; }
1203 
1204 #ifndef NDEBUG
1205  // Keep the descriptor of the image (for debugging purposes)
1207 #endif // !NDEBUG
1208 
1209  // Level Zero image handle.
1211 };
1212 
1214  // List of level zero events for this event list.
1215  ze_event_handle_t *ZeEventList = {nullptr};
1216 
1217  // List of pi_events for this event list.
1218  pi_event *PiEventList = {nullptr};
1219 
1220  // length of both the lists. The actual allocation of these lists
1221  // may be longer than this length. This length is the actual number
1222  // of elements in the above arrays that are valid.
1223  pi_uint32 Length = {0};
1224 
1225  // A mutex is needed for destroying the event list.
1226  // Creation is already thread-safe because we only create the list
1227  // when an event is initially created. However, it might be
1228  // possible to have multiple threads racing to destroy the list,
1229  // so this will be used to make list destruction thread-safe.
1231 
1232  // Initialize this using the array of events in EventList, and retain
1233  // all the pi_events in the created data structure.
1234  // CurQueue is the pi_queue that the command with this event wait
1235  // list is going to be added to. That is needed to flush command
1236  // batches for wait events that are in other queues.
1237  // UseCopyEngine indicates if the next command (the one that this
1238  // event wait-list is for) is going to go to copy or compute
1239  // queue. This is used to properly submit the dependent open
1240  // command-lists.
1241  pi_result createAndRetainPiZeEventList(pi_uint32 EventListLength,
1242  const pi_event *EventList,
1243  pi_queue CurQueue, bool UseCopyEngine);
1244 
1245  // Add all the events in this object's PiEventList to the end
1246  // of the list EventsToBeReleased. Destroy pi_ze_event_list_t data
1247  // structure fields making it look empty.
1248  pi_result collectEventsForReleaseAndDestroyPiZeEventList(
1249  std::list<pi_event> &EventsToBeReleased);
1250 
1251  // Had to create custom assignment operator because the mutex is
1252  // not assignment copyable. Just field by field copy of the other
1253  // fields.
1255  this->ZeEventList = other.ZeEventList;
1256  this->PiEventList = other.PiEventList;
1257  this->Length = other.Length;
1258  return *this;
1259  }
1260 };
1261 
1262 struct _pi_event : _pi_object {
1263  _pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool,
1264  pi_context Context, pi_command_type CommandType, bool OwnZeEvent)
1265  : ZeEvent{ZeEvent}, OwnZeEvent{OwnZeEvent}, ZeEventPool{ZeEventPool},
1266  CommandType{CommandType}, Context{Context}, CommandData{nullptr} {}
1267 
1268  // Level Zero event handle.
1270 
1271  // Indicates if we own the ZeEvent or it came from interop that
1272  // asked to not transfer the ownership to SYCL RT.
1274 
1275  // Level Zero event pool handle.
1276  ze_event_pool_handle_t ZeEventPool;
1277 
1278  // In case we use device-only events this holds their host-visible
1279  // counterpart. If this event is itself host-visble then HostVisibleEvent
1280  // points to this event. If this event is not host-visible then this field can
1281  // be: 1) null, meaning that a host-visible event wasn't yet created 2) a PI
1282  // event created internally that host will actually be redirected
1283  // to wait/query instead of this PI event.
1284  //
1285  // The HostVisibleEvent is a reference counted PI event and can be used more
1286  // than by just this one event, depending on the mode (see EventsScope).
1287  //
1288  pi_event HostVisibleEvent = {nullptr};
1289  bool isHostVisible() const { return this == HostVisibleEvent; }
1290 
1291  // Get the host-visible event or create one and enqueue its signal.
1292  pi_result getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
1293 
1294  // Tells if this event is with profiling capabilities.
1295  bool isProfilingEnabled() const {
1296  return !Queue || // tentatively assume user events are profiling enabled
1297  (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0;
1298  }
1299 
1300  // Keeps the command-queue and command associated with the event.
1301  // These are NULL for the user events.
1302  pi_queue Queue = {nullptr};
1304  // Provide direct access to Context, instead of going via queue.
1305  // Not every PI event has a queue, and we need a handle to Context
1306  // to get to event pool related information.
1308 
1309  // Opaque data to hold any data needed for CommandType.
1311 
1312  // List of events that were in the wait list of the command that will
1313  // signal this event. These events must be retained when the command is
1314  // enqueued, and must then be released when this event has signalled.
1315  // This list must be destroyed once the event has signalled.
1317 
1318  // Tracks if the needed cleanup was already performed for
1319  // a completed event. This allows to control that some cleanup
1320  // actions are performed only once.
1321  //
1322  bool CleanedUp = {false};
1323 
1324  // Indicates that this PI event had already completed in the sense
1325  // that no other synchromization is needed. Note that the underlying
1326  // L0 event (if any) is not guranteed to have been signalled, or
1327  // being visible to the host at all.
1328  bool Completed = {false};
1329 };
1330 
1331 struct _pi_program : _pi_object {
1332  // Possible states of a program.
1333  typedef enum {
1334  // The program has been created from intermediate language (SPIR-V), but it
1335  // is not yet compiled.
1337 
1338  // The program has been created by loading native code, but it has not yet
1339  // been built. This is equivalent to an OpenCL "program executable" that
1340  // is loaded via clCreateProgramWithBinary().
1342 
1343  // The program was notionally compiled from SPIR-V form. However, since we
1344  // postpone compilation until the module is linked, the internal state
1345  // still represents the module as SPIR-V.
1347 
1348  // The program has been built or linked, and it is represented as a Level
1349  // Zero module.
1351 
1352  // An error occurred during piProgramLink, but we created a _pi_program
1353  // object anyways in order to hold the ZeBuildLog. Note that the ZeModule
1354  // may or may not be nullptr in this state, depending on the error.
1355  Invalid
1356  } state;
1357 
1358  // A utility class that converts specialization constants into the form
1359  // required by the Level Zero driver.
1361  public:
1363  ZeSpecConstants.numConstants = Program->SpecConstants.size();
1364  ZeSpecContantsIds.reserve(ZeSpecConstants.numConstants);
1365  ZeSpecContantsValues.reserve(ZeSpecConstants.numConstants);
1366 
1367  for (auto &SpecConstant : Program->SpecConstants) {
1368  ZeSpecContantsIds.push_back(SpecConstant.first);
1369  ZeSpecContantsValues.push_back(SpecConstant.second);
1370  }
1371  ZeSpecConstants.pConstantIds = ZeSpecContantsIds.data();
1372  ZeSpecConstants.pConstantValues = ZeSpecContantsValues.data();
1373  }
1374 
1375  const ze_module_constants_t *ze() { return &ZeSpecConstants; }
1376 
1377  private:
1378  std::vector<uint32_t> ZeSpecContantsIds;
1379  std::vector<const void *> ZeSpecContantsValues;
1380  ze_module_constants_t ZeSpecConstants;
1381  };
1382 
1383  // Construct a program in IL or Native state.
1384  _pi_program(state St, pi_context Context, const void *Input, size_t Length)
1385  : Context{Context},
1386  OwnZeModule{true}, State{St}, Code{new uint8_t[Length]},
1387  CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} {
1388  std::memcpy(Code.get(), Input, Length);
1389  }
1390 
1391  // Construct a program in Exe or Invalid state.
1393  ze_module_build_log_handle_t ZeBuildLog)
1394  : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule},
1395  ZeBuildLog{ZeBuildLog} {}
1396 
1397  // Construct a program in Exe state (interop).
1399  bool OwnZeModule)
1400  : Context{Context}, OwnZeModule{OwnZeModule}, State{St},
1401  ZeModule{ZeModule}, ZeBuildLog{nullptr} {}
1402 
1403  // Construct a program in Invalid state with a custom error message.
1405  : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage},
1406  State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {}
1407 
1408  ~_pi_program();
1409 
1410  const pi_context Context; // Context of the program.
1411 
1412  // Indicates if we own the ZeModule or it came from interop that
1413  // asked to not transfer the ownership to SYCL RT.
1414  const bool OwnZeModule;
1415 
1416  // This error message is used only in Invalid state to hold a custom error
1417  // message from a call to piProgramLink.
1418  const std::string ErrorMessage;
1419 
1421 
1422  // In IL and Object states, this contains the SPIR-V representation of the
1423  // module. In Native state, it contains the native code.
1424  std::unique_ptr<uint8_t[]> Code; // Array containing raw IL / native code.
1425  size_t CodeLength; // Size (bytes) of the array.
1426 
1427  // Used only in IL and Object states. Contains the SPIR-V specialization
1428  // constants as a map from the SPIR-V "SpecID" to a buffer that contains the
1429  // associated value. The caller of the PI layer is responsible for
1430  // maintaining the storage of this buffer.
1431  std::unordered_map<uint32_t, const void *> SpecConstants;
1432 
1433  // Used only in Object state. Contains the build flags from the last call to
1434  // piProgramCompile().
1435  std::string BuildFlags;
1436 
1437  // The Level Zero module handle. Used primarily in Exe state.
1439 
1440  // The Level Zero build log from the last call to zeModuleCreate().
1441  ze_module_build_log_handle_t ZeBuildLog;
1442 };
1443 
1444 struct _pi_kernel : _pi_object {
1445  _pi_kernel(ze_kernel_handle_t Kernel, bool OwnZeKernel, pi_program Program)
1446  : ZeKernel{Kernel}, OwnZeKernel{OwnZeKernel}, Program{Program},
1447  MemAllocs{}, SubmissionsCount{0} {}
1448 
1449  // Completed initialization of PI kernel. Must be called after construction.
1451 
1452  // Returns true if kernel has indirect access, false otherwise.
1454  // Currently indirect access flag is set for all kernels and there is no API
1455  // to check if kernel actually indirectly access smth.
1456  return true;
1457  }
1458 
1459  // Level Zero function handle.
1461 
1462  // Indicates if we own the ZeKernel or it came from interop that
1463  // asked to not transfer the ownership to SYCL RT.
1465 
1466  // Keep the program of the kernel.
1468 
1469  // Hash function object for the unordered_set below.
1470  struct Hash {
1471  size_t operator()(const std::pair<void *const, MemAllocRecord> *P) const {
1472  return std::hash<void *>()(P->first);
1473  }
1474  };
1475 
1476  // If kernel has indirect access we need to make a snapshot of all existing
1477  // memory allocations to defer deletion of these memory allocations to the
1478  // moment when kernel execution has finished.
1479  // We store pointers to the elements because pointers are not invalidated by
1480  // insert/delete for std::unordered_map (iterators are invalidated). We need
1481  // to take a snapshot instead of just reference-counting the allocations,
1482  // because picture of active allocations can change during kernel execution
1483  // (new allocations can be added) and we need to know which memory allocations
1484  // were retained by this kernel to release them (and don't touch new
1485  // allocations) at kernel completion. Same kernel may be submitted several
1486  // times and retained allocations may be different at each submission. That's
1487  // why we have a set of memory allocations here and increase ref count only
1488  // once even if kernel is submitted many times. We don't want to know how many
1489  // times and which allocations were retained by each submission. We release
1490  // all allocations in the set only when SubmissionsCount == 0.
1491  std::unordered_set<std::pair<void *const, MemAllocRecord> *, Hash> MemAllocs;
1492 
1493  // Counter to track the number of submissions of the kernel.
1494  // When this value is zero, it means that kernel is not submitted for an
1495  // execution - at this time we can release memory allocations referenced by
1496  // this kernel. We can do this when RefCount turns to 0 but it is too late
1497  // because kernels are cached in the context by SYCL RT and they are released
1498  // only during context object destruction. Regular RefCount is not usable to
1499  // track submissions because user/SYCL RT can retain kernel object any number
1500  // of times. And that's why there is no value of RefCount which can mean zero
1501  // submissions.
1502  std::atomic<pi_uint32> SubmissionsCount;
1503 
1504  // Keeps info about an argument to the kernel enough to set it with
1505  // zeKernelSetArgumentValue.
1506  struct ArgumentInfo {
1507  uint32_t Index;
1508  size_t Size;
1509  const pi_mem Value;
1511  };
1512  // Arguments that still need to be set (with zeKernelSetArgumentValue)
1513  // before kernel is enqueued.
1514  std::vector<ArgumentInfo> PendingArguments;
1515 
1516  // Cache of the kernel properties.
1519 };
1520 
1521 struct _pi_sampler : _pi_object {
1522  _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {}
1523 
1524  // Level Zero sampler handle.
1525  ze_sampler_handle_t ZeSampler;
1526 };
1527 
1528 #endif // PI_LEVEL_ZERO_HPP
USMDeviceMemoryAlloc::USMDeviceMemoryAlloc
USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:441
_pi_device::RootDevice
const pi_device RootDevice
Definition: pi_level_zero.hpp:534
_pi_object::RefCount
std::atomic< pi_uint32 > RefCount
Definition: pi_esimd_emulator.hpp:61
MemAllocRecord::Context
pi_context Context
Definition: pi_level_zero.hpp:341
_pi_event::isProfilingEnabled
bool isProfilingEnabled() const
Definition: pi_level_zero.hpp:1295
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:221
ze_image_handle_t
struct _ze_image_handle_t * ze_image_handle_t
Definition: backend_traits_level_zero.hpp:34
USMSharedReadOnlyMemoryAlloc::USMSharedReadOnlyMemoryAlloc
USMSharedReadOnlyMemoryAlloc(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:429
getZeStructureType< ze_module_desc_t >
ze_structure_type_t getZeStructureType< ze_module_desc_t >()
Definition: pi_level_zero.hpp:104
_pi_buffer::allocation_t::keep
@ keep
Definition: pi_level_zero.hpp:1142
_pi_device::queue_group_info_t::type
type
Definition: pi_level_zero.hpp:468
_pi_context::ZeCommandListInit
ze_command_list_handle_t ZeCommandListInit
Definition: pi_level_zero.hpp:671
_pi_device::_pi_device
_pi_device(ze_device_handle_t Device, pi_platform Plt, pi_device ParentDevice=nullptr)
Definition: pi_level_zero.hpp:457
_pi_context::DeviceMemAllocContexts
std::unordered_map< pi_device, USMAllocContext > DeviceMemAllocContexts
Definition: pi_level_zero.hpp:731
pi_shared_mutex::try_lock
bool try_lock()
Definition: pi_level_zero.hpp:202
pi.h
_pi_buffer::isImage
bool isImage() const override
Definition: pi_level_zero.hpp:1125
getZeStructureType< ze_device_memory_properties_t >
ze_structure_type_t getZeStructureType< ze_device_memory_properties_t >()
Definition: pi_level_zero.hpp:147
_pi_program::_pi_program
_pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, ze_module_build_log_handle_t ZeBuildLog)
Definition: pi_level_zero.hpp:1392
_pi_program::ZeBuildLog
ze_module_build_log_handle_t ZeBuildLog
Definition: pi_level_zero.hpp:1441
_pi_queue::pi_queue_group_t::isCopy
bool isCopy() const
Definition: pi_level_zero.hpp:821
_pi_buffer::Parent
_pi_mem * Parent
Definition: pi_level_zero.hpp:1180
die
void die(const char *Message)
Definition: pi_level_zero.hpp:64
_pi_event::CommandType
pi_command_type CommandType
Definition: pi_level_zero.hpp:1303
_pi_platform::_pi_platform
_pi_platform(ze_driver_handle_t Driver)
Definition: pi_level_zero.hpp:352
T
_pi_context::getPlatform
pi_platform getPlatform() const
Definition: pi_level_zero.hpp:640
cl::sycl::detail::make_tuple
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:36
ErrorMessage
thread_local char ErrorMessage[MaxMessageSize]
Definition: pi_esimd_emulator.cpp:153
ze_module_handle_t
struct _ze_module_handle_t * ze_module_handle_t
Definition: backend_traits_level_zero.hpp:36
_pi_platform::PiDevicesCacheMutex
pi_shared_mutex PiDevicesCacheMutex
Definition: pi_level_zero.hpp:370
_pi_program::Object
@ Object
Definition: pi_level_zero.hpp:1346
_pi_device::hasCopyEngine
bool hasCopyEngine() const
Definition: pi_level_zero.hpp:504
cl::sycl::detail::pi::initialize
std::vector< plugin > & initialize()
Definition: pi.cpp:374
_pi_image::ZeImageDesc
ZeStruct< ze_image_desc_t > ZeImageDesc
Definition: pi_level_zero.hpp:1206
_pi_event::ZeEventPool
ze_event_pool_handle_t ZeEventPool
Definition: pi_level_zero.hpp:1276
_pi_platform::initialize
pi_result initialize()
Definition: pi_level_zero.cpp:1936
USMSharedMemoryAlloc
Definition: pi_level_zero.hpp:410
_pi_kernel::ZeKernelName
ZeCache< std::string > ZeKernelName
Definition: pi_level_zero.hpp:1518
getZeStructureType< ze_module_properties_t >
ze_structure_type_t getZeStructureType< ze_module_properties_t >()
Definition: pi_level_zero.hpp:154
_pi_program::ErrorMessage
const std::string ErrorMessage
Definition: pi_level_zero.hpp:1418
_pi_buffer
Definition: pi_esimd_emulator.hpp:178
USMMemoryAllocBase::USMMemoryAllocBase
USMMemoryAllocBase(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:401
_pi_image
Definition: pi_esimd_emulator.hpp:187
_pi_image::ZeImage
ze_image_handle_t ZeImage
Definition: pi_level_zero.hpp:1210
_pi_buffer::isSubBuffer
bool isSubBuffer() const
Definition: pi_level_zero.hpp:1127
getZeStructureType
ze_structure_type_t getZeStructureType()
SystemMemory
Definition: usm_allocator.hpp:17
getZeStructureType< ze_context_desc_t >
ze_structure_type_t getZeStructureType< ze_context_desc_t >()
Definition: pi_level_zero.hpp:83
getZeStructureType< ze_sampler_desc_t >
ze_structure_type_t getZeStructureType< ze_sampler_desc_t >()
Definition: pi_level_zero.hpp:117
_pi_queue::pi_queue_group_t::pi_queue_group_t
pi_queue_group_t(pi_queue Queue, queue_type Type)
Definition: pi_level_zero.hpp:816
USMMemoryAllocBase
Definition: pi_level_zero.hpp:390
_pi_context::SharedMemAllocContexts
std::unordered_map< pi_device, USMAllocContext > SharedMemAllocContexts
Definition: pi_level_zero.hpp:732
_pi_program::_pi_program
_pi_program(state St, pi_context Context, const void *Input, size_t Length)
Definition: pi_level_zero.hpp:1384
getZeStructureType< ze_driver_properties_t >
ze_structure_type_t getZeStructureType< ze_driver_properties_t >()
Definition: pi_level_zero.hpp:120
_pi_kernel::Hash
Definition: pi_level_zero.hpp:1470
_pi_object::Mutex
pi_shared_mutex Mutex
Definition: pi_level_zero.hpp:328
USMDeviceMemoryAlloc
Definition: pi_level_zero.hpp:434
pi_shared_mutex
Definition: pi_level_zero.hpp:196
ze_command_queue_handle_t
struct _ze_command_queue_handle_t * ze_command_queue_handle_t
Definition: backend_traits_level_zero.hpp:29
_pi_context::ZeComputeCommandListCache
std::unordered_map< ze_device_handle_t, std::list< ze_command_list_handle_t > > ZeComputeCommandListCache
Definition: pi_level_zero.hpp:690
USMMemoryAllocBase::getMemTypeImpl
virtual MemType getMemTypeImpl()=0
_pi_kernel::Program
pi_program Program
Definition: pi_level_zero.hpp:1467
_pi_kernel::OwnZeKernel
bool OwnZeKernel
Definition: pi_level_zero.hpp:1464
_pi_device::ZeDeviceCacheProperties
ZeCache< ZeStruct< ze_device_cache_properties_t > > ZeDeviceCacheProperties
Definition: pi_level_zero.hpp:547
getZeStructureType< ze_device_properties_t >
ze_structure_type_t getZeStructureType< ze_device_properties_t >()
Definition: pi_level_zero.hpp:123
_pi_program::_pi_program
_pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, bool OwnZeModule)
Definition: pi_level_zero.hpp:1398
ze_event_handle_t
struct _ze_event_handle_t * ze_event_handle_t
Definition: backend_traits_level_zero.hpp:33
_pi_result
_pi_result
Definition: pi.h:105
_pi_queue::pi_queue_group_t::ZeQueues
std::vector< ze_command_queue_handle_t > ZeQueues
Definition: pi_level_zero.hpp:824
ZesStruct::ZesStruct
ZesStruct()
Definition: pi_level_zero.hpp:178
_pi_queue::Device
const pi_device Device
Definition: pi_level_zero.hpp:876
_pi_device::ZeDeviceMemoryProperties
ZeCache< std::vector< ZeStruct< ze_device_memory_properties_t > > > ZeDeviceMemoryProperties
Definition: pi_level_zero.hpp:544
_pi_program::SpecConstants
std::unordered_map< uint32_t, const void * > SpecConstants
Definition: pi_level_zero.hpp:1431
_pi_event::CommandData
void * CommandData
Definition: pi_level_zero.hpp:1310
_pi_buffer::_pi_buffer
_pi_buffer(pi_context Context, size_t Size, char *HostPtr, bool ImportedHostPtr=false)
Definition: pi_level_zero.hpp:1052
_pi_device::hasMainCopyEngine
bool hasMainCopyEngine() const
Definition: pi_level_zero.hpp:494
getZeStructureType< ze_command_queue_group_properties_t >
ze_structure_type_t getZeStructureType< ze_command_queue_group_properties_t >()
Definition: pi_level_zero.hpp:131
_pi_sampler::_pi_sampler
_pi_sampler(ze_sampler_handle_t Sampler)
Definition: pi_level_zero.hpp:1522
_pi_device::ZeDevice
const ze_device_handle_t ZeDevice
Definition: pi_level_zero.hpp:518
ze_device_handle_t
struct _ze_device_handle_t * ze_device_handle_t
Definition: backend_traits_level_zero.hpp:31
_pi_platform::ContextsMutex
pi_shared_mutex ContextsMutex
Definition: pi_level_zero.hpp:386
usm_allocator.hpp
_pi_context::HostMemAllocContext
std::unique_ptr< USMAllocContext > HostMemAllocContext
Definition: pi_level_zero.hpp:741
_pi_context::SharedReadOnlyMemAllocContexts
std::unordered_map< pi_device, USMAllocContext > SharedReadOnlyMemAllocContexts
Definition: pi_level_zero.hpp:733
_pi_context::OwnZeContext
bool OwnZeContext
Definition: pi_level_zero.hpp:650
_pi_queue::CommandBatch
Definition: pi_level_zero.hpp:903
getZeStructureType< ze_relaxed_allocation_limits_exp_desc_t >
ze_structure_type_t getZeStructureType< ze_relaxed_allocation_limits_exp_desc_t >()
Definition: pi_level_zero.hpp:88
MemType
MemType
Definition: usm_allocator.hpp:14
_pi_image::_pi_image
_pi_image(pi_context Ctx, ze_image_handle_t Image)
Definition: pi_level_zero.hpp:1188
getZeStructureType< ze_kernel_properties_t >
ze_structure_type_t getZeStructureType< ze_kernel_properties_t >()
Definition: pi_level_zero.hpp:157
_pi_program::ZeModule
ze_module_handle_t ZeModule
Definition: pi_level_zero.hpp:1438
pi_command_list_map_t
std::unordered_map< ze_command_list_handle_t, pi_command_list_info_t > pi_command_list_map_t
Definition: pi_level_zero.hpp:584
ZeCache::ZeCache
ZeCache()
Definition: pi_level_zero.hpp:254
_pi_buffer::Mapping::Size
size_t Size
Definition: pi_level_zero.hpp:1168
USMMemoryAllocBase::deallocate
void deallocate(void *Ptr, bool OwnZeMemHandle) override final
Definition: pi_level_zero.cpp:7384
_pi_queue::pi_queue_group_t::ImmCmdLists
std::vector< pi_command_list_ptr_t > ImmCmdLists
Definition: pi_level_zero.hpp:829
_pi_context::ZeContext
const ze_context_handle_t ZeContext
Definition: pi_level_zero.hpp:646
_pi_platform
A PI platform stores all known PI devices, in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:72
_pi_ze_event_list_t::PiEventList
pi_event * PiEventList
Definition: pi_level_zero.hpp:1218
_pi_ze_event_list_t::ZeEventList
ze_event_handle_t * ZeEventList
Definition: pi_level_zero.hpp:1215
MemAllocRecord::OwnZeMemHandle
bool OwnZeMemHandle
Definition: pi_level_zero.hpp:345
_pi_kernel::SubmissionsCount
std::atomic< pi_uint32 > SubmissionsCount
Definition: pi_level_zero.hpp:1502
_pi_device::queue_group_info_t::Compute
@ Compute
Definition: pi_level_zero.hpp:471
_pi_kernel::ArgumentInfo::Index
uint32_t Index
Definition: pi_level_zero.hpp:1507
pi_mutex
Definition: pi_level_zero.hpp:226
_pi_mem::unknown
@ unknown
Definition: pi_level_zero.hpp:1026
_pi_object
Definition: pi_esimd_emulator.hpp:58
_pi_queue::CopyCommandBatch
command_batch CopyCommandBatch
Definition: pi_level_zero.hpp:925
_pi_device::ZeDeviceProperties
ZeCache< ZeStruct< ze_device_properties_t > > ZeDeviceProperties
Definition: pi_level_zero.hpp:539
_pi_queue::executeAllOpenCommandLists
pi_result executeAllOpenCommandLists()
Definition: pi_level_zero.hpp:996
ReferenceCounter::load
pi_uint32 load()
Definition: pi_level_zero.hpp:284
getZeStructureType< ze_device_mem_alloc_desc_t >
ze_structure_type_t getZeStructureType< ze_device_mem_alloc_desc_t >()
Definition: pi_level_zero.hpp:95
_pi_device::ZeDeviceImageProperties
ZeCache< ZeStruct< ze_device_image_properties_t > > ZeDeviceImageProperties
Definition: pi_level_zero.hpp:541
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:773
ZeCache::ZeCacheMutex
pi_mutex ZeCacheMutex
Definition: pi_level_zero.hpp:252
ZeCache::InitFunctionType
std::function< void(T &)> InitFunctionType
Definition: pi_level_zero.hpp:249
_pi_event::isHostVisible
bool isHostVisible() const
Definition: pi_level_zero.hpp:1289
getZeStructureType< ze_kernel_desc_t >
ze_structure_type_t getZeStructureType< ze_kernel_desc_t >()
Definition: pi_level_zero.hpp:111
ZeCache::operator->
T * operator->()
Definition: pi_level_zero.hpp:257
_pi_ze_event_list_t::operator=
_pi_ze_event_list_t & operator=(const _pi_ze_event_list_t &other)
Definition: pi_level_zero.hpp:1254
_pi_kernel::ArgumentInfo
Definition: pi_level_zero.hpp:1506
_pi_platform::ZeDriver
ze_driver_handle_t ZeDriver
Definition: pi_level_zero.hpp:358
_pi_buffer::_pi_buffer
_pi_buffer(pi_buffer Parent, size_t Origin, size_t Size)
Definition: pi_level_zero.hpp:1086
cl::sycl::detail::memcpy
void memcpy(void *Dst, const void *Src, std::size_t Size)
_pi_device::hasLinkCopyEngine
bool hasLinkCopyEngine() const
Definition: pi_level_zero.hpp:499
_pi_buffer::allocation_t
Definition: pi_level_zero.hpp:1133
_pi_mem::_pi_mem
_pi_mem(pi_context Ctx)
Definition: pi_level_zero.hpp:1044
_pi_image::getZeHandlePtr
virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, pi_device=nullptr) override
Definition: pi_level_zero.hpp:1196
_pi_ze_event_list_t::PiZeEventListMutex
pi_mutex PiZeEventListMutex
Definition: pi_level_zero.hpp:1230
_pi_queue
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:392
_pi_platform::ZeApiVersion
ze_api_version_t ZeApiVersion
Definition: pi_level_zero.hpp:363
_pi_device::queue_group_info_t
Definition: pi_level_zero.hpp:467
pi_uint32
uint32_t pi_uint32
Definition: pi.h:94
USMMemoryAllocBase::allocate
void * allocate(size_t Size) override final
Definition: pi_level_zero.cpp:7363
USMHostMemoryAlloc
Definition: pi_level_zero.hpp:446
getZeStructureType< ze_device_module_properties_t >
ze_structure_type_t getZeStructureType< ze_device_module_properties_t >()
Definition: pi_level_zero.hpp:139
_pi_device::ZeDeviceModuleProperties
ZeCache< ZeStruct< ze_device_module_properties_t > > ZeDeviceModuleProperties
Definition: pi_level_zero.hpp:542
_pi_buffer::Mapping::Offset
size_t Offset
Definition: pi_level_zero.hpp:1166
getZeStructureType< ze_command_queue_desc_t >
ze_structure_type_t getZeStructureType< ze_command_queue_desc_t >()
Definition: pi_level_zero.hpp:98
getZeStructureType< ze_event_desc_t >
ze_structure_type_t getZeStructureType< ze_event_desc_t >()
Definition: pi_level_zero.hpp:114
_pi_image::getZeHandle
virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, pi_device=nullptr) override
Definition: pi_level_zero.hpp:1191
_pi_platform::DeviceCachePopulated
bool DeviceCachePopulated
Definition: pi_esimd_emulator.hpp:71
USMHostMemoryAlloc::USMHostMemoryAlloc
USMHostMemoryAlloc(pi_context Ctx)
Definition: pi_level_zero.hpp:453
_pi_platform::zeDriverExtensionMap
std::unordered_map< std::string, uint32_t > zeDriverExtensionMap
Definition: pi_level_zero.hpp:366
_pi_queue::Properties
pi_queue_properties Properties
Definition: pi_level_zero.hpp:934
_pi_context::ZeCommandListCacheMutex
pi_mutex ZeCommandListCacheMutex
Definition: pi_level_zero.hpp:680
_pi_buffer::Origin
size_t Origin
Definition: pi_level_zero.hpp:1181
_pi_kernel::_pi_kernel
_pi_kernel(ze_kernel_handle_t Kernel, bool OwnZeKernel, pi_program Program)
Definition: pi_level_zero.hpp:1445
ZeCache
Definition: pi_level_zero.hpp:244
_pi_queue::pi_queue_group_t
Definition: pi_level_zero.hpp:809
_pi_program::SpecConstantShim::ze
const ze_module_constants_t * ze()
Definition: pi_level_zero.hpp:1375
_pi_platform::ZeDriverApiVersion
std::string ZeDriverApiVersion
Definition: pi_level_zero.hpp:362
_pi_platform::Contexts
std::list< pi_context > Contexts
Definition: pi_level_zero.hpp:385
_pi_buffer::Allocations
std::unordered_map< pi_device, allocation_t > Allocations
Definition: pi_level_zero.hpp:1152
pi_shared_mutex::lock
void lock()
Definition: pi_level_zero.hpp:198
_pi_queue::getQueueGroup
pi_queue_group_t & getQueueGroup(bool UseCopyEngine)
Definition: pi_level_zero.hpp:859
MemAllocRecord::MemAllocRecord
MemAllocRecord(pi_context Context, bool OwnZeMemHandle=true)
Definition: pi_level_zero.hpp:334
_pi_event::WaitList
_pi_ze_event_list_t WaitList
Definition: pi_level_zero.hpp:1316
_pi_context::SharedReadOnlyAllocs
std::unordered_set< void * > SharedReadOnlyAllocs
Definition: pi_level_zero.hpp:738
_pi_ze_event_list_t::Length
pi_uint32 Length
Definition: pi_level_zero.hpp:1223
getZeStructureType< ze_host_mem_alloc_desc_t >
ze_structure_type_t getZeStructureType< ze_host_mem_alloc_desc_t >()
Definition: pi_level_zero.hpp:91
USMSharedMemoryAlloc::USMSharedMemoryAlloc
USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:417
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:718
_pi_sampler
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:944
getZeStructureType< ze_device_cache_properties_t >
ze_structure_type_t getZeStructureType< ze_device_cache_properties_t >()
Definition: pi_level_zero.hpp:143
getZeStructureType< ze_module_program_exp_desc_t >
ze_structure_type_t getZeStructureType< ze_module_program_exp_desc_t >()
Definition: pi_level_zero.hpp:108
ReferenceCounter::decrementAndTest
bool decrementAndTest()
Definition: pi_level_zero.hpp:301
_pi_event::OwnZeEvent
bool OwnZeEvent
Definition: pi_level_zero.hpp:1273
ReferenceCounter::increment
void increment()
Definition: pi_level_zero.hpp:280
ZeCache::Compute
InitFunctionType Compute
Definition: pi_level_zero.hpp:250
_pi_kernel::ZeKernelProperties
ZeCache< ZeStruct< ze_kernel_properties_t > > ZeKernelProperties
Definition: pi_level_zero.hpp:1517
_pi_program::SpecConstantShim::SpecConstantShim
SpecConstantShim(pi_program Program)
Definition: pi_level_zero.hpp:1362
getZeStructureType< ze_command_list_desc_t >
ze_structure_type_t getZeStructureType< ze_command_list_desc_t >()
Definition: pi_level_zero.hpp:80
_pi_program::Exe
@ Exe
Definition: pi_level_zero.hpp:1350
_pi_platform::PiDevicesCache
std::vector< std::unique_ptr< _pi_device > > PiDevicesCache
Definition: pi_level_zero.hpp:369
_pi_object::RefCount
ReferenceCounter RefCount
Definition: pi_level_zero.hpp:313
getZeStructureType< ze_device_memory_access_properties_t >
ze_structure_type_t getZeStructureType< ze_device_memory_access_properties_t >()
Definition: pi_level_zero.hpp:151
pi_command_list_info_t::size
size_t size() const
Definition: pi_level_zero.hpp:578
pi_mutex::unlock
void unlock()
Definition: pi_level_zero.hpp:233
_pi_device::queue_group_info_t::MainCopy
@ MainCopy
Definition: pi_level_zero.hpp:469
_pi_context::Devices
const std::vector< pi_device > Devices
Definition: pi_level_zero.hpp:655
USMMemoryAllocBase::Device
pi_device Device
Definition: pi_level_zero.hpp:393
_pi_queue::CommandBatch::OpenCommandList
pi_command_list_ptr_t OpenCommandList
Definition: pi_level_zero.hpp:912
USMSharedReadOnlyMemoryAlloc
Definition: pi_level_zero.hpp:422
getZeStructureType< ze_fence_desc_t >
ze_structure_type_t getZeStructureType< ze_fence_desc_t >()
Definition: pi_level_zero.hpp:77
_pi_queue::hasOpenCommandList
bool hasOpenCommandList(bool IsCopy) const
Definition: pi_level_zero.hpp:964
_pi_kernel::ArgumentInfo::Size
size_t Size
Definition: pi_level_zero.hpp:1508
_pi_kernel::PendingArguments
std::vector< ArgumentInfo > PendingArguments
Definition: pi_level_zero.hpp:1514
_pi_kernel::ZeKernel
ze_kernel_handle_t ZeKernel
Definition: pi_level_zero.hpp:1460
pi_shared_mutex::unlock
void unlock()
Definition: pi_level_zero.hpp:205
_pi_device::ZeDeviceMemoryAccessProperties
ZeCache< ZeStruct< ze_device_memory_access_properties_t > > ZeDeviceMemoryAccessProperties
Definition: pi_level_zero.hpp:546
_pi_buffer::Mapping
Definition: pi_level_zero.hpp:1164
_pi_context::_pi_context
_pi_context(ze_context_handle_t ZeContext, pi_uint32 NumDevices, const pi_device *Devs, bool OwnZeContext)
Definition: pi_level_zero.hpp:589
ZesStruct
Definition: pi_level_zero.hpp:177
ze_context_handle_t
struct _ze_context_handle_t * ze_context_handle_t
Definition: backend_traits_level_zero.hpp:30
_pi_context::ZeCopyCommandListCache
std::unordered_map< ze_device_handle_t, std::list< ze_command_list_handle_t > > ZeCopyCommandListCache
Definition: pi_level_zero.hpp:692
_pi_ze_event_list_t
Definition: pi_level_zero.hpp:1213
_pi_kernel::ArgumentInfo::Value
const pi_mem Value
Definition: pi_level_zero.hpp:1509
cl::sycl::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:220
getZeStructureType< ze_image_desc_t >
ze_structure_type_t getZeStructureType< ze_image_desc_t >()
Definition: pi_level_zero.hpp:101
ReferenceCounter::ReferenceCounter
ReferenceCounter(pi_uint32 InitVal)
Definition: pi_level_zero.hpp:277
getZesStructureType< zes_pci_properties_t >
zes_structure_type_t getZesStructureType< zes_pci_properties_t >()
Definition: pi_level_zero.hpp:165
pi_mutex::try_lock
bool try_lock()
Definition: pi_level_zero.hpp:232
pi_queue_properties
pi_bitfield pi_queue_properties
Definition: pi.h:559
SingleThreadMode
static const bool SingleThreadMode
Definition: pi_level_zero.hpp:187
_pi_queue::pi_queue_group_t::Queue
pi_queue Queue
Definition: pi_level_zero.hpp:810
_pi_device::queue_group_info_t::LinkCopy
@ LinkCopy
Definition: pi_level_zero.hpp:470
_pi_program::CodeLength
size_t CodeLength
Definition: pi_level_zero.hpp:1425
_pi_buffer::Mappings
std::unordered_map< void *, Mapping > Mappings
Definition: pi_level_zero.hpp:1173
_pi_buffer::_pi_buffer
_pi_buffer(pi_context Context, size_t Size, pi_device Device, char *ZeMemHandle, bool OwnZeMemHandle)
Definition: pi_level_zero.hpp:1090
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:590
_pi_context::MemAllocs
std::unordered_map< void *, MemAllocRecord > MemAllocs
Definition: pi_level_zero.hpp:748
_pi_kernel::Hash::operator()
size_t operator()(const std::pair< void *const, MemAllocRecord > *P) const
Definition: pi_level_zero.hpp:1471
MemAllocRecord
Definition: pi_level_zero.hpp:333
_pi_buffer::allocation_t::free
@ free
Definition: pi_level_zero.hpp:1144
_pi_device::Platform
const pi_platform Platform
Definition: pi_level_zero.hpp:529
_pi_device::queue_group_info_t::ZeProperties
ZeStruct< ze_command_queue_group_properties_t > ZeProperties
Definition: pi_level_zero.hpp:487
_pi_program::Code
std::unique_ptr< uint8_t[]> Code
Definition: pi_level_zero.hpp:1424
ze_driver_handle_t
struct _ze_driver_handle_t * ze_driver_handle_t
Definition: backend_traits_level_zero.hpp:32
_pi_device::isSubDevice
bool isSubDevice()
Definition: pi_level_zero.hpp:536
_pi_queue::OwnZeCommandQueue
bool OwnZeCommandQueue
Definition: pi_level_zero.hpp:897
pi_shared_mutex::try_lock_shared
bool try_lock_shared()
Definition: pi_level_zero.hpp:214
ZeStruct::ZeStruct
ZeStruct()
Definition: pi_level_zero.hpp:172
_pi_queue::KernelsToBeSubmitted
std::vector< pi_kernel > KernelsToBeSubmitted
Definition: pi_level_zero.hpp:890
_pi_kernel::hasIndirectAccess
bool hasIndirectAccess()
Definition: pi_level_zero.hpp:1453
_pi_program::state
state
Definition: pi_level_zero.hpp:1333
USMMemoryAllocBase::allocateImpl
virtual pi_result allocateImpl(void **ResultPtr, size_t Size, pi_uint32 Alignment)=0
_pi_device::SubDevices
std::vector< pi_device > SubDevices
Definition: pi_level_zero.hpp:524
_pi_image::isImage
bool isImage() const override
Definition: pi_level_zero.hpp:1202
_pi_platform::ZeDriverVersion
std::string ZeDriverVersion
Definition: pi_level_zero.hpp:361
_pi_platform::getDeviceFromNativeHandle
pi_device getDeviceFromNativeHandle(ze_device_handle_t)
Definition: pi_level_zero.cpp:2203
_pi_object::_pi_object
_pi_object()
Definition: pi_level_zero.hpp:309
_pi_event::_pi_event
_pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool, pi_context Context, pi_command_type CommandType, bool OwnZeEvent)
Definition: pi_level_zero.hpp:1263
_pi_platform::populateDeviceCacheIfNeeded
pi_result populateDeviceCacheIfNeeded()
Definition: pi_esimd_emulator.cpp:524
_pi_program::Native
@ Native
Definition: pi_level_zero.hpp:1341
_pi_kernel::MemAllocs
std::unordered_set< std::pair< void *const, MemAllocRecord > *, Hash > MemAllocs
Definition: pi_level_zero.hpp:1491
_pi_context::ImmediateCommandListMutex
pi_mutex ImmediateCommandListMutex
Definition: pi_level_zero.hpp:676
_pi_queue::CommandListMap
pi_command_list_map_t CommandListMap
Definition: pi_level_zero.hpp:900
_pi_queue::Context
const pi_context Context
Definition: pi_level_zero.hpp:871
ze_kernel_handle_t
struct _ze_kernel_handle_t * ze_kernel_handle_t
Definition: backend_traits_level_zero.hpp:35
USMSharedReadOnlyMemoryAlloc::getMemTypeImpl
MemType getMemTypeImpl() override
Definition: pi_level_zero.hpp:426
_pi_program::SpecConstantShim
Definition: pi_level_zero.hpp:1360
_pi_program::OwnZeModule
const bool OwnZeModule
Definition: pi_level_zero.hpp:1414
getZeStructureType< ze_device_image_properties_t >
ze_structure_type_t getZeStructureType< ze_device_image_properties_t >()
Definition: pi_level_zero.hpp:135
ReferenceCounter
Definition: pi_level_zero.hpp:276
_pi_program::State
state State
Definition: pi_level_zero.hpp:1420
_pi_mem::access_mode_t
access_mode_t
Definition: pi_level_zero.hpp:1026
_pi_program::BuildFlags
std::string BuildFlags
Definition: pi_level_zero.hpp:1435
pi_command_list_ptr_t
pi_command_list_map_t::iterator pi_command_list_ptr_t
Definition: pi_level_zero.hpp:586
_pi_command_type
_pi_command_type
Definition: pi.h:363
pi_shared_mutex::lock_shared
void lock_shared()
Definition: pi_level_zero.hpp:210
_pi_queue::pi_queue_group_t::Type
queue_type Type
Definition: pi_level_zero.hpp:820
ZeStruct
Definition: pi_level_zero.hpp:171
_pi_sampler::ZeSampler
ze_sampler_handle_t ZeSampler
Definition: pi_level_zero.hpp:1525
_pi_device::ZeDeviceComputeProperties
ZeCache< ZeStruct< ze_device_compute_properties_t > > ZeDeviceComputeProperties
Definition: pi_level_zero.hpp:540
pi_command_list_info_t
Definition: pi_level_zero.hpp:553
getZeStructureType< ze_event_pool_desc_t >
ze_structure_type_t getZeStructureType< ze_event_pool_desc_t >()
Definition: pi_level_zero.hpp:74
_pi_program::_pi_program
_pi_program(state St, pi_context Context, const std::string &ErrorMessage)
Definition: pi_level_zero.hpp:1404
ZeCache::Computed
bool Computed
Definition: pi_level_zero.hpp:251
getZeStructureType< ze_device_compute_properties_t >
ze_structure_type_t getZeStructureType< ze_device_compute_properties_t >()
Definition: pi_level_zero.hpp:127
USMMemoryAllocBase::getMemType
MemType getMemType() override final
Definition: pi_level_zero.cpp:7391
USMMemoryAllocBase::Context
pi_context Context
Definition: pi_level_zero.hpp:392
_pi_buffer::allocation_t::unimport
@ unimport
Definition: pi_level_zero.hpp:1143
pi_cast
To pi_cast(From Value)
Definition: pi_level_zero.hpp:49
_pi_event::ZeEvent
ze_event_handle_t ZeEvent
Definition: pi_level_zero.hpp:1269
pi_command_list_info_t::append
void append(pi_event Event)
Definition: pi_level_zero.hpp:579
PI_QUEUE_PROFILING_ENABLE
constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE
Definition: pi.h:561
getZesStructureType
zes_structure_type_t getZesStructureType()
_pi_program::Context
const pi_context Context
Definition: pi_level_zero.hpp:1410
pi_mutex::lock
void lock()
Definition: pi_level_zero.hpp:228
_pi_context
PI context mapping to a CUDA context object.
Definition: pi_cuda.hpp:159
getZeStructureType< ze_memory_allocation_properties_t >
ze_structure_type_t getZeStructureType< ze_memory_allocation_properties_t >()
Definition: pi_level_zero.hpp:161
_pi_device
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:82
SharedReadOnly
@ SharedReadOnly
Definition: usm_allocator.hpp:14
pi_shared_mutex::unlock_shared
void unlock_shared()
Definition: pi_level_zero.hpp:217
_pi_program::IL
@ IL
Definition: pi_level_zero.hpp:1336