DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_level_zero.hpp
Go to the documentation of this file.
1 //===--------- pi_level_zero.hpp - Level Zero Plugin ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===-----------------------------------------------------------------===//
8 
11 
17 
18 #ifndef PI_LEVEL_ZERO_HPP
19 #define PI_LEVEL_ZERO_HPP
20 
21 // This version should be incremented for any change made to this file or its
22 // corresponding .cpp file.
23 #define _PI_LEVEL_ZERO_PLUGIN_VERSION 1
24 
25 #define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \
26  _PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION)
27 
28 #include <atomic>
29 #include <cassert>
30 #include <cstring>
31 #include <functional>
32 #include <list>
33 #include <map>
34 #include <memory>
35 #include <mutex>
36 #include <shared_mutex>
37 #include <string>
38 #include <sycl/detail/pi.h>
39 #include <unordered_map>
40 #include <unordered_set>
41 #include <vector>
42 
43 #include <level_zero/ze_api.h>
44 #include <level_zero/zes_api.h>
46 
47 #include "usm_allocator.hpp"
48 
49 template <class To, class From> To pi_cast(From Value) {
50  // TODO: see if more sanity checks are possible.
51  assert(sizeof(From) == sizeof(To));
52  return (To)(Value);
53 }
54 
55 template <> uint32_t pi_cast(uint64_t Value) {
56  // Cast value and check that we don't lose any information.
57  uint32_t CastedValue = (uint32_t)(Value);
58  assert((uint64_t)CastedValue == Value);
59  return CastedValue;
60 }
61 
62 // TODO: Currently die is defined in each plugin. Probably some
63 // common header file with utilities should be created.
64 [[noreturn]] void die(const char *Message) {
65  std::cerr << "die: " << Message << std::endl;
66  std::terminate();
67 }
68 
69 // Returns the ze_structure_type_t to use in .stype of a structured descriptor.
70 // Intentionally not defined; will give an error if no proper specialization
71 template <class T> ze_structure_type_t getZeStructureType();
72 template <class T> zes_structure_type_t getZesStructureType();
73 
74 template <> ze_structure_type_t getZeStructureType<ze_event_pool_desc_t>() {
75  return ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
76 }
77 template <> ze_structure_type_t getZeStructureType<ze_fence_desc_t>() {
78  return ZE_STRUCTURE_TYPE_FENCE_DESC;
79 }
80 template <> ze_structure_type_t getZeStructureType<ze_command_list_desc_t>() {
81  return ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC;
82 }
83 template <> ze_structure_type_t getZeStructureType<ze_context_desc_t>() {
84  return ZE_STRUCTURE_TYPE_CONTEXT_DESC;
85 }
86 template <>
87 ze_structure_type_t
89  return ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC;
90 }
91 template <> ze_structure_type_t getZeStructureType<ze_host_mem_alloc_desc_t>() {
92  return ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
93 }
94 template <>
96  return ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
97 }
98 template <> ze_structure_type_t getZeStructureType<ze_command_queue_desc_t>() {
99  return ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
100 }
101 template <> ze_structure_type_t getZeStructureType<ze_image_desc_t>() {
102  return ZE_STRUCTURE_TYPE_IMAGE_DESC;
103 }
104 template <> ze_structure_type_t getZeStructureType<ze_module_desc_t>() {
105  return ZE_STRUCTURE_TYPE_MODULE_DESC;
106 }
107 template <>
109  return ZE_STRUCTURE_TYPE_MODULE_PROGRAM_EXP_DESC;
110 }
111 template <> ze_structure_type_t getZeStructureType<ze_kernel_desc_t>() {
112  return ZE_STRUCTURE_TYPE_KERNEL_DESC;
113 }
114 template <> ze_structure_type_t getZeStructureType<ze_event_desc_t>() {
115  return ZE_STRUCTURE_TYPE_EVENT_DESC;
116 }
117 template <> ze_structure_type_t getZeStructureType<ze_sampler_desc_t>() {
118  return ZE_STRUCTURE_TYPE_SAMPLER_DESC;
119 }
120 template <> ze_structure_type_t getZeStructureType<ze_driver_properties_t>() {
121  return ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
122 }
123 template <> ze_structure_type_t getZeStructureType<ze_device_properties_t>() {
124  return ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
125 }
126 template <>
128  return ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
129 }
130 template <>
132  return ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
133 }
134 template <>
136  return ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
137 }
138 template <>
140  return ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
141 }
142 template <>
144  return ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
145 }
146 template <>
148  return ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
149 }
150 template <>
152  return ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
153 }
154 template <> ze_structure_type_t getZeStructureType<ze_module_properties_t>() {
155  return ZE_STRUCTURE_TYPE_MODULE_PROPERTIES;
156 }
157 template <> ze_structure_type_t getZeStructureType<ze_kernel_properties_t>() {
158  return ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
159 }
160 template <>
162  return ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
163 }
164 
165 template <> zes_structure_type_t getZesStructureType<zes_pci_properties_t>() {
166  return ZES_STRUCTURE_TYPE_PCI_PROPERTIES;
167 }
168 
169 template <> zes_structure_type_t getZesStructureType<zes_mem_state_t>() {
170  return ZES_STRUCTURE_TYPE_MEM_STATE;
171 }
172 
173 template <> zes_structure_type_t getZesStructureType<zes_mem_properties_t>() {
174  return ZES_STRUCTURE_TYPE_MEM_PROPERTIES;
175 }
176 
177 // The helpers to properly default initialize Level-Zero descriptor and
178 // properties structures.
179 template <class T> struct ZeStruct : public T {
180  ZeStruct() : T{} { // zero initializes base struct
181  this->stype = getZeStructureType<T>();
182  this->pNext = nullptr;
183  }
184 };
185 template <class T> struct ZesStruct : public T {
186  ZesStruct() : T{} { // zero initializes base struct
187  this->stype = getZesStructureType<T>();
188  this->pNext = nullptr;
189  }
190 };
191 
192 // A single-threaded app has an opportunity to enable this mode to avoid
193 // overhead from mutex locking. Default value is 0 which means that single
194 // thread mode is disabled.
195 static const bool SingleThreadMode = [] {
196  const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE");
197  const bool RetVal = Ret ? std::stoi(Ret) : 0;
198  return RetVal;
199 }();
200 
201 // Class which acts like shared_mutex if SingleThreadMode variable is not set.
202 // If SingleThreadMode variable is set then mutex operations are turned into
203 // nop.
204 class pi_shared_mutex : public std::shared_mutex {
205 public:
206  void lock() {
207  if (!SingleThreadMode)
208  std::shared_mutex::lock();
209  }
210  bool try_lock() {
211  return SingleThreadMode ? true : std::shared_mutex::try_lock();
212  }
213  void unlock() {
214  if (!SingleThreadMode)
215  std::shared_mutex::unlock();
216  }
217 
218  void lock_shared() {
219  if (!SingleThreadMode)
220  std::shared_mutex::lock_shared();
221  }
223  return SingleThreadMode ? true : std::shared_mutex::try_lock_shared();
224  }
225  void unlock_shared() {
226  if (!SingleThreadMode)
227  std::shared_mutex::unlock_shared();
228  }
229 };
230 
231 // Class which acts like std::mutex if SingleThreadMode variable is not set.
232 // If SingleThreadMode variable is set then mutex operations are turned into
233 // nop.
234 class pi_mutex : public std::mutex {
235 public:
236  void lock() {
237  if (!SingleThreadMode)
238  std::mutex::lock();
239  }
240  bool try_lock() { return SingleThreadMode ? true : std::mutex::try_lock(); }
241  void unlock() {
242  if (!SingleThreadMode)
243  std::mutex::unlock();
244  }
245 };
246 
247 // The wrapper for immutable Level-Zero data.
248 // The data is initialized only once at first access (via ->) with the
249 // initialization function provided in Init. All subsequent access to
250 // the data just returns the already stored data.
251 //
252 template <class T> struct ZeCache : private T {
253  // The initialization function takes a reference to the data
254  // it is going to initialize, since it is private here in
255  // order to disallow access other than through "->".
256  //
257  typedef std::function<void(T &)> InitFunctionType;
259  bool Computed{false};
261 
262  ZeCache() : T{} {}
263 
264  // Access to the fields of the original T data structure.
265  T *operator->() {
266  std::unique_lock<pi_mutex> Lock(ZeCacheMutex);
267  if (!Computed) {
268  Compute(*this);
269  Computed = true;
270  }
271  return this;
272  }
273 };
274 
275 // This wrapper around std::atomic is created to limit operations with reference
276 // counter and to make allowed operations more transparent in terms of
277 // thread-safety in the plugin. increment() and load() operations do not need a
278 // mutex guard around them since the underlying data is already atomic.
279 // decrementAndTest() method is used to guard a code which needs to be
280 // executed when object's ref count becomes zero after release. This method also
281 // doesn't need a mutex guard because decrement operation is atomic and only one
282 // thread can reach ref count equal to zero, i.e. only a single thread can pass
283 // through this check.
285  ReferenceCounter() : RefCount{1} {}
286 
287  // Reset the counter to the initial value.
288  void reset() { RefCount = 1; }
289 
290  // Used when retaining an object.
291  void increment() { RefCount++; }
292 
293  // Supposed to be used in pi*GetInfo* methods where ref count value is
294  // requested.
295  pi_uint32 load() { return RefCount.load(); }
296 
297  // This method allows to guard a code which needs to be executed when object's
298  // ref count becomes zero after release. It is important to notice that only a
299  // single thread can pass through this check. This is true because of several
300  // reasons:
301  // 1. Decrement operation is executed atomically.
302  // 2. It is not allowed to retain an object after its refcount reaches zero.
303  // 3. It is not allowed to release an object more times than the value of
304  // the ref count.
305  // 2. and 3. basically means that we can't use an object at all as soon as its
306  // refcount reaches zero. Using this check guarantees that code for deleting
307  // an object and releasing its resources is executed once by a single thread
308  // and we don't need to use any mutexes to guard access to this object in the
309  // scope after this check. Of course if we access another objects in this code
310  // (not the one which is being deleted) then access to these objects must be
311  // guarded, for example with a mutex.
312  bool decrementAndTest() { return --RefCount == 0; }
313 
314 private:
315  std::atomic<pi_uint32> RefCount;
316 };
317 
318 // Base class to store common data
319 struct _pi_object {
321 
322  // Level Zero doesn't do the reference counting, so we have to do.
323  // Must be atomic to prevent data race when incrementing/decrementing.
325 
326  // This mutex protects accesses to all the non-const member variables.
327  // Exclusive access is required to modify any of these members.
328  //
329  // To get shared access to the object in a scope use std::shared_lock:
330  // std::shared_lock Lock(Obj->Mutex);
331  // To get exclusive access to the object in a scope use std::scoped_lock:
332  // std::scoped_lock Lock(Obj->Mutex);
333  //
334  // If several pi objects are accessed in a scope then each object's mutex must
335  // be locked. For example, to get write access to Obj1 and Obj2 and read
336  // access to Obj3 in a scope use the following approach:
337  // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock);
338  // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock);
340 };
341 
342 // Record for a memory allocation. This structure is used to keep information
343 // for each memory allocation.
347  // Currently kernel can reference memory allocations from different contexts
348  // and we need to know the context of a memory allocation when we release it
349  // in piKernelRelease.
350  // TODO: this should go away when memory isolation issue is fixed in the Level
351  // Zero runtime.
353 
354  // Indicates if we own the native memory handle or it came from interop that
355  // asked to not transfer the ownership to SYCL RT.
357 };
358 
359 // Define the types that are opaque in pi.h in a manner suitabale for Level Zero
360 // plugin
361 
362 struct _pi_platform {
364  // Performs initialization of a newly constructed PI platform.
366 
367  // Level Zero lacks the notion of a platform, but there is a driver, which is
368  // a pretty good fit to keep here.
370 
371  // Cache versions info from zeDriverGetProperties.
372  std::string ZeDriverVersion;
373  std::string ZeDriverApiVersion;
374  ze_api_version_t ZeApiVersion;
375 
376  // Cache driver extensions
377  std::unordered_map<std::string, uint32_t> zeDriverExtensionMap;
378 
379  // Cache pi_devices for reuse
380  std::vector<std::unique_ptr<_pi_device>> PiDevicesCache;
382  bool DeviceCachePopulated = false;
383 
384  // Check the device cache and load it if necessary.
386 
387  // Return the PI device from cache that represents given native device.
388  // If not found, then nullptr is returned.
390 
391  // Keep track of all contexts in the platform. This is needed to manage
392  // a lifetime of memory allocations in each context when there are kernels
393  // with indirect access.
394  // TODO: should be deleted when memory isolation in the context is implemented
395  // in the driver.
396  std::list<pi_context> Contexts;
398 };
399 
400 // Implements memory allocation via L0 RT for USM allocator interface.
402 protected:
405  // Internal allocation routine which must be implemented for each allocation
406  // type
407  virtual pi_result allocateImpl(void **ResultPtr, size_t Size,
408  pi_uint32 Alignment) = 0;
409  virtual MemType getMemTypeImpl() = 0;
410 
411 public:
413  : Context{Ctx}, Device{Dev} {}
414  void *allocate(size_t Size) override final;
415  void *allocate(size_t Size, size_t Alignment) override final;
416  void deallocate(void *Ptr, bool OwnZeMemHandle) override final;
417  MemType getMemType() override final;
418 };
419 
420 // Allocation routines for shared memory type
422 protected:
423  pi_result allocateImpl(void **ResultPtr, size_t Size,
424  pi_uint32 Alignment) override;
425  MemType getMemTypeImpl() override;
426 
427 public:
429  : USMMemoryAllocBase(Ctx, Dev) {}
430 };
431 
432 // Allocation routines for shared memory type that is only modified from host.
434 protected:
435  pi_result allocateImpl(void **ResultPtr, size_t Size,
436  pi_uint32 Alignment) override;
438 
439 public:
441  : USMMemoryAllocBase(Ctx, Dev) {}
442 };
443 
444 // Allocation routines for device memory type
446 protected:
447  pi_result allocateImpl(void **ResultPtr, size_t Size,
448  pi_uint32 Alignment) override;
449  MemType getMemTypeImpl() override;
450 
451 public:
453  : USMMemoryAllocBase(Ctx, Dev) {}
454 };
455 
456 // Allocation routines for host memory type
458 protected:
459  pi_result allocateImpl(void **ResultPtr, size_t Size,
460  pi_uint32 Alignment) override;
461  MemType getMemTypeImpl() override;
462 
463 public:
465 };
466 
468  // All events are created host-visible.
470  // All events are created with device-scope and only when
471  // host waits them or queries their status that a proxy
472  // host-visible event is created and set to signal after
473  // original event signals.
475  // All events are created with device-scope and only
476  // when a batch of commands is submitted for execution a
477  // last command in that batch is added to signal host-visible
478  // completion of each command in this batch (the default mode).
480 };
481 
482 struct _pi_device : _pi_object {
484  pi_device ParentDevice = nullptr)
485  : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice},
486  ImmCommandListsPreferred{false}, ZeDeviceProperties{},
487  ZeDeviceComputeProperties{} {
488  // NOTE: one must additionally call initialize() to complete
489  // PI device creation.
490  }
491 
492  // The helper structure that keeps info about a command queue groups of the
493  // device. It is not changed after it is initialized.
495  typedef enum {
499  Size // must be last
500  } type;
501 
502  // Keep the ordinal of the commands group as returned by
503  // zeDeviceGetCommandQueueGroupProperties. A value of "-1" means that
504  // there is no such queue group available in the Level Zero runtime.
505  int32_t ZeOrdinal{-1};
506 
507  // Keep the index of the specific queue in this queue group where
508  // all the command enqueues of the corresponding type should go to.
509  // The value of "-1" means that no hard binding is defined and
510  // implementation can choose specific queue index on its own.
511  int32_t ZeIndex{-1};
512 
513  // Keeps the queue group properties.
515  };
516 
517  std::vector<queue_group_info_t> QueueGroup =
518  std::vector<queue_group_info_t>(queue_group_info_t::Size);
519 
520  // This returns "true" if a main copy engine is available for use.
521  bool hasMainCopyEngine() const {
522  return QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal >= 0;
523  }
524 
525  // This returns "true" if a link copy engine is available for use.
526  bool hasLinkCopyEngine() const {
527  return QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal >= 0;
528  }
529 
530  // This returns "true" if a main or link copy engine is available for use.
531  bool hasCopyEngine() const {
532  return hasMainCopyEngine() || hasLinkCopyEngine();
533  }
534 
535  // Initialize the entire PI device.
536  // Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute
537  // command queue ordinal and index respectively, used to initialize
538  // sub-sub-devices.
539  pi_result initialize(int SubSubDeviceOrdinal = -1,
540  int SubSubDeviceIndex = -1);
541 
542  // Level Zero device handle.
543  // This field is only set at _pi_device creation time, and cannot change.
544  // Therefore it can be accessed without holding a lock on this _pi_device.
546 
547  // Keep the subdevices that are partitioned from this pi_device for reuse
548  // The order of sub-devices in this vector is repeated from the
549  // ze_device_handle_t array that are returned from zeDeviceGetSubDevices()
550  // call, which will always return sub-devices in the fixed same order.
551  std::vector<pi_device> SubDevices;
552 
553  // PI platform to which this device belongs.
554  // This field is only set at _pi_device creation time, and cannot change.
555  // Therefore it can be accessed without holding a lock on this _pi_device.
557 
558  // Root-device of a sub-device, null if this is not a sub-device.
559  // This field is only set at _pi_device creation time, and cannot change.
560  // Therefore it can be accessed without holding a lock on this _pi_device.
562 
563  // Whether to use immediate commandlists for queues on this device.
564  // For some devices (e.g. PVC) immediate commandlists are preferred.
566 
567  // Return the Events scope to be used in for this device.
568  enum EventsScope eventsScope();
569 
570  // Return whether to use immediate commandlists for this device.
571  bool useImmediateCommandLists();
572 
573  bool isSubDevice() { return RootDevice != nullptr; }
574 
575  // Cache of the immutable device properties.
585 };
586 
587 // Structure describing the specific use of a command-list in a queue.
588 // This is because command-lists are re-used across multiple queues
589 // in the same context.
591  // The Level-Zero fence that will be signalled at completion.
592  // Immediate commandlists do not have an associated fence.
593  // A nullptr for the fence indicates that this is an immediate commandlist.
594  ze_fence_handle_t ZeFence{nullptr};
595  // Record if the fence is in use.
596  // This is needed to avoid leak of the tracked command-list if the fence
597  // was not yet signaled at the time all events in that list were already
598  // completed (we are polling the fence at events completion). The fence
599  // may be still "in-use" due to sporadic delay in HW.
600  bool ZeFenceInUse{false};
601 
602  // Record the queue to which the command list will be submitted.
603  ze_command_queue_handle_t ZeQueue{nullptr};
604  // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr
605  uint32_t ZeQueueGroupOrdinal{0};
606  // Helper functions to tell if this is a copy command-list.
607  bool isCopy(pi_queue Queue) const;
608 
609  // Keeps events created by commands submitted into this command-list.
610  // TODO: use this for explicit wait/cleanup of events at command-list
611  // completion.
612  // TODO: use this for optimizing events in the same command-list, e.g.
613  // only have last one visible to the host.
614  std::vector<pi_event> EventList{};
615  size_t size() const { return EventList.size(); }
616  void append(pi_event Event) { EventList.push_back(Event); }
617 };
618 
619 // The map type that would track all command-lists in a queue.
620 typedef std::unordered_map<ze_command_list_handle_t, pi_command_list_info_t>
622 // The iterator pointing to a specific command-list in use.
623 typedef pi_command_list_map_t::iterator pi_command_list_ptr_t;
624 
625 struct _pi_context : _pi_object {
627  const pi_device *Devs, bool OwnZeContext)
628  : ZeContext{ZeContext},
629  OwnZeContext{OwnZeContext}, Devices{Devs, Devs + NumDevices},
630  SingleRootDevice(getRootDevice()), ZeCommandListInit{nullptr} {
631  // NOTE: one must additionally call initialize() to complete
632  // PI context creation.
633  }
634 
635  // Initialize the PI context.
637 
638  // Finalize the PI context
639  pi_result finalize();
640 
641  // Return the Platform, which is the same for all devices in the context
642  pi_platform getPlatform() const { return Devices[0]->Platform; }
643 
644  // A L0 context handle is primarily used during creation and management of
645  // resources that may be used by multiple devices.
646  // This field is only set at _pi_context creation time, and cannot change.
647  // Therefore it can be accessed without holding a lock on this _pi_context.
649 
650  // Indicates if we own the ZeContext or it came from interop that
651  // asked to not transfer the ownership to SYCL RT.
653 
654  // Keep the PI devices this PI context was created for.
655  // This field is only set at _pi_context creation time, and cannot change.
656  // Therefore it can be accessed without holding a lock on this _pi_context.
657  const std::vector<pi_device> Devices;
658 
659  // Checks if Device is covered by this context.
660  // For that the Device or its root devices need to be in the context.
662  while (Device) {
663  if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end())
664  return true;
666  }
667  return false;
668  }
669 
670  // If context contains one device or sub-devices of the same device, we want
671  // to save this device.
672  // This field is only set at _pi_context creation time, and cannot change.
673  // Therefore it can be accessed without holding a lock on this _pi_context.
674  const pi_device SingleRootDevice = nullptr;
675 
676  // Immediate Level Zero command list for the device in this context, to be
677  // used for initializations. To be created as:
678  // - Immediate command list: So any command appended to it is immediately
679  // offloaded to the device.
680  // - Synchronous: So implicit synchronization is made inside the level-zero
681  // driver.
682  // There will be a list of immediate command lists (for each device) when
683  // support of the multiple devices per context will be added.
684  ze_command_list_handle_t ZeCommandListInit;
685 
686  // Mutex for the immediate command list. Per the Level Zero spec memory copy
687  // operations submitted to an immediate command list are not allowed to be
688  // called from simultaneous threads.
690 
691  // Mutex Lock for the Command List Cache. This lock is used to control both
692  // compute and copy command list caches.
694  // Cache of all currently available/completed command/copy lists.
695  // Note that command-list can only be re-used on the same device.
696  //
697  // TODO: explore if we should use root-device for creating command-lists
698  // as spec says that in that case any sub-device can re-use it: "The
699  // application must only use the command list for the device, or its
700  // sub-devices, which was provided during creation."
701  //
702  std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
704  std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
706 
707  // Retrieves a command list for executing on this device along with
708  // a fence to be used in tracking the execution of this command list.
709  // If a command list has been created on this device which has
710  // completed its commands, then that command list and its associated fence
711  // will be reused. Otherwise, a new command list and fence will be created for
712  // running on this device. L0 fences are created on a L0 command queue so the
713  // caller must pass a command queue to create a new fence for the new command
714  // list if a command list/fence pair is not available. All Command Lists &
715  // associated fences are destroyed at Device Release.
716  // If UseCopyEngine is true, the command will eventually be executed in a
717  // copy engine. Otherwise, the command will be executed in a compute engine.
718  // If AllowBatching is true, then the command list returned may already have
719  // command in it, if AllowBatching is false, any open command lists that
720  // already exist in Queue will be closed and executed.
721  // If ForcedCmdQueue is not nullptr, the resulting command list must be tied
722  // to the contained command queue. This option is ignored if immediate
723  // command lists are used.
724  // When using immediate commandlists, retrieves an immediate command list
725  // for executing on this device. Immediate commandlists are created only
726  // once for each SYCL Queue and after that they are reused.
727  pi_result
728  getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList,
729  bool UseCopyEngine, bool AllowBatching = false,
730  ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
731 
732  // Get index of the free slot in the available pool. If there is no available
733  // pool then create new one. The HostVisible parameter tells if we need a
734  // slot for a host-visible event. The ProfilingEnabled tells is we need a
735  // slot for an event with profiling capabilities.
736  pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &,
737  bool HostVisible,
738  bool ProfilingEnabled);
739 
740  // Decrement number of events living in the pool upon event destroy
741  // and return the pool to the cache if there are no unreleased events.
742  pi_result decrementUnreleasedEventsInPool(pi_event Event);
743 
744  // Store USM allocator context(internal allocator structures)
745  // for USM shared and device allocations. There is 1 allocator context
746  // per each pair of (context, device) per each memory type.
747  std::unordered_map<pi_device, USMAllocContext> DeviceMemAllocContexts;
748  std::unordered_map<pi_device, USMAllocContext> SharedMemAllocContexts;
749  std::unordered_map<pi_device, USMAllocContext> SharedReadOnlyMemAllocContexts;
750 
751  // Since L0 native runtime does not distinguisg "shared device_read_only"
752  // vs regular "shared" allocations, we have keep track of it to use
753  // proper USMAllocContext when freeing allocations.
754  std::unordered_set<void *> SharedReadOnlyAllocs;
755 
756  // Store the host allocator context. It does not depend on any device.
757  std::unique_ptr<USMAllocContext> HostMemAllocContext;
758 
759  // We need to store all memory allocations in the context because there could
760  // be kernels with indirect access. Kernels with indirect access start to
761  // reference all existing memory allocations at the time when they are
762  // submitted to the device. Referenced memory allocations can be released only
763  // when kernel has finished execution.
764  std::unordered_map<void *, MemAllocRecord> MemAllocs;
765 
766  // Get pi_event from cache.
767  pi_event getEventFromCache(bool HostVisible, bool WithProfiling);
768 
769  // Add pi_event to cache.
770  void addEventToCache(pi_event);
771 
772 private:
773  // If context contains one device then return this device.
774  // If context contains sub-devices of the same device, then return this parent
775  // device. Return nullptr if context consists of several devices which are not
776  // sub-devices of the same device. We call returned device the root device of
777  // a context.
778  // TODO: get rid of this when contexts with multiple devices are supported for
779  // images.
780  pi_device getRootDevice() const;
781 
782  // Following member variables are used to manage assignment of events
783  // to event pools.
784  //
785  // TODO: Create pi_event_pool class to encapsulate working with pools.
786  // This will avoid needing the use of maps below, and cleanup the
787  // pi_context overall.
788  //
789 
790  // The cache of event pools from where new events are allocated from.
791  // The head event pool is where the next event would be added to if there
792  // is still some room there. If there is no room in the head then
793  // the following event pool is taken (guranteed to be empty) and made the
794  // head. In case there is no next pool, a new pool is created and made the
795  // head.
796  //
797  // Cache of event pools to which host-visible events are added to.
798  std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{4};
799  auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) {
800  if (HostVisible)
801  return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1];
802  else
803  return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3];
804  }
805 
806  // This map will be used to determine if a pool is full or not
807  // by storing number of empty slots available in the pool.
808  std::unordered_map<ze_event_pool_handle_t, pi_uint32>
809  NumEventsAvailableInEventPool;
810  // This map will be used to determine number of unreleased events in the pool.
811  // We use separate maps for number of event slots available in the pool from
812  // the number of events unreleased in the pool.
813  // This will help when we try to make the code thread-safe.
814  std::unordered_map<ze_event_pool_handle_t, pi_uint32>
815  NumEventsUnreleasedInEventPool;
816 
817  // Mutex to control operations on event pool caches and the helper maps
818  // holding the current pool usage counts.
819  pi_mutex ZeEventPoolCacheMutex;
820 
821  // Mutex to control operations on event caches.
822  pi_mutex EventCacheMutex;
823 
824  // Caches for events.
825  std::vector<std::list<pi_event>> EventCaches{4};
826 
827  // Get the cache of events for a provided scope and profiling mode.
828  auto getEventCache(bool HostVisible, bool WithProfiling) {
829  if (HostVisible)
830  return WithProfiling ? &EventCaches[0] : &EventCaches[1];
831  else
832  return WithProfiling ? &EventCaches[2] : &EventCaches[3];
833  }
834 };
835 
836 struct _pi_queue : _pi_object {
837  _pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
838  std::vector<ze_command_queue_handle_t> &CopyQueues,
839  pi_context Context, pi_device Device, bool OwnZeCommandQueue,
840  pi_queue_properties Properties = 0);
841 
843 
844  // PI queue is in general a one to many mapping to L0 native queues.
847  pi_queue_group_t() = delete;
848 
849  // The Queue argument captures the enclosing PI queue.
850  // The Type argument specifies the type of this queue group.
851  // The actual ZeQueues are populated at PI queue construction.
853  : Queue(Queue), Type(Type) {}
854 
855  // The type of the queue group.
857  bool isCopy() const { return Type != queue_type::Compute; }
858 
859  // Level Zero command queue handles.
860  std::vector<ze_command_queue_handle_t> ZeQueues;
861 
862  // Immediate commandlist handles, one per Level Zero command queue handle.
863  // These are created only once, along with the L0 queues (see above)
864  // and reused thereafter.
865  std::vector<pi_command_list_ptr_t> ImmCmdLists;
866 
867  // Return the index of the next queue to use based on a
868  // round robin strategy and the queue group ordinal.
869  uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex);
870 
871  // Get the ordinal for a command queue handle.
872  int32_t getCmdQueueOrdinal(ze_command_queue_handle_t CmdQueue);
873 
874  // This function will return one of possibly multiple available native
875  // queues and the value of the queue group ordinal.
876  ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal);
877 
878  // This function returns the next immediate commandlist to use.
879  pi_command_list_ptr_t &getImmCmdList();
880 
881  // These indices are to filter specific range of the queues to use,
882  // and to organize round-robin across them.
883  uint32_t UpperIndex{0};
884  uint32_t LowerIndex{0};
885  uint32_t NextIndex{0};
886  };
887 
888  pi_queue_group_t ComputeQueueGroup{this, queue_type::Compute};
889 
890  // Vector of Level Zero copy command command queue handles.
891  // In this vector, main copy engine, if available, come first followed by
892  // link copy engines, if available.
893  pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy};
894 
895  // Wait for all commandlists associated with this Queue to finish operations.
896  pi_result synchronize();
897 
898  pi_queue_group_t &getQueueGroup(bool UseCopyEngine) {
899  return UseCopyEngine ? CopyQueueGroup : ComputeQueueGroup;
900  }
901 
902  // This function considers multiple factors including copy engine
903  // availability and user preference and returns a boolean that is used to
904  // specify if copy engine will eventually be used for a particular command.
905  bool useCopyEngine(bool PreferCopyEngine = true) const;
906 
907  // Keeps the PI context to which this queue belongs.
908  // This field is only set at _pi_queue creation time, and cannot change.
909  // Therefore it can be accessed without holding a lock on this _pi_queue.
911 
912  // Keeps the PI device to which this queue belongs.
913  // This field is only set at _pi_queue creation time, and cannot change.
914  // Therefore it can be accessed without holding a lock on this _pi_queue.
916 
917  // Keeps track of the event associated with the last enqueued command into
918  // this queue. this is used to add dependency with the last command to add
919  // in-order semantics and updated with the latest event each time a new
920  // command is enqueued.
921  pi_event LastCommandEvent = nullptr;
922 
923  // Kernel is not necessarily submitted for execution during
924  // piEnqueueKernelLaunch, it may be batched. That's why we need to save the
925  // list of kernels which is going to be submitted but have not been submitted
926  // yet. This is needed to capture memory allocations for each kernel with
927  // indirect access in the list at the moment when kernel is really submitted
928  // for execution.
929  std::vector<pi_kernel> KernelsToBeSubmitted;
930 
931  // Update map of memory references made by the kernels about to be submitted
932  void CaptureIndirectAccesses();
933 
934  // Indicates if we own the ZeCommandQueue or it came from interop that
935  // asked to not transfer the ownership to SYCL RT.
937 
938  // Map of all command lists used in this queue.
940 
941  // Helper data structure to hold all variables related to batching
942  typedef struct CommandBatch {
943  // These two members are used to keep track of how often the
944  // batching closes and executes a command list before reaching the
945  // QueueComputeBatchSize limit, versus how often we reach the limit.
946  // This info might be used to vary the QueueComputeBatchSize value.
947  pi_uint32 NumTimesClosedEarly = {0};
948  pi_uint32 NumTimesClosedFull = {0};
949 
950  // Open command list fields for batching commands into this queue.
951  pi_command_list_ptr_t OpenCommandList{};
952 
953  // Approximate number of commands that are allowed to be batched for
954  // this queue.
955  // Added this member to the queue rather than using a global variable
956  // so that future implementation could use heuristics to change this on
957  // a queue specific basis. And by putting it in the queue itself, this
958  // is thread safe because of the locking of the queue that occurs.
959  pi_uint32 QueueBatchSize = {0};
960  } command_batch;
961 
962  // ComputeCommandBatch holds data related to batching of non-copy commands.
963  // CopyCommandBatch holds data related to batching of copy commands.
964  command_batch ComputeCommandBatch, CopyCommandBatch;
965 
966  // Returns true if any commands for this queue are allowed to
967  // be batched together.
968  // For copy commands, IsCopy is set to 'true'.
969  // For non-copy commands, IsCopy is set to 'false'.
970  bool isBatchingAllowed(bool IsCopy) const;
971 
972  // Keeps the properties of this queue.
974 
975  // Returns true if the queue is a in-order queue.
976  bool isInOrderQueue() const;
977 
978  // Returns true if the queue has discard events property.
979  bool isDiscardEvents() const;
980 
981  // adjust the queue's batch size, knowing that the current command list
982  // is being closed with a full batch.
983  // For copy commands, IsCopy is set to 'true'.
984  // For non-copy commands, IsCopy is set to 'false'.
985  void adjustBatchSizeForFullBatch(bool IsCopy);
986 
987  // adjust the queue's batch size, knowing that the current command list
988  // is being closed with only a partial batch of commands.
989  // For copy commands, IsCopy is set to 'true'.
990  // For non-copy commands, IsCopy is set to 'false'.
991  void adjustBatchSizeForPartialBatch(bool IsCopy);
992 
993  // Helper function to create a new command-list to this queue and associated
994  // fence tracking its completion. This command list & fence are added to the
995  // map of command lists in this queue with ZeFenceInUse = false.
996  // The caller must hold a lock of the queue already.
997  pi_result
998  createCommandList(bool UseCopyEngine, pi_command_list_ptr_t &CommandList,
999  ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
1000 
1001  // Resets the Command List and Associated fence in the ZeCommandListFenceMap.
1002  // If the reset command list should be made available, then MakeAvailable
1003  // needs to be set to true. The caller must verify that this command list and
1004  // fence have been signalled. The EventListToCleanup contains a list of events
1005  // from the command list which need to be cleaned up.
1006  pi_result resetCommandList(pi_command_list_ptr_t CommandList,
1007  bool MakeAvailable,
1008  std::vector<_pi_event *> &EventListToCleanup);
1009 
1010  // Returns true if an OpenCommandList has commands that need to be submitted.
1011  // If IsCopy is 'true', then the OpenCommandList containing copy commands is
1012  // checked. Otherwise, the OpenCommandList containing compute commands is
1013  // checked.
1014  bool hasOpenCommandList(bool IsCopy) const {
1015  auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch;
1016  return CommandBatch.OpenCommandList != CommandListMap.end();
1017  }
1018  // Attach a command list to this queue.
1019  // For non-immediate commandlist also close and execute it.
1020  // Note that this command list cannot be appended to after this.
1021  // The "IsBlocking" tells if the wait for completion is required.
1022  // If OKToBatchCommand is true, then this command list may be executed
1023  // immediately, or it may be left open for other future command to be
1024  // batched into.
1025  // If IsBlocking is true, then batching will not be allowed regardless
1026  // of the value of OKToBatchCommand
1027  //
1028  // For immediate commandlists, no close and execute is necessary.
1029  pi_result executeCommandList(pi_command_list_ptr_t CommandList,
1030  bool IsBlocking = false,
1031  bool OKToBatchCommand = false);
1032 
1033  // If there is an open command list associated with this queue,
1034  // close it, execute it, and reset the corresponding OpenCommandList.
1035  // If IsCopy is 'true', then the OpenCommandList containing copy commands is
1036  // executed. Otherwise OpenCommandList containing compute commands is
1037  // executed.
1038  pi_result executeOpenCommandList(bool IsCopy);
1039 
1040  // Gets the open command containing the event, or CommandListMap.end()
1041  pi_command_list_ptr_t eventOpenCommandList(pi_event Event);
1042 
1043  // Wrapper function to execute both OpenCommandLists (Copy and Compute).
1044  // This wrapper is helpful when all 'open' commands need to be executed.
1045  // Call-sites instances: piQuueueFinish, piQueueRelease, etc.
1047  using IsCopy = bool;
1048  if (auto Res = executeOpenCommandList(IsCopy{false}))
1049  return Res;
1050  if (auto Res = executeOpenCommandList(IsCopy{true}))
1051  return Res;
1052  return PI_SUCCESS;
1053  }
1054 
1055  // Inserts a barrier waiting for all unfinished events in ActiveBarriers into
1056  // CmdList. Any finished events will be removed from ActiveBarriers.
1057  pi_result insertActiveBarriers(pi_command_list_ptr_t &CmdList,
1058  bool UseCopyEngine);
1059 
1060  // A collection of currently active barriers.
1061  // These should be inserted into a command list whenever an available command
1062  // list is needed for a command.
1063  std::vector<pi_event> ActiveBarriers;
1064 
1065  // Besides each PI object keeping a total reference count in
1066  // _pi_object::RefCount we keep special track of the queue *external*
1067  // references. This way we are able to tell when the queue is being finished
1068  // externally, and can wait for internal references to complete, and do proper
1069  // cleanup of the queue.
1070  // This counter doesn't track the lifetime of a queue object, it only tracks
1071  // the number of external references. I.e. even if it reaches zero a queue
1072  // object may not be destroyed and can be used internally in the plugin.
1073  // That's why we intentionally don't use atomic type for this counter to
1074  // enforce guarding with a mutex all the work involving this counter.
1075  pi_uint32 RefCountExternal{1};
1076 
1077  // Indicates that the queue is healthy and all operations on it are OK.
1078  bool Healthy{true};
1079 };
1080 
1081 struct _pi_mem : _pi_object {
1082  // Keeps the PI context of this memory handle.
1084 
1085  // Enumerates all possible types of accesses.
1086  enum access_mode_t { unknown, read_write, read_only, write_only };
1087 
1088  // Interface of the _pi_mem object
1089 
1090  // Get the Level Zero handle of the current memory object
1091  virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t,
1092  pi_device Device = nullptr) = 0;
1093 
1094  // Get a pointer to the Level Zero handle of the current memory object
1095  virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
1096  pi_device Device = nullptr) = 0;
1097 
1098  // Method to get type of the derived object (image or buffer)
1099  virtual bool isImage() const = 0;
1100 
1101  virtual ~_pi_mem() = default;
1102 
1103 protected:
1104  _pi_mem(pi_context Ctx) : Context{Ctx} {}
1105 };
1106 
1107 struct _pi_buffer;
1109 
1110 struct _pi_buffer final : _pi_mem {
1111  // Buffer constructor
1112  _pi_buffer(pi_context Context, size_t Size, char *HostPtr,
1113  bool ImportedHostPtr = false)
1114  : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} {
1115 
1116  // We treat integrated devices (physical memory shared with the CPU)
1117  // differently from discrete devices (those with distinct memories).
1118  // For integrated devices, allocating the buffer in the host memory
1119  // enables automatic access from the device, and makes copying
1120  // unnecessary in the map/unmap operations. This improves performance.
1121  OnHost = Context->Devices.size() == 1 &&
1122  Context->Devices[0]->ZeDeviceProperties->flags &
1123  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1124 
1125  // Fill the host allocation data.
1126  if (HostPtr) {
1127  MapHostPtr = HostPtr;
1128  // If this host ptr is imported to USM then use this as a host
1129  // allocation for this buffer.
1130  if (ImportedHostPtr) {
1131  Allocations[nullptr].ZeHandle = HostPtr;
1132  Allocations[nullptr].Valid = true;
1133  Allocations[nullptr].ReleaseAction = _pi_buffer::allocation_t::unimport;
1134  }
1135  }
1136 
1137  // Make first device in the context be the master. Mark that
1138  // allocation (yet to be made) having "valid" data. And real
1139  // allocation and initialization should follow the buffer
1140  // construction with a "write_only" access copy.
1141  LastDeviceWithValidAllocation = Context->Devices[0];
1142  Allocations[LastDeviceWithValidAllocation].Valid = true;
1143  }
1144 
1145  // Sub-buffer constructor
1146  _pi_buffer(pi_buffer Parent, size_t Origin, size_t Size)
1147  : _pi_mem(Parent->Context), Size(Size), SubBuffer{Parent, Origin} {}
1148 
1149  // Interop-buffer constructor
1151  char *ZeMemHandle, bool OwnZeMemHandle)
1152  : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} {
1153 
1154  // Device == nullptr means host allocation
1155  Allocations[Device].ZeHandle = ZeMemHandle;
1156  Allocations[Device].Valid = true;
1157  Allocations[Device].ReleaseAction =
1158  OwnZeMemHandle ? allocation_t::free_native : allocation_t::keep;
1159 
1160  // Check if this buffer can always stay on host
1161  OnHost = false;
1162  if (!Device) { // Host allocation
1163  if (Context->Devices.size() == 1 &&
1164  Context->Devices[0]->ZeDeviceProperties->flags &
1165  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) {
1166  OnHost = true;
1167  MapHostPtr = ZeMemHandle; // map to this allocation
1168  }
1169  }
1170  LastDeviceWithValidAllocation = Device;
1171  }
1172 
1173  // Returns a pointer to the USM allocation representing this PI buffer
1174  // on the specified Device. If Device is nullptr then the returned
1175  // USM allocation is on the device where this buffer was used the latest.
1176  // The returned allocation is always valid, i.e. its contents is
1177  // up-to-date and any data copies needed for that are performed under
1178  // the hood.
1179  //
1180  virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t,
1181  pi_device Device = nullptr) override;
1182  virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
1183  pi_device Device = nullptr) override;
1184 
1185  bool isImage() const override { return false; }
1186 
1187  bool isSubBuffer() const { return SubBuffer.Parent != nullptr; }
1188 
1189  // Frees all allocations made for the buffer.
1190  pi_result free();
1191 
1192  // Information about a single allocation representing this buffer.
1193  struct allocation_t {
1194  // Level Zero memory handle is really just a naked pointer.
1195  // It is just convenient to have it char * to simplify offset arithmetics.
1196  char *ZeHandle{nullptr};
1197  // Indicates if this allocation's data is valid.
1198  bool Valid{false};
1199  // Specifies the action that needs to be taken for this
1200  // allocation at buffer destruction.
1201  enum {
1202  keep, // do nothing, the allocation is not owned by us
1203  unimport, // release of the imported allocation
1204  free, // free from the pooling context (default)
1205  free_native // free with a native call
1206  } ReleaseAction{free};
1207  };
1208 
1209  // We maintain multiple allocations on possibly all devices in the context.
1210  // The "nullptr" device identifies a host allocation representing buffer.
1211  // Sub-buffers don't maintain own allocations but rely on parent buffer.
1212  std::unordered_map<pi_device, allocation_t> Allocations;
1213  pi_device LastDeviceWithValidAllocation{nullptr};
1214 
1215  // Flag to indicate that this memory is allocated in host memory.
1216  // Integrated device accesses this memory.
1217  bool OnHost{false};
1218 
1219  // Tells the host allocation to use for buffer map operations.
1220  char *MapHostPtr{nullptr};
1221 
1222  // Supplementary data to keep track of the mappings of this buffer
1223  // created with piEnqueueMemBufferMap.
1224  struct Mapping {
1225  // The offset in the buffer giving the start of the mapped region.
1226  size_t Offset;
1227  // The size of the mapped region.
1228  size_t Size;
1229  };
1230 
1231  // The key is the host pointer representing an active mapping.
1232  // The value is the information needed to maintain/undo the mapping.
1233  std::unordered_map<void *, Mapping> Mappings;
1234 
1235  // The size and alignment of the buffer
1236  size_t Size;
1237  size_t getAlignment() const;
1238 
1239  struct {
1241  size_t Origin; // only valid if Parent != nullptr
1242  } SubBuffer;
1243 };
1244 
1245 // TODO: add proper support for images on context with multiple devices.
1246 struct _pi_image final : _pi_mem {
1247  // Image constructor
1249  : _pi_mem(Ctx), ZeImage{Image} {}
1250 
1251  virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t,
1252  pi_device = nullptr) override {
1253  ZeHandle = pi_cast<char *>(ZeImage);
1254  return PI_SUCCESS;
1255  }
1256  virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
1257  pi_device = nullptr) override {
1258  ZeHandlePtr = pi_cast<char **>(&ZeImage);
1259  return PI_SUCCESS;
1260  }
1261 
1262  bool isImage() const override { return true; }
1263 
1264 #ifndef NDEBUG
1265  // Keep the descriptor of the image (for debugging purposes)
1267 #endif // !NDEBUG
1268 
1269  // Level Zero image handle.
1271 };
1272 
1274  // List of level zero events for this event list.
1275  ze_event_handle_t *ZeEventList = {nullptr};
1276 
1277  // List of pi_events for this event list.
1278  pi_event *PiEventList = {nullptr};
1279 
1280  // length of both the lists. The actual allocation of these lists
1281  // may be longer than this length. This length is the actual number
1282  // of elements in the above arrays that are valid.
1283  pi_uint32 Length = {0};
1284 
1285  // A mutex is needed for destroying the event list.
1286  // Creation is already thread-safe because we only create the list
1287  // when an event is initially created. However, it might be
1288  // possible to have multiple threads racing to destroy the list,
1289  // so this will be used to make list destruction thread-safe.
1291 
1292  // Initialize this using the array of events in EventList, and retain
1293  // all the pi_events in the created data structure.
1294  // CurQueue is the pi_queue that the command with this event wait
1295  // list is going to be added to. That is needed to flush command
1296  // batches for wait events that are in other queues.
1297  // UseCopyEngine indicates if the next command (the one that this
1298  // event wait-list is for) is going to go to copy or compute
1299  // queue. This is used to properly submit the dependent open
1300  // command-lists.
1301  pi_result createAndRetainPiZeEventList(pi_uint32 EventListLength,
1302  const pi_event *EventList,
1303  pi_queue CurQueue, bool UseCopyEngine);
1304 
1305  // Add all the events in this object's PiEventList to the end
1306  // of the list EventsToBeReleased. Destroy pi_ze_event_list_t data
1307  // structure fields making it look empty.
1308  pi_result collectEventsForReleaseAndDestroyPiZeEventList(
1309  std::list<pi_event> &EventsToBeReleased);
1310 
1311  // Had to create custom assignment operator because the mutex is
1312  // not assignment copyable. Just field by field copy of the other
1313  // fields.
1315  this->ZeEventList = other.ZeEventList;
1316  this->PiEventList = other.PiEventList;
1317  this->Length = other.Length;
1318  return *this;
1319  }
1320 };
1321 
1322 struct _pi_event : _pi_object {
1323  _pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool,
1324  pi_context Context, pi_command_type CommandType, bool OwnZeEvent)
1325  : ZeEvent{ZeEvent}, OwnZeEvent{OwnZeEvent}, ZeEventPool{ZeEventPool},
1326  CommandType{CommandType}, Context{Context}, CommandData{nullptr} {}
1327 
1328  // Level Zero event handle.
1330 
1331  // Indicates if we own the ZeEvent or it came from interop that
1332  // asked to not transfer the ownership to SYCL RT.
1334 
1335  // Level Zero event pool handle.
1336  ze_event_pool_handle_t ZeEventPool;
1337 
1338  // In case we use device-only events this holds their host-visible
1339  // counterpart. If this event is itself host-visble then HostVisibleEvent
1340  // points to this event. If this event is not host-visible then this field can
1341  // be: 1) null, meaning that a host-visible event wasn't yet created 2) a PI
1342  // event created internally that host will actually be redirected
1343  // to wait/query instead of this PI event.
1344  //
1345  // The HostVisibleEvent is a reference counted PI event and can be used more
1346  // than by just this one event, depending on the mode (see EventsScope).
1347  //
1348  pi_event HostVisibleEvent = {nullptr};
1349  bool isHostVisible() const { return this == HostVisibleEvent; }
1350 
1351  // Get the host-visible event or create one and enqueue its signal.
1352  pi_result getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
1353 
1354  // Tells if this event is with profiling capabilities.
1355  bool isProfilingEnabled() const {
1356  return !Queue || // tentatively assume user events are profiling enabled
1357  (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0;
1358  }
1359 
1360  // Keeps the command-queue and command associated with the event.
1361  // These are NULL for the user events.
1362  pi_queue Queue = {nullptr};
1364  // Provide direct access to Context, instead of going via queue.
1365  // Not every PI event has a queue, and we need a handle to Context
1366  // to get to event pool related information.
1368 
1369  // Opaque data to hold any data needed for CommandType.
1371 
1372  // List of events that were in the wait list of the command that will
1373  // signal this event. These events must be retained when the command is
1374  // enqueued, and must then be released when this event has signalled.
1375  // This list must be destroyed once the event has signalled.
1377 
1378  // Tracks if the needed cleanup was already performed for
1379  // a completed event. This allows to control that some cleanup
1380  // actions are performed only once.
1381  //
1382  bool CleanedUp = {false};
1383 
1384  // Indicates that this PI event had already completed in the sense
1385  // that no other synchromization is needed. Note that the underlying
1386  // L0 event (if any) is not guranteed to have been signalled, or
1387  // being visible to the host at all.
1388  bool Completed = {false};
1389 
1390  // Besides each PI object keeping a total reference count in
1391  // _pi_object::RefCount we keep special track of the event *external*
1392  // references. This way we are able to tell when the event is not referenced
1393  // externally anymore, i.e. it can't be passed as a dependency event to
1394  // piEnqueue* functions and explicitly waited meaning that we can do some
1395  // optimizations:
1396  // 1. For in-order queues we can reset and reuse event even if it was not yet
1397  // completed by submitting a reset command to the queue (since there are no
1398  // external references, we know that nobody can wait this event somewhere in
1399  // parallel thread or pass it as a dependency which may lead to hang)
1400  // 2. We can avoid creating host proxy event.
1401  // This counter doesn't track the lifetime of an event object. Even if it
1402  // reaches zero an event object may not be destroyed and can be used
1403  // internally in the plugin.
1404  std::atomic<pi_uint32> RefCountExternal{0};
1405 
1406  bool hasExternalRefs() { return RefCountExternal != 0; }
1407 
1408  // Reset _pi_event object.
1409  pi_result reset();
1410 };
1411 
1412 struct _pi_program : _pi_object {
1413  // Possible states of a program.
1414  typedef enum {
1415  // The program has been created from intermediate language (SPIR-V), but it
1416  // is not yet compiled.
1418 
1419  // The program has been created by loading native code, but it has not yet
1420  // been built. This is equivalent to an OpenCL "program executable" that
1421  // is loaded via clCreateProgramWithBinary().
1423 
1424  // The program was notionally compiled from SPIR-V form. However, since we
1425  // postpone compilation until the module is linked, the internal state
1426  // still represents the module as SPIR-V.
1428 
1429  // The program has been built or linked, and it is represented as a Level
1430  // Zero module.
1432 
1433  // An error occurred during piProgramLink, but we created a _pi_program
1434  // object anyways in order to hold the ZeBuildLog. Note that the ZeModule
1435  // may or may not be nullptr in this state, depending on the error.
1436  Invalid
1437  } state;
1438 
1439  // A utility class that converts specialization constants into the form
1440  // required by the Level Zero driver.
1442  public:
1444  ZeSpecConstants.numConstants = Program->SpecConstants.size();
1445  ZeSpecContantsIds.reserve(ZeSpecConstants.numConstants);
1446  ZeSpecContantsValues.reserve(ZeSpecConstants.numConstants);
1447 
1448  for (auto &SpecConstant : Program->SpecConstants) {
1449  ZeSpecContantsIds.push_back(SpecConstant.first);
1450  ZeSpecContantsValues.push_back(SpecConstant.second);
1451  }
1452  ZeSpecConstants.pConstantIds = ZeSpecContantsIds.data();
1453  ZeSpecConstants.pConstantValues = ZeSpecContantsValues.data();
1454  }
1455 
1456  const ze_module_constants_t *ze() { return &ZeSpecConstants; }
1457 
1458  private:
1459  std::vector<uint32_t> ZeSpecContantsIds;
1460  std::vector<const void *> ZeSpecContantsValues;
1461  ze_module_constants_t ZeSpecConstants;
1462  };
1463 
1464  // Construct a program in IL or Native state.
1465  _pi_program(state St, pi_context Context, const void *Input, size_t Length)
1466  : Context{Context},
1467  OwnZeModule{true}, State{St}, Code{new uint8_t[Length]},
1468  CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} {
1469  std::memcpy(Code.get(), Input, Length);
1470  }
1471 
1472  // Construct a program in Exe or Invalid state.
1474  ze_module_build_log_handle_t ZeBuildLog)
1475  : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule},
1476  ZeBuildLog{ZeBuildLog} {}
1477 
1478  // Construct a program in Exe state (interop).
1480  bool OwnZeModule)
1481  : Context{Context}, OwnZeModule{OwnZeModule}, State{St},
1482  ZeModule{ZeModule}, ZeBuildLog{nullptr} {}
1483 
1484  // Construct a program in Invalid state with a custom error message.
1486  : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage},
1487  State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {}
1488 
1489  ~_pi_program();
1490 
1491  const pi_context Context; // Context of the program.
1492 
1493  // Indicates if we own the ZeModule or it came from interop that
1494  // asked to not transfer the ownership to SYCL RT.
1495  const bool OwnZeModule;
1496 
1497  // This error message is used only in Invalid state to hold a custom error
1498  // message from a call to piProgramLink.
1499  const std::string ErrorMessage;
1500 
1502 
1503  // In IL and Object states, this contains the SPIR-V representation of the
1504  // module. In Native state, it contains the native code.
1505  std::unique_ptr<uint8_t[]> Code; // Array containing raw IL / native code.
1506  size_t CodeLength; // Size (bytes) of the array.
1507 
1508  // Used only in IL and Object states. Contains the SPIR-V specialization
1509  // constants as a map from the SPIR-V "SpecID" to a buffer that contains the
1510  // associated value. The caller of the PI layer is responsible for
1511  // maintaining the storage of this buffer.
1512  std::unordered_map<uint32_t, const void *> SpecConstants;
1513 
1514  // Used only in Object state. Contains the build flags from the last call to
1515  // piProgramCompile().
1516  std::string BuildFlags;
1517 
1518  // The Level Zero module handle. Used primarily in Exe state.
1520 
1521  // The Level Zero build log from the last call to zeModuleCreate().
1522  ze_module_build_log_handle_t ZeBuildLog;
1523 };
1524 
1525 struct _pi_kernel : _pi_object {
1526  _pi_kernel(ze_kernel_handle_t Kernel, bool OwnZeKernel, pi_program Program)
1527  : ZeKernel{Kernel}, OwnZeKernel{OwnZeKernel}, Program{Program},
1528  MemAllocs{}, SubmissionsCount{0} {}
1529 
1530  // Completed initialization of PI kernel. Must be called after construction.
1532 
1533  // Returns true if kernel has indirect access, false otherwise.
1535  // Currently indirect access flag is set for all kernels and there is no API
1536  // to check if kernel actually indirectly access smth.
1537  return true;
1538  }
1539 
1540  // Level Zero function handle.
1542 
1543  // Indicates if we own the ZeKernel or it came from interop that
1544  // asked to not transfer the ownership to SYCL RT.
1546 
1547  // Keep the program of the kernel.
1549 
1550  // Hash function object for the unordered_set below.
1551  struct Hash {
1552  size_t operator()(const std::pair<void *const, MemAllocRecord> *P) const {
1553  return std::hash<void *>()(P->first);
1554  }
1555  };
1556 
1557  // If kernel has indirect access we need to make a snapshot of all existing
1558  // memory allocations to defer deletion of these memory allocations to the
1559  // moment when kernel execution has finished.
1560  // We store pointers to the elements because pointers are not invalidated by
1561  // insert/delete for std::unordered_map (iterators are invalidated). We need
1562  // to take a snapshot instead of just reference-counting the allocations,
1563  // because picture of active allocations can change during kernel execution
1564  // (new allocations can be added) and we need to know which memory allocations
1565  // were retained by this kernel to release them (and don't touch new
1566  // allocations) at kernel completion. Same kernel may be submitted several
1567  // times and retained allocations may be different at each submission. That's
1568  // why we have a set of memory allocations here and increase ref count only
1569  // once even if kernel is submitted many times. We don't want to know how many
1570  // times and which allocations were retained by each submission. We release
1571  // all allocations in the set only when SubmissionsCount == 0.
1572  std::unordered_set<std::pair<void *const, MemAllocRecord> *, Hash> MemAllocs;
1573 
1574  // Counter to track the number of submissions of the kernel.
1575  // When this value is zero, it means that kernel is not submitted for an
1576  // execution - at this time we can release memory allocations referenced by
1577  // this kernel. We can do this when RefCount turns to 0 but it is too late
1578  // because kernels are cached in the context by SYCL RT and they are released
1579  // only during context object destruction. Regular RefCount is not usable to
1580  // track submissions because user/SYCL RT can retain kernel object any number
1581  // of times. And that's why there is no value of RefCount which can mean zero
1582  // submissions.
1583  std::atomic<pi_uint32> SubmissionsCount;
1584 
1585  // Keeps info about an argument to the kernel enough to set it with
1586  // zeKernelSetArgumentValue.
1587  struct ArgumentInfo {
1588  uint32_t Index;
1589  size_t Size;
1590  const pi_mem Value;
1592  };
1593  // Arguments that still need to be set (with zeKernelSetArgumentValue)
1594  // before kernel is enqueued.
1595  std::vector<ArgumentInfo> PendingArguments;
1596 
1597  // Cache of the kernel properties.
1600 };
1601 
1602 struct _pi_sampler : _pi_object {
1603  _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {}
1604 
1605  // Level Zero sampler handle.
1606  ze_sampler_handle_t ZeSampler;
1607 };
1608 
1609 #endif // PI_LEVEL_ZERO_HPP
USMDeviceMemoryAlloc::USMDeviceMemoryAlloc
USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:452
_pi_device::RootDevice
const pi_device RootDevice
Definition: pi_level_zero.hpp:561
_pi_object::RefCount
std::atomic< pi_uint32 > RefCount
Definition: pi_esimd_emulator.hpp:62
_pi_event::hasExternalRefs
bool hasExternalRefs()
Definition: pi_level_zero.hpp:1406
MemAllocRecord::Context
pi_context Context
Definition: pi_level_zero.hpp:352
_pi_event::isProfilingEnabled
bool isProfilingEnabled() const
Definition: pi_level_zero.hpp:1355
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:222
ze_image_handle_t
struct _ze_image_handle_t * ze_image_handle_t
Definition: backend_traits_level_zero.hpp:34
USMSharedReadOnlyMemoryAlloc::USMSharedReadOnlyMemoryAlloc
USMSharedReadOnlyMemoryAlloc(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:440
getZeStructureType< ze_module_desc_t >
ze_structure_type_t getZeStructureType< ze_module_desc_t >()
Definition: pi_level_zero.hpp:104
_pi_buffer::allocation_t::keep
@ keep
Definition: pi_level_zero.hpp:1202
EventsScope
EventsScope
Definition: pi_level_zero.hpp:467
_pi_device::queue_group_info_t::type
type
Definition: pi_level_zero.hpp:495
_pi_context::ZeCommandListInit
ze_command_list_handle_t ZeCommandListInit
Definition: pi_level_zero.hpp:684
_pi_device::_pi_device
_pi_device(ze_device_handle_t Device, pi_platform Plt, pi_device ParentDevice=nullptr)
Definition: pi_level_zero.hpp:483
_pi_context::DeviceMemAllocContexts
std::unordered_map< pi_device, USMAllocContext > DeviceMemAllocContexts
Definition: pi_level_zero.hpp:747
pi_shared_mutex::try_lock
bool try_lock()
Definition: pi_level_zero.hpp:210
pi.h
_pi_buffer::isImage
bool isImage() const override
Definition: pi_level_zero.hpp:1185
getZeStructureType< ze_device_memory_properties_t >
ze_structure_type_t getZeStructureType< ze_device_memory_properties_t >()
Definition: pi_level_zero.hpp:147
AllHostVisible
@ AllHostVisible
Definition: pi_level_zero.hpp:469
_pi_program::_pi_program
_pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, ze_module_build_log_handle_t ZeBuildLog)
Definition: pi_level_zero.hpp:1473
_pi_program::ZeBuildLog
ze_module_build_log_handle_t ZeBuildLog
Definition: pi_level_zero.hpp:1522
_pi_queue::pi_queue_group_t::isCopy
bool isCopy() const
Definition: pi_level_zero.hpp:857
_pi_buffer::Parent
_pi_mem * Parent
Definition: pi_level_zero.hpp:1240
die
void die(const char *Message)
Definition: pi_level_zero.hpp:64
_pi_event::CommandType
pi_command_type CommandType
Definition: pi_level_zero.hpp:1363
_pi_platform::_pi_platform
_pi_platform(ze_driver_handle_t Driver)
Definition: pi_level_zero.hpp:363
T
_pi_context::getPlatform
pi_platform getPlatform() const
Definition: pi_level_zero.hpp:642
LastCommandInBatchHostVisible
@ LastCommandInBatchHostVisible
Definition: pi_level_zero.hpp:479
ErrorMessage
thread_local char ErrorMessage[MaxMessageSize]
Definition: pi_esimd_emulator.cpp:154
ze_module_handle_t
struct _ze_module_handle_t * ze_module_handle_t
Definition: backend_traits_level_zero.hpp:36
_pi_platform::PiDevicesCacheMutex
pi_shared_mutex PiDevicesCacheMutex
Definition: pi_level_zero.hpp:381
_pi_program::Object
@ Object
Definition: pi_level_zero.hpp:1427
_pi_device::hasCopyEngine
bool hasCopyEngine() const
Definition: pi_level_zero.hpp:531
_pi_image::ZeImageDesc
ZeStruct< ze_image_desc_t > ZeImageDesc
Definition: pi_level_zero.hpp:1266
_pi_event::ZeEventPool
ze_event_pool_handle_t ZeEventPool
Definition: pi_level_zero.hpp:1336
_pi_platform::initialize
pi_result initialize()
Definition: pi_level_zero.cpp:2183
USMSharedMemoryAlloc
Definition: pi_level_zero.hpp:421
_pi_kernel::ZeKernelName
ZeCache< std::string > ZeKernelName
Definition: pi_level_zero.hpp:1599
getZeStructureType< ze_module_properties_t >
ze_structure_type_t getZeStructureType< ze_module_properties_t >()
Definition: pi_level_zero.hpp:154
_pi_program::ErrorMessage
const std::string ErrorMessage
Definition: pi_level_zero.hpp:1499
_pi_buffer
Definition: pi_esimd_emulator.hpp:179
USMMemoryAllocBase::USMMemoryAllocBase
USMMemoryAllocBase(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:412
_pi_image
Definition: pi_esimd_emulator.hpp:188
_pi_image::ZeImage
ze_image_handle_t ZeImage
Definition: pi_level_zero.hpp:1270
_pi_buffer::isSubBuffer
bool isSubBuffer() const
Definition: pi_level_zero.hpp:1187
getZeStructureType
ze_structure_type_t getZeStructureType()
SystemMemory
Definition: usm_allocator.hpp:17
getZeStructureType< ze_context_desc_t >
ze_structure_type_t getZeStructureType< ze_context_desc_t >()
Definition: pi_level_zero.hpp:83
getZeStructureType< ze_sampler_desc_t >
ze_structure_type_t getZeStructureType< ze_sampler_desc_t >()
Definition: pi_level_zero.hpp:117
_pi_queue::pi_queue_group_t::pi_queue_group_t
pi_queue_group_t(pi_queue Queue, queue_type Type)
Definition: pi_level_zero.hpp:852
USMMemoryAllocBase
Definition: pi_level_zero.hpp:401
_pi_context::SharedMemAllocContexts
std::unordered_map< pi_device, USMAllocContext > SharedMemAllocContexts
Definition: pi_level_zero.hpp:748
_pi_program::_pi_program
_pi_program(state St, pi_context Context, const void *Input, size_t Length)
Definition: pi_level_zero.hpp:1465
getZeStructureType< ze_driver_properties_t >
ze_structure_type_t getZeStructureType< ze_driver_properties_t >()
Definition: pi_level_zero.hpp:120
ReferenceCounter::reset
void reset()
Definition: pi_level_zero.hpp:288
_pi_kernel::Hash
Definition: pi_level_zero.hpp:1551
_pi_object::Mutex
pi_shared_mutex Mutex
Definition: pi_level_zero.hpp:339
USMDeviceMemoryAlloc
Definition: pi_level_zero.hpp:445
pi_shared_mutex
Definition: pi_level_zero.hpp:204
ze_command_queue_handle_t
struct _ze_command_queue_handle_t * ze_command_queue_handle_t
Definition: backend_traits_level_zero.hpp:29
_pi_context::ZeComputeCommandListCache
std::unordered_map< ze_device_handle_t, std::list< ze_command_list_handle_t > > ZeComputeCommandListCache
Definition: pi_level_zero.hpp:703
USMMemoryAllocBase::getMemTypeImpl
virtual MemType getMemTypeImpl()=0
_pi_kernel::Program
pi_program Program
Definition: pi_level_zero.hpp:1548
_pi_kernel::OwnZeKernel
bool OwnZeKernel
Definition: pi_level_zero.hpp:1545
_pi_device::ZeDeviceCacheProperties
ZeCache< ZeStruct< ze_device_cache_properties_t > > ZeDeviceCacheProperties
Definition: pi_level_zero.hpp:584
getZeStructureType< ze_device_properties_t >
ze_structure_type_t getZeStructureType< ze_device_properties_t >()
Definition: pi_level_zero.hpp:123
_pi_program::_pi_program
_pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, bool OwnZeModule)
Definition: pi_level_zero.hpp:1479
ze_event_handle_t
struct _ze_event_handle_t * ze_event_handle_t
Definition: backend_traits_level_zero.hpp:33
_pi_result
_pi_result
Definition: pi.h:110
_pi_queue::pi_queue_group_t::ZeQueues
std::vector< ze_command_queue_handle_t > ZeQueues
Definition: pi_level_zero.hpp:860
ZesStruct::ZesStruct
ZesStruct()
Definition: pi_level_zero.hpp:186
_pi_queue::Device
const pi_device Device
Definition: pi_level_zero.hpp:915
_pi_device::ZeDeviceMemoryProperties
ZeCache< std::vector< ZeStruct< ze_device_memory_properties_t > > > ZeDeviceMemoryProperties
Definition: pi_level_zero.hpp:581
_pi_program::SpecConstants
std::unordered_map< uint32_t, const void * > SpecConstants
Definition: pi_level_zero.hpp:1512
_pi_event::CommandData
void * CommandData
Definition: pi_level_zero.hpp:1370
_pi_buffer::_pi_buffer
_pi_buffer(pi_context Context, size_t Size, char *HostPtr, bool ImportedHostPtr=false)
Definition: pi_level_zero.hpp:1112
_pi_device::hasMainCopyEngine
bool hasMainCopyEngine() const
Definition: pi_level_zero.hpp:521
getZeStructureType< ze_command_queue_group_properties_t >
ze_structure_type_t getZeStructureType< ze_command_queue_group_properties_t >()
Definition: pi_level_zero.hpp:131
_pi_sampler::_pi_sampler
_pi_sampler(ze_sampler_handle_t Sampler)
Definition: pi_level_zero.hpp:1603
_pi_device::ZeDevice
const ze_device_handle_t ZeDevice
Definition: pi_level_zero.hpp:545
ze_device_handle_t
struct _ze_device_handle_t * ze_device_handle_t
Definition: backend_traits_level_zero.hpp:31
_pi_platform::ContextsMutex
pi_shared_mutex ContextsMutex
Definition: pi_level_zero.hpp:397
usm_allocator.hpp
_pi_context::HostMemAllocContext
std::unique_ptr< USMAllocContext > HostMemAllocContext
Definition: pi_level_zero.hpp:757
_pi_context::SharedReadOnlyMemAllocContexts
std::unordered_map< pi_device, USMAllocContext > SharedReadOnlyMemAllocContexts
Definition: pi_level_zero.hpp:749
_pi_context::OwnZeContext
bool OwnZeContext
Definition: pi_level_zero.hpp:652
_pi_queue::CommandBatch
Definition: pi_level_zero.hpp:942
getZeStructureType< ze_relaxed_allocation_limits_exp_desc_t >
ze_structure_type_t getZeStructureType< ze_relaxed_allocation_limits_exp_desc_t >()
Definition: pi_level_zero.hpp:88
MemType
MemType
Definition: usm_allocator.hpp:14
_pi_image::_pi_image
_pi_image(pi_context Ctx, ze_image_handle_t Image)
Definition: pi_level_zero.hpp:1248
getZeStructureType< ze_kernel_properties_t >
ze_structure_type_t getZeStructureType< ze_kernel_properties_t >()
Definition: pi_level_zero.hpp:157
_pi_program::ZeModule
ze_module_handle_t ZeModule
Definition: pi_level_zero.hpp:1519
pi_command_list_map_t
std::unordered_map< ze_command_list_handle_t, pi_command_list_info_t > pi_command_list_map_t
Definition: pi_level_zero.hpp:621
ZeCache::ZeCache
ZeCache()
Definition: pi_level_zero.hpp:262
_pi_buffer::Mapping::Size
size_t Size
Definition: pi_level_zero.hpp:1228
USMMemoryAllocBase::deallocate
void deallocate(void *Ptr, bool OwnZeMemHandle) override final
Definition: pi_level_zero.cpp:7910
_pi_queue::pi_queue_group_t::ImmCmdLists
std::vector< pi_command_list_ptr_t > ImmCmdLists
Definition: pi_level_zero.hpp:865
_pi_context::ZeContext
const ze_context_handle_t ZeContext
Definition: pi_level_zero.hpp:648
_pi_platform
A PI platform stores all known PI devices, in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:73
_pi_ze_event_list_t::PiEventList
pi_event * PiEventList
Definition: pi_level_zero.hpp:1278
sycl::_V1::detail::memcpy
void memcpy(void *Dst, const void *Src, std::size_t Size)
_pi_ze_event_list_t::ZeEventList
ze_event_handle_t * ZeEventList
Definition: pi_level_zero.hpp:1275
MemAllocRecord::OwnZeMemHandle
bool OwnZeMemHandle
Definition: pi_level_zero.hpp:356
_pi_kernel::SubmissionsCount
std::atomic< pi_uint32 > SubmissionsCount
Definition: pi_level_zero.hpp:1583
_pi_device::queue_group_info_t::Compute
@ Compute
Definition: pi_level_zero.hpp:498
_pi_kernel::ArgumentInfo::Index
uint32_t Index
Definition: pi_level_zero.hpp:1588
pi_mutex
Definition: pi_level_zero.hpp:234
_pi_mem::unknown
@ unknown
Definition: pi_level_zero.hpp:1086
OnDemandHostVisibleProxy
@ OnDemandHostVisibleProxy
Definition: pi_level_zero.hpp:474
_pi_object
Definition: pi_esimd_emulator.hpp:59
_pi_queue::CopyCommandBatch
command_batch CopyCommandBatch
Definition: pi_level_zero.hpp:964
_pi_device::ZeDeviceProperties
ZeCache< ZeStruct< ze_device_properties_t > > ZeDeviceProperties
Definition: pi_level_zero.hpp:576
_pi_queue::executeAllOpenCommandLists
pi_result executeAllOpenCommandLists()
Definition: pi_level_zero.hpp:1046
ReferenceCounter::load
pi_uint32 load()
Definition: pi_level_zero.hpp:295
getZeStructureType< ze_device_mem_alloc_desc_t >
ze_structure_type_t getZeStructureType< ze_device_mem_alloc_desc_t >()
Definition: pi_level_zero.hpp:95
_pi_device::ZeDeviceImageProperties
ZeCache< ZeStruct< ze_device_image_properties_t > > ZeDeviceImageProperties
Definition: pi_level_zero.hpp:578
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:774
ZeCache::ZeCacheMutex
pi_mutex ZeCacheMutex
Definition: pi_level_zero.hpp:260
ZeCache::InitFunctionType
std::function< void(T &)> InitFunctionType
Definition: pi_level_zero.hpp:257
_pi_event::isHostVisible
bool isHostVisible() const
Definition: pi_level_zero.hpp:1349
getZeStructureType< ze_kernel_desc_t >
ze_structure_type_t getZeStructureType< ze_kernel_desc_t >()
Definition: pi_level_zero.hpp:111
ZeCache::operator->
T * operator->()
Definition: pi_level_zero.hpp:265
_pi_ze_event_list_t::operator=
_pi_ze_event_list_t & operator=(const _pi_ze_event_list_t &other)
Definition: pi_level_zero.hpp:1314
_pi_kernel::ArgumentInfo
Definition: pi_level_zero.hpp:1587
_pi_platform::ZeDriver
ze_driver_handle_t ZeDriver
Definition: pi_level_zero.hpp:369
_pi_buffer::_pi_buffer
_pi_buffer(pi_buffer Parent, size_t Origin, size_t Size)
Definition: pi_level_zero.hpp:1146
_pi_device::hasLinkCopyEngine
bool hasLinkCopyEngine() const
Definition: pi_level_zero.hpp:526
_pi_buffer::allocation_t
Definition: pi_level_zero.hpp:1193
_pi_mem::_pi_mem
_pi_mem(pi_context Ctx)
Definition: pi_level_zero.hpp:1104
_pi_image::getZeHandlePtr
virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, pi_device=nullptr) override
Definition: pi_level_zero.hpp:1256
_pi_ze_event_list_t::PiZeEventListMutex
pi_mutex PiZeEventListMutex
Definition: pi_level_zero.hpp:1290
_pi_queue
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:393
_pi_platform::ZeApiVersion
ze_api_version_t ZeApiVersion
Definition: pi_level_zero.hpp:374
_pi_device::queue_group_info_t
Definition: pi_level_zero.hpp:494
pi_uint32
uint32_t pi_uint32
Definition: pi.h:99
USMMemoryAllocBase::allocate
void * allocate(size_t Size) override final
Definition: pi_level_zero.cpp:7889
USMHostMemoryAlloc
Definition: pi_level_zero.hpp:457
getZeStructureType< ze_device_module_properties_t >
ze_structure_type_t getZeStructureType< ze_device_module_properties_t >()
Definition: pi_level_zero.hpp:139
_pi_device::ZeDeviceModuleProperties
ZeCache< ZeStruct< ze_device_module_properties_t > > ZeDeviceModuleProperties
Definition: pi_level_zero.hpp:579
_pi_buffer::Mapping::Offset
size_t Offset
Definition: pi_level_zero.hpp:1226
getZeStructureType< ze_command_queue_desc_t >
ze_structure_type_t getZeStructureType< ze_command_queue_desc_t >()
Definition: pi_level_zero.hpp:98
getZeStructureType< ze_event_desc_t >
ze_structure_type_t getZeStructureType< ze_event_desc_t >()
Definition: pi_level_zero.hpp:114
_pi_image::getZeHandle
virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, pi_device=nullptr) override
Definition: pi_level_zero.hpp:1251
_pi_platform::DeviceCachePopulated
bool DeviceCachePopulated
Definition: pi_esimd_emulator.hpp:72
USMHostMemoryAlloc::USMHostMemoryAlloc
USMHostMemoryAlloc(pi_context Ctx)
Definition: pi_level_zero.hpp:464
_pi_platform::zeDriverExtensionMap
std::unordered_map< std::string, uint32_t > zeDriverExtensionMap
Definition: pi_level_zero.hpp:377
_pi_queue::Properties
pi_queue_properties Properties
Definition: pi_level_zero.hpp:973
_pi_context::ZeCommandListCacheMutex
pi_mutex ZeCommandListCacheMutex
Definition: pi_level_zero.hpp:693
_pi_buffer::Origin
size_t Origin
Definition: pi_level_zero.hpp:1241
_pi_kernel::_pi_kernel
_pi_kernel(ze_kernel_handle_t Kernel, bool OwnZeKernel, pi_program Program)
Definition: pi_level_zero.hpp:1526
ZeCache
Definition: pi_level_zero.hpp:252
_pi_queue::pi_queue_group_t
Definition: pi_level_zero.hpp:845
_pi_program::SpecConstantShim::ze
const ze_module_constants_t * ze()
Definition: pi_level_zero.hpp:1456
_pi_platform::ZeDriverApiVersion
std::string ZeDriverApiVersion
Definition: pi_level_zero.hpp:373
_pi_platform::Contexts
std::list< pi_context > Contexts
Definition: pi_level_zero.hpp:396
_pi_buffer::Allocations
std::unordered_map< pi_device, allocation_t > Allocations
Definition: pi_level_zero.hpp:1212
std::cerr
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
pi_shared_mutex::lock
void lock()
Definition: pi_level_zero.hpp:206
_pi_queue::getQueueGroup
pi_queue_group_t & getQueueGroup(bool UseCopyEngine)
Definition: pi_level_zero.hpp:898
_pi_queue::ActiveBarriers
std::vector< pi_event > ActiveBarriers
Definition: pi_level_zero.hpp:1063
MemAllocRecord::MemAllocRecord
MemAllocRecord(pi_context Context, bool OwnZeMemHandle=true)
Definition: pi_level_zero.hpp:345
_pi_event::WaitList
_pi_ze_event_list_t WaitList
Definition: pi_level_zero.hpp:1376
_pi_context::SharedReadOnlyAllocs
std::unordered_set< void * > SharedReadOnlyAllocs
Definition: pi_level_zero.hpp:754
_pi_device::ImmCommandListsPreferred
bool ImmCommandListsPreferred
Definition: pi_level_zero.hpp:565
_pi_ze_event_list_t::Length
pi_uint32 Length
Definition: pi_level_zero.hpp:1283
getZeStructureType< ze_host_mem_alloc_desc_t >
ze_structure_type_t getZeStructureType< ze_host_mem_alloc_desc_t >()
Definition: pi_level_zero.hpp:91
USMSharedMemoryAlloc::USMSharedMemoryAlloc
USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev)
Definition: pi_level_zero.hpp:428
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:719
_pi_sampler
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:945
sycl::_V1::AccessMode
class __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_SPECIAL_CLASS AccessMode
Definition: accessor.hpp:2686
getZeStructureType< ze_device_cache_properties_t >
ze_structure_type_t getZeStructureType< ze_device_cache_properties_t >()
Definition: pi_level_zero.hpp:143
getZeStructureType< ze_module_program_exp_desc_t >
ze_structure_type_t getZeStructureType< ze_module_program_exp_desc_t >()
Definition: pi_level_zero.hpp:108
ReferenceCounter::decrementAndTest
bool decrementAndTest()
Definition: pi_level_zero.hpp:312
_pi_event::OwnZeEvent
bool OwnZeEvent
Definition: pi_level_zero.hpp:1333
ReferenceCounter::increment
void increment()
Definition: pi_level_zero.hpp:291
ZeCache::Compute
InitFunctionType Compute
Definition: pi_level_zero.hpp:258
_pi_kernel::ZeKernelProperties
ZeCache< ZeStruct< ze_kernel_properties_t > > ZeKernelProperties
Definition: pi_level_zero.hpp:1598
_pi_program::SpecConstantShim::SpecConstantShim
SpecConstantShim(pi_program Program)
Definition: pi_level_zero.hpp:1443
getZesStructureType< zes_mem_properties_t >
zes_structure_type_t getZesStructureType< zes_mem_properties_t >()
Definition: pi_level_zero.hpp:173
getZeStructureType< ze_command_list_desc_t >
ze_structure_type_t getZeStructureType< ze_command_list_desc_t >()
Definition: pi_level_zero.hpp:80
_pi_program::Exe
@ Exe
Definition: pi_level_zero.hpp:1431
_pi_platform::PiDevicesCache
std::vector< std::unique_ptr< _pi_device > > PiDevicesCache
Definition: pi_level_zero.hpp:380
_pi_object::RefCount
ReferenceCounter RefCount
Definition: pi_level_zero.hpp:324
getZeStructureType< ze_device_memory_access_properties_t >
ze_structure_type_t getZeStructureType< ze_device_memory_access_properties_t >()
Definition: pi_level_zero.hpp:151
pi_command_list_info_t::size
size_t size() const
Definition: pi_level_zero.hpp:615
pi_mutex::unlock
void unlock()
Definition: pi_level_zero.hpp:241
_pi_device::queue_group_info_t::MainCopy
@ MainCopy
Definition: pi_level_zero.hpp:496
_pi_context::Devices
const std::vector< pi_device > Devices
Definition: pi_level_zero.hpp:657
sycl::_V1::detail::pi::initialize
std::vector< plugin > & initialize()
Definition: pi.cpp:373
USMMemoryAllocBase::Device
pi_device Device
Definition: pi_level_zero.hpp:404
getZesStructureType< zes_mem_state_t >
zes_structure_type_t getZesStructureType< zes_mem_state_t >()
Definition: pi_level_zero.hpp:169
_pi_queue::CommandBatch::OpenCommandList
pi_command_list_ptr_t OpenCommandList
Definition: pi_level_zero.hpp:951
USMSharedReadOnlyMemoryAlloc
Definition: pi_level_zero.hpp:433
getZeStructureType< ze_fence_desc_t >
ze_structure_type_t getZeStructureType< ze_fence_desc_t >()
Definition: pi_level_zero.hpp:77
_pi_queue::hasOpenCommandList
bool hasOpenCommandList(bool IsCopy) const
Definition: pi_level_zero.hpp:1014
iostream_proxy.hpp
_pi_kernel::ArgumentInfo::Size
size_t Size
Definition: pi_level_zero.hpp:1589
_pi_kernel::PendingArguments
std::vector< ArgumentInfo > PendingArguments
Definition: pi_level_zero.hpp:1595
_pi_kernel::ZeKernel
ze_kernel_handle_t ZeKernel
Definition: pi_level_zero.hpp:1541
pi_shared_mutex::unlock
void unlock()
Definition: pi_level_zero.hpp:213
_pi_device::ZeDeviceMemoryAccessProperties
ZeCache< ZeStruct< ze_device_memory_access_properties_t > > ZeDeviceMemoryAccessProperties
Definition: pi_level_zero.hpp:583
_pi_buffer::Mapping
Definition: pi_level_zero.hpp:1224
_pi_context::_pi_context
_pi_context(ze_context_handle_t ZeContext, pi_uint32 NumDevices, const pi_device *Devs, bool OwnZeContext)
Definition: pi_level_zero.hpp:626
ZesStruct
Definition: pi_level_zero.hpp:185
ze_context_handle_t
struct _ze_context_handle_t * ze_context_handle_t
Definition: backend_traits_level_zero.hpp:30
_pi_context::ZeCopyCommandListCache
std::unordered_map< ze_device_handle_t, std::list< ze_command_list_handle_t > > ZeCopyCommandListCache
Definition: pi_level_zero.hpp:705
_pi_ze_event_list_t
Definition: pi_level_zero.hpp:1273
_pi_kernel::ArgumentInfo::Value
const pi_mem Value
Definition: pi_level_zero.hpp:1590
getZeStructureType< ze_image_desc_t >
ze_structure_type_t getZeStructureType< ze_image_desc_t >()
Definition: pi_level_zero.hpp:101
getZesStructureType< zes_pci_properties_t >
zes_structure_type_t getZesStructureType< zes_pci_properties_t >()
Definition: pi_level_zero.hpp:165
pi_mutex::try_lock
bool try_lock()
Definition: pi_level_zero.hpp:240
pi_queue_properties
pi_bitfield pi_queue_properties
Definition: pi.h:566
SingleThreadMode
static const bool SingleThreadMode
Definition: pi_level_zero.hpp:195
_pi_queue::pi_queue_group_t::Queue
pi_queue Queue
Definition: pi_level_zero.hpp:846
_pi_device::queue_group_info_t::LinkCopy
@ LinkCopy
Definition: pi_level_zero.hpp:497
_pi_program::CodeLength
size_t CodeLength
Definition: pi_level_zero.hpp:1506
_pi_buffer::Mappings
std::unordered_map< void *, Mapping > Mappings
Definition: pi_level_zero.hpp:1233
_pi_buffer::_pi_buffer
_pi_buffer(pi_context Context, size_t Size, pi_device Device, char *ZeMemHandle, bool OwnZeMemHandle)
Definition: pi_level_zero.hpp:1150
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:591
_pi_context::MemAllocs
std::unordered_map< void *, MemAllocRecord > MemAllocs
Definition: pi_level_zero.hpp:764
_pi_kernel::Hash::operator()
size_t operator()(const std::pair< void *const, MemAllocRecord > *P) const
Definition: pi_level_zero.hpp:1552
MemAllocRecord
Definition: pi_level_zero.hpp:344
_pi_buffer::allocation_t::free
@ free
Definition: pi_level_zero.hpp:1204
_pi_device::Platform
const pi_platform Platform
Definition: pi_level_zero.hpp:556
_pi_device::queue_group_info_t::ZeProperties
ZeStruct< ze_command_queue_group_properties_t > ZeProperties
Definition: pi_level_zero.hpp:514
_pi_program::Code
std::unique_ptr< uint8_t[]> Code
Definition: pi_level_zero.hpp:1505
ze_driver_handle_t
struct _ze_driver_handle_t * ze_driver_handle_t
Definition: backend_traits_level_zero.hpp:32
_pi_device::isSubDevice
bool isSubDevice()
Definition: pi_level_zero.hpp:573
_pi_queue::OwnZeCommandQueue
bool OwnZeCommandQueue
Definition: pi_level_zero.hpp:936
pi_shared_mutex::try_lock_shared
bool try_lock_shared()
Definition: pi_level_zero.hpp:222
ZeStruct::ZeStruct
ZeStruct()
Definition: pi_level_zero.hpp:180
_pi_queue::KernelsToBeSubmitted
std::vector< pi_kernel > KernelsToBeSubmitted
Definition: pi_level_zero.hpp:929
_pi_kernel::hasIndirectAccess
bool hasIndirectAccess()
Definition: pi_level_zero.hpp:1534
_pi_program::state
state
Definition: pi_level_zero.hpp:1414
USMMemoryAllocBase::allocateImpl
virtual pi_result allocateImpl(void **ResultPtr, size_t Size, pi_uint32 Alignment)=0
_pi_device::SubDevices
std::vector< pi_device > SubDevices
Definition: pi_level_zero.hpp:551
_pi_image::isImage
bool isImage() const override
Definition: pi_level_zero.hpp:1262
_pi_platform::ZeDriverVersion
std::string ZeDriverVersion
Definition: pi_level_zero.hpp:372
_pi_platform::getDeviceFromNativeHandle
pi_device getDeviceFromNativeHandle(ze_device_handle_t)
Definition: pi_level_zero.cpp:2445
_pi_object::_pi_object
_pi_object()
Definition: pi_level_zero.hpp:320
_pi_event::_pi_event
_pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool, pi_context Context, pi_command_type CommandType, bool OwnZeEvent)
Definition: pi_level_zero.hpp:1323
_pi_platform::populateDeviceCacheIfNeeded
pi_result populateDeviceCacheIfNeeded()
Definition: pi_esimd_emulator.cpp:525
_pi_program::Native
@ Native
Definition: pi_level_zero.hpp:1422
_pi_kernel::MemAllocs
std::unordered_set< std::pair< void *const, MemAllocRecord > *, Hash > MemAllocs
Definition: pi_level_zero.hpp:1572
_pi_context::ImmediateCommandListMutex
pi_mutex ImmediateCommandListMutex
Definition: pi_level_zero.hpp:689
_pi_queue::CommandListMap
pi_command_list_map_t CommandListMap
Definition: pi_level_zero.hpp:939
_pi_queue::Context
const pi_context Context
Definition: pi_level_zero.hpp:910
sycl::_V1::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:221
ze_kernel_handle_t
struct _ze_kernel_handle_t * ze_kernel_handle_t
Definition: backend_traits_level_zero.hpp:35
USMSharedReadOnlyMemoryAlloc::getMemTypeImpl
MemType getMemTypeImpl() override
Definition: pi_level_zero.hpp:437
_pi_program::SpecConstantShim
Definition: pi_level_zero.hpp:1441
_pi_program::OwnZeModule
const bool OwnZeModule
Definition: pi_level_zero.hpp:1495
getZeStructureType< ze_device_image_properties_t >
ze_structure_type_t getZeStructureType< ze_device_image_properties_t >()
Definition: pi_level_zero.hpp:135
_pi_context::isValidDevice
bool isValidDevice(pi_device Device) const
Definition: pi_level_zero.hpp:661
ReferenceCounter
Definition: pi_level_zero.hpp:284
_pi_program::State
state State
Definition: pi_level_zero.hpp:1501
_pi_mem::access_mode_t
access_mode_t
Definition: pi_level_zero.hpp:1086
_pi_program::BuildFlags
std::string BuildFlags
Definition: pi_level_zero.hpp:1516
pi_command_list_ptr_t
pi_command_list_map_t::iterator pi_command_list_ptr_t
Definition: pi_level_zero.hpp:623
_pi_command_type
_pi_command_type
Definition: pi.h:369
pi_shared_mutex::lock_shared
void lock_shared()
Definition: pi_level_zero.hpp:218
_pi_queue::pi_queue_group_t::Type
queue_type Type
Definition: pi_level_zero.hpp:856
ZeStruct
Definition: pi_level_zero.hpp:179
_pi_sampler::ZeSampler
ze_sampler_handle_t ZeSampler
Definition: pi_level_zero.hpp:1606
_pi_device::ZeDeviceComputeProperties
ZeCache< ZeStruct< ze_device_compute_properties_t > > ZeDeviceComputeProperties
Definition: pi_level_zero.hpp:577
pi_command_list_info_t
Definition: pi_level_zero.hpp:590
getZeStructureType< ze_event_pool_desc_t >
ze_structure_type_t getZeStructureType< ze_event_pool_desc_t >()
Definition: pi_level_zero.hpp:74
_pi_program::_pi_program
_pi_program(state St, pi_context Context, const std::string &ErrorMessage)
Definition: pi_level_zero.hpp:1485
ZeCache::Computed
bool Computed
Definition: pi_level_zero.hpp:259
getZeStructureType< ze_device_compute_properties_t >
ze_structure_type_t getZeStructureType< ze_device_compute_properties_t >()
Definition: pi_level_zero.hpp:127
USMMemoryAllocBase::getMemType
MemType getMemType() override final
Definition: pi_level_zero.cpp:7917
USMMemoryAllocBase::Context
pi_context Context
Definition: pi_level_zero.hpp:403
_pi_buffer::allocation_t::unimport
@ unimport
Definition: pi_level_zero.hpp:1203
pi_cast
To pi_cast(From Value)
Definition: pi_level_zero.hpp:49
_pi_event::ZeEvent
ze_event_handle_t ZeEvent
Definition: pi_level_zero.hpp:1329
pi_command_list_info_t::append
void append(pi_event Event)
Definition: pi_level_zero.hpp:616
PI_QUEUE_PROFILING_ENABLE
constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE
Definition: pi.h:568
getZesStructureType
zes_structure_type_t getZesStructureType()
_pi_program::Context
const pi_context Context
Definition: pi_level_zero.hpp:1491
pi_mutex::lock
void lock()
Definition: pi_level_zero.hpp:236
_pi_context
PI context mapping to a CUDA context object.
Definition: pi_cuda.hpp:160
ReferenceCounter::ReferenceCounter
ReferenceCounter()
Definition: pi_level_zero.hpp:285
getZeStructureType< ze_memory_allocation_properties_t >
ze_structure_type_t getZeStructureType< ze_memory_allocation_properties_t >()
Definition: pi_level_zero.hpp:161
_pi_device
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:83
SharedReadOnly
@ SharedReadOnly
Definition: usm_allocator.hpp:14
pi_shared_mutex::unlock_shared
void unlock_shared()
Definition: pi_level_zero.hpp:225
_pi_program::IL
@ IL
Definition: pi_level_zero.hpp:1417