38 bool HostVisible,
pi_event *RetEvent);
// Process-wide flag, evaluated once during static initialization from the
// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY environment variable:
//   - variable unset                     -> false
//   - leading integer different from 0   -> true
//   - 0, or text with no leading integer -> false
static const bool UseCopyEngineForD2DCopy = [] {
  const char *CopyEngineForD2DCopy =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY");
  if (!CopyEngineForD2DCopy)
    return false;

  // std::strtol is used instead of std::stoi: std::stoi throws
  // std::invalid_argument / std::out_of_range on malformed or huge input, and
  // an exception escaping a static initializer terminates the process.
  // For every input std::stoi accepts, the result here is the same.
  char *End = nullptr;
  const long Value = std::strtol(CopyEngineForD2DCopy, &End, 10);
  const bool Parsed = (End != CopyEngineForD2DCopy);
  return Parsed && Value != 0;
}();
// Process-wide flag, evaluated once during static initialization from the
// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE environment variable:
//   - variable unset                   -> true
//   - leading integer different from 0 -> true
//   - leading integer equal to 0       -> false
//   - text with no leading integer     -> true (treated like "unset")
static const bool UseCopyEngineForInOrderQueue = [] {
  const char *CopyEngineForInOrderQueue =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
  if (!CopyEngineForInOrderQueue)
    return true;

  // std::strtol is used instead of std::stoi: std::stoi throws on malformed
  // or out-of-range input, and an exception escaping a static initializer
  // terminates the process. Valid inputs give the same result as before.
  char *End = nullptr;
  const long Value = std::strtol(CopyEngineForInOrderQueue, &End, 10);
  if (End == CopyEngineForInOrderQueue)
    return true; // Nothing parsed: keep the default instead of aborting.
  return Value != 0;
}();
// UseMultipleCmdlistBarriers: flag initialized once from the
// SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS environment variable.
// When the variable is set, the flag is true only for integer values > 0.
// std::stoi throws if the text has no leading integer or is out of range.
// NOTE(review): this excerpt omits the statement taken when the variable is
// unset (source line 71) and the lambda's closing line - confirm the default
// against the full file.
67 static const bool UseMultipleCmdlistBarriers = [] {
68 const char *UseMultipleCmdlistBarriersFlag =
69 std::getenv(
"SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS");
70 if (!UseMultipleCmdlistBarriersFlag)
72 return std::stoi(UseMultipleCmdlistBarriersFlag) > 0;
// DisableEventsCaching: flag initialized once from the
// SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING environment variable.
// When the variable is set, the flag is true for any integer value other
// than 0. std::stoi throws if the text has no leading integer or is out of
// range.
// NOTE(review): this excerpt omits the statement taken when the variable is
// unset (source line 81) and the lambda's closing line - confirm the default
// against the full file.
77 static const bool DisableEventsCaching = [] {
78 const char *DisableEventsCachingFlag =
79 std::getenv(
"SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
80 if (!DisableEventsCachingFlag)
82 return std::stoi(DisableEventsCachingFlag) != 0;
// ReuseDiscardedEvents: flag initialized once from the
// SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS environment variable.
// When the variable is set, the flag is true only for integer values > 0.
// std::stoi throws if the text has no leading integer or is out of range.
// NOTE(review): this excerpt omits the statement taken when the variable is
// unset (source line 91) and the lambda's closing line - confirm the default
// against the full file.
87 static const bool ReuseDiscardedEvents = [] {
88 const char *ReuseDiscardedEventsFlag =
89 std::getenv(
"SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS");
90 if (!ReuseDiscardedEventsFlag)
92 return std::stoi(ReuseDiscardedEventsFlag) > 0;
// IndirectAccessTrackingEnabled: flag initialized once by comparing the
// result of std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY")
// with `!=`; the variable's value is not parsed in the visible code.
// NOTE(review): the right-hand operand of the comparison and the lambda's
// closing line are not visible in this excerpt - presumably a null-pointer
// check (flag is true whenever the variable is set); confirm in the full file.
96 static const bool IndirectAccessTrackingEnabled = [] {
97 return std::getenv(
"SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY") !=
// UseMemcpy2DOperations: flag initialized once from the
// SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D environment variable.
// When the variable is set, the flag is true only for integer values > 0.
// std::stoi throws if the text has no leading integer or is out of range.
// NOTE(review): this excerpt omits the statement taken when the variable is
// unset (source line 107) and the lambda's closing line - confirm the default
// against the full file.
103 static const bool UseMemcpy2DOperations = [] {
104 const char *UseMemcpy2DOperationsFlag =
105 std::getenv(
"SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D");
106 if (!UseMemcpy2DOperationsFlag)
108 return std::stoi(UseMemcpy2DOperationsFlag) > 0;
114 static inline pi_result mapError(ze_result_t Result) {
119 #define ZE_CALL(ZeName, ZeArgs) \
121 ze_result_t ZeResult = ZeName ZeArgs; \
122 if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \
123 return mapError(Result); \
127 #define PI_CALL(Call) \
130 fprintf(stderr, "PI ---> %s\n", #Call); \
131 pi_result Result = (Call); \
132 if (Result != PI_SUCCESS) \
// Flag evaluated once during static initialization from the SYCL_EAGER_INIT
// environment variable:
//   - variable unset                     -> false
//   - leading integer different from 0   -> true
//   - 0, or text with no leading integer -> false
// Deliberately left non-const, as in the original declaration.
static bool doEagerInit = [] {
  const char *EagerInit = std::getenv("SYCL_EAGER_INIT");
  if (!EagerInit)
    return false;
  // std::strtol instead of std::atoi: std::atoi has undefined behavior when
  // the value does not fit in an int, whereas std::strtol saturates. For
  // in-range input both produce the same number.
  return std::strtol(EagerInit, nullptr, 10) != 0;
}();
// MaxNumEventsPerPool: value initialized once from the
// ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL environment variable.
// The visible expression yields std::atoi of the variable when it is set and
// 256 when it is not (std::atoi returns 0 for text with no leading integer).
// NOTE(review): the declaration that receives this expression (source line
// 151) and everything after it, including any range validation and the
// lambda's closing line, are not visible in this excerpt - confirm in the
// full file how non-positive values are handled.
148 static const pi_uint32 MaxNumEventsPerPool = [] {
149 const auto MaxNumEventsPerPoolEnv =
150 std::getenv(
"ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
152 MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256;
162 template <
typename T,
typename Func>
163 ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) {
165 return Api(Handle, UINT64_MAX);
169 while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY)
178 template <
typename T> ze_result_t zeHostSynchronize(T Handle);
180 return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle);
183 return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle);
196 static const char *EnvVar =
197 std::getenv(
"SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
201 return std::pair<int, int>(0, 0);
203 auto EnvVarValue = std::atoi(EnvVar);
204 if (EnvVarValue >= 0) {
205 return std::pair<int, int>(EnvVarValue, EnvVarValue);
208 return std::pair<int, int>(0, INT_MAX);
224 size_t &Index,
bool HostVisible,
225 bool ProfilingEnabled) {
227 std::scoped_lock<pi_mutex> Lock(ZeEventPoolCacheMutex);
229 std::list<ze_event_pool_handle_t> *ZePoolCache =
230 getZeEventPoolCache(HostVisible, ProfilingEnabled);
232 if (!ZePoolCache->empty()) {
233 if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
234 if (DisableEventsCaching) {
236 ZePoolCache->erase(ZePoolCache->begin());
242 ZePoolCache->push_front(
nullptr);
246 if (ZePoolCache->empty()) {
247 ZePoolCache->push_back(
nullptr);
251 ze_event_pool_handle_t *ZePool = &ZePoolCache->front();
254 if (*ZePool ==
nullptr) {
256 ZeEventPoolDesc.count = MaxNumEventsPerPool;
257 ZeEventPoolDesc.flags = 0;
259 ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
260 if (ProfilingEnabled)
261 ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
262 zePrint(
"ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);
264 std::vector<ze_device_handle_t> ZeDevices;
266 ZeDevices.push_back(D->ZeDevice);
270 &ZeDevices[0], ZePool));
271 NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1;
272 NumEventsUnreleasedInEventPool[*ZePool] = 1;
274 Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool];
275 --NumEventsAvailableInEventPool[*ZePool];
276 ++NumEventsUnreleasedInEventPool[*ZePool];
283 std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex, std::defer_lock);
284 std::scoped_lock<pi_mutex, std::shared_lock<pi_shared_mutex>> LockAll(
285 ZeEventPoolCacheMutex, EventLock);
292 std::list<ze_event_pool_handle_t> *ZePoolCache =
296 if (NumEventsUnreleasedInEventPool[Event->
ZeEventPool] == 0)
297 die(
"Invalid event release: event pool doesn't have unreleased events");
298 if (--NumEventsUnreleasedInEventPool[Event->
ZeEventPool] == 0) {
302 NumEventsAvailableInEventPool[Event->
ZeEventPool] = MaxNumEventsPerPool;
311 pi_bool BlockingWrite,
size_t Size,
321 size_t SrcRowPitch,
size_t DstRowPitch,
size_t SrcSlicePitch,
332 ZE_CALL(zeCommandListAppendBarrier,
334 ZE_CALL(zeCommandListAppendEventReset,
344 }
catch (
const std::bad_alloc &) {
345 return PI_ERROR_OUT_OF_HOST_MEMORY;
347 return PI_ERROR_UNKNOWN;
373 bool ForceHostVisible =
false) {
375 if (!ForceHostVisible)
382 if (*Event ==
nullptr)
385 (*Event)->Queue = Queue;
386 (*Event)->CommandType = CommandType;
387 (*Event)->IsDiscarded = IsInternal;
388 (*Event)->CommandList = CommandList;
394 (*Event)->OwnZeEvent =
false;
398 CommandList->second.append(*Event);
399 (*Event)->RefCount.increment();
407 Queue->RefCount.increment();
438 ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, Event->
ZeEvent));
448 if (Cache->size() < 2)
453 auto It = Cache->begin();
461 Cache->emplace_back(Event);
469 const char *ImmCmdListsEventCleanupThresholdStr = std::getenv(
470 "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD");
471 static constexpr
int Default = 20;
472 if (!ImmCmdListsEventCleanupThresholdStr)
475 int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr);
484 pi_device _pi_context::getRootDevice()
const {
498 if (ContextRootDevice->isSubDevice()) {
499 ContextRootDevice = ContextRootDevice->RootDevice;
503 if ((!
Device->RootDevice &&
Device != ContextRootDevice) ||
504 (
Device->RootDevice &&
Device->RootDevice != ContextRootDevice)) {
505 ContextRootDevice =
nullptr;
509 return ContextRootDevice;
521 std::unique_ptr<SystemMemory>(
529 USMAllocatorConfigInstance
535 std::unique_ptr<SystemMemory>(
537 USMAllocatorConfigInstance.
Configs[usm_settings::MemType::Device]));
541 std::function<void(
pi_device)> createUSMAllocatorsRecursive;
542 createUSMAllocatorsRecursive =
543 [createUSMAllocators,
545 createUSMAllocators(
Device);
546 for (
auto &SubDevice :
Device->SubDevices)
547 createUSMAllocatorsRecursive(SubDevice);
553 createUSMAllocatorsRecursive(
Device);
585 ZeCommandQueueDesc.ordinal =
586 Device->QueueGroup[_pi_device::queue_group_info_t::Compute].ZeOrdinal;
587 if (Range.first >= 0 &&
588 Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal !=
590 ZeCommandQueueDesc.ordinal =
591 Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal;
593 ZeCommandQueueDesc.index = 0;
594 ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
596 zeCommandListCreateImmediate,
606 if (!DisableEventsCaching) {
607 std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
608 for (
auto &EventCache : EventCaches) {
609 for (
auto &Event : EventCache) {
610 ZE_CALL(zeEventDestroy, (Event->ZeEvent));
617 std::scoped_lock<pi_mutex> Lock(ZeEventPoolCacheMutex);
618 for (
auto &ZePoolCache : ZeEventPoolCache) {
619 for (
auto &ZePool : ZePoolCache)
620 ZE_CALL(zeEventPoolDestroy, (ZePool));
630 for (ze_command_list_handle_t &ZeCommandList : List.second) {
632 ZE_CALL(zeCommandListDestroy, (ZeCommandList));
636 for (ze_command_list_handle_t &ZeCommandList : List.second) {
638 ZE_CALL(zeCommandListDestroy, (ZeCommandList));
647 ->QueueGroup[_pi_device::queue_group_info_t::type::Compute]
671 std::vector<pi_event> &EventListToCleanup,
673 bool UseCopyEngine = CommandList->second.isCopy(
this);
676 if (CommandList->second.ZeFence !=
nullptr) {
680 ZE_CALL(zeFenceReset, (CommandList->second.ZeFence));
681 ZE_CALL(zeCommandListReset, (CommandList->first));
682 CommandList->second.ZeFenceInUse =
false;
683 CommandList->second.IsClosed =
false;
686 auto &EventList = CommandList->second.EventList;
690 if (!CheckStatus || CommandList->second.ZeFence !=
nullptr ||
694 std::move(std::begin(EventList), std::end(EventList),
695 std::back_inserter(EventListToCleanup));
700 for (
auto it = EventList.begin(); it != EventList.end();) {
701 std::scoped_lock<pi_shared_mutex> EventLock((*it)->Mutex);
702 ze_result_t ZeResult =
712 if (ZeResult == ZE_RESULT_NOT_READY)
715 if (ZeResult != ZE_RESULT_SUCCESS)
716 return mapError(ZeResult);
718 EventListToCleanup.push_back(std::move((*it)));
719 it = EventList.erase(it);
725 if (CommandList->second.ZeFence !=
nullptr && MakeAvailable) {
727 auto &ZeCommandListCache =
731 ZeCommandListCache.push_back(CommandList->first);
769 const auto BatchSizeStr =
770 (IsCopy) ? std::getenv(
"SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
771 : std::getenv(
"SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
773 pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
777 if (BatchSizeStrVal > 0) {
778 Config.Size = BatchSizeStrVal;
779 }
else if (BatchSizeStrVal == 0) {
787 std::string BatchConfig(BatchSizeStr);
794 Pos = BatchConfig.find(
":", Pos);
795 if (Pos == std::string::npos)
801 Val = std::stoi(BatchConfig.substr(Pos));
805 "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
807 zePrint(
"SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
812 Config.DynamicSizeStart = Val;
815 Config.DynamicSizeMax = Val;
818 Config.DynamicSizeStep = Val;
821 Config.NumTimesClosedEarlyThreshold = Val;
824 Config.NumTimesClosedFullThreshold = Val;
827 die(
"Unexpected batch config");
830 zePrint(
"SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
835 "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
842 zePrint(
"SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
844 zePrint(
"SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
863 std::vector<ze_command_queue_handle_t> &CopyQueues,
865 bool OwnZeCommandQueue,
867 int ForceComputeIndex)
868 : Context{
Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
874 auto &ComputeQueueGroupInfo =
Device->QueueGroup[queue_type::Compute];
875 pi_queue_group_t ComputeQueueGroup{
this, queue_type::Compute};
876 ComputeQueueGroup.ZeQueues = ComputeQueues;
879 if (
Device->ImmCommandListUsed) {
880 ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
881 ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
883 if (ComputeQueueGroupInfo.ZeIndex >= 0) {
889 assert(ForceComputeIndex <= 0);
890 ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex;
891 ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex;
892 ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex;
893 }
else if (ForceComputeIndex >= 0) {
894 ComputeQueueGroup.LowerIndex = ForceComputeIndex;
895 ComputeQueueGroup.UpperIndex = ForceComputeIndex;
896 ComputeQueueGroup.NextIndex = ForceComputeIndex;
901 FilterUpperIndex = std::min((
size_t)FilterUpperIndex,
902 FilterLowerIndex + ComputeQueues.size() - 1);
903 if (FilterLowerIndex <= FilterUpperIndex) {
904 ComputeQueueGroup.LowerIndex = FilterLowerIndex;
905 ComputeQueueGroup.UpperIndex = FilterUpperIndex;
906 ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
908 die(
"No compute queue available/allowed.");
911 if (
Device->ImmCommandListUsed) {
914 ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
915 ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
919 auto TID = std::this_thread::get_id();
920 ComputeQueueGroupsByTID.insert({TID, ComputeQueueGroup});
923 pi_queue_group_t CopyQueueGroup{
this, queue_type::MainCopy};
925 if (Range.first < 0 || Range.second < 0) {
929 uint32_t FilterLowerIndex = Range.first;
930 uint32_t FilterUpperIndex = Range.second;
931 FilterUpperIndex = std::min((
size_t)FilterUpperIndex,
932 FilterLowerIndex + CopyQueues.size() - 1);
933 if (FilterLowerIndex <= FilterUpperIndex) {
934 CopyQueueGroup.ZeQueues = CopyQueues;
935 CopyQueueGroup.LowerIndex = FilterLowerIndex;
936 CopyQueueGroup.UpperIndex = FilterUpperIndex;
937 CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex;
940 if (
Device->ImmCommandListUsed) {
941 CopyQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
942 CopyQueueGroup.ZeQueues.size(), CommandListMap.end());
946 CopyQueueGroupsByTID.insert({TID, CopyQueueGroup});
949 ComputeCommandBatch.OpenCommandList = CommandListMap.end();
950 CopyCommandBatch.OpenCommandList = CommandListMap.end();
951 ComputeCommandBatch.QueueBatchSize =
957 bool QueueLocked =
false);
963 bool QueueLocked =
false) {
964 for (
auto &Event : EventListToCleanup) {
968 std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
969 Event->Completed =
true;
990 bool QueueLocked =
false,
991 bool QueueSynced =
false,
992 pi_event CompletedEvent =
nullptr) {
994 if (!Queue || !Queue->
Device->ImmCommandListUsed)
997 std::vector<pi_event> EventListToCleanup;
999 std::unique_lock<pi_shared_mutex> QueueLock(Queue->Mutex, std::defer_lock);
1021 if (!(CompletedEvent->CommandList &&
1022 CompletedEvent->CommandList.value() != Queue->
CommandListMap.end()))
1025 auto &CmdListEvents =
1026 CompletedEvent->CommandList.value()->second.EventList;
1027 auto CompletedEventIt =
1028 std::find(CmdListEvents.begin(), CmdListEvents.end(), CompletedEvent);
1029 if (CompletedEventIt != CmdListEvents.end()) {
1035 std::move(std::begin(CmdListEvents), CompletedEventIt + 1,
1036 std::back_inserter(EventListToCleanup));
1037 CmdListEvents.erase(CmdListEvents.begin(), CompletedEventIt + 1);
1061 if (Queue->
Device->ImmCommandListUsed) {
1070 std::vector<pi_event> EventListToCleanup;
1079 std::unique_lock<pi_shared_mutex> QueueLock(Queue->Mutex);
1084 assert(it->second.ZeFence !=
nullptr);
1087 if (it->second.ZeFenceInUse) {
1088 ze_result_t ZeResult =
1090 if (ZeResult == ZE_RESULT_SUCCESS)
1104 if (Queue->
Device->ImmCommandListUsed) {
1106 if (CommandList->second.EventList.size() >
1108 std::vector<pi_event> EventListToCleanup;
1118 auto &CommandBatch =
1124 if (AllowBatching) {
1154 auto &ZeCommandListCache =
1160 for (
auto ZeCommandListIt = ZeCommandListCache.begin();
1161 ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
1162 auto &ZeCommandList = *ZeCommandListIt;
1165 if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
1168 if (CommandList->second.ZeFence !=
nullptr)
1169 CommandList->second.ZeFenceInUse =
true;
1176 uint32_t QueueGroupOrdinal;
1177 auto &ZeCommandQueue = ForcedCmdQueue
1181 QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
1183 ze_fence_handle_t ZeFence;
1185 ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1187 .emplace(ZeCommandList,
1193 ZeCommandListCache.erase(ZeCommandListIt);
1211 if (UseCopyEngine != it->second.isCopy(Queue))
1214 ze_result_t ZeResult =
1216 if (ZeResult == ZE_RESULT_SUCCESS) {
1217 std::vector<pi_event> EventListToCleanup;
1222 CommandList->second.ZeFenceInUse =
true;
1232 CommandList->second.ZeFenceInUse =
true;
1238 auto &InitialGroup = Map.begin()->second;
1241 if (
Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue) {
1243 auto Result = Map.insert({std::this_thread::get_id(), InitialGroup});
1244 auto &QueueGroupRef = Result.first->second;
1246 if (Result.second) {
1248 QueueGroupRef.
ImmCmdLists = std::vector<pi_command_list_ptr_t>(
1251 return QueueGroupRef;
1255 return InitialGroup;
1267 ze_fence_handle_t ZeFence;
1269 ze_command_list_handle_t ZeCommandList;
1271 uint32_t QueueGroupOrdinal;
1273 auto &ZeCommandQueue =
1274 ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal);
1276 QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
1279 ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
1282 &ZeCommandListDesc, &ZeCommandList));
1284 ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1286 std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
1288 {ZeFence,
false,
false, ZeCommandQueue, QueueGroupOrdinal}));
1299 pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
1303 CommandBatch.NumTimesClosedFull += 1;
1309 if (CommandBatch.NumTimesClosedEarly <=
1311 CommandBatch.NumTimesClosedFull >
1315 zePrint(
"Raising QueueBatchSize to %d\n", QueueBatchSize);
1317 CommandBatch.NumTimesClosedEarly = 0;
1318 CommandBatch.NumTimesClosedFull = 0;
1326 pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize;
1330 CommandBatch.NumTimesClosedEarly += 1;
1337 if (CommandBatch.NumTimesClosedEarly >
1338 (CommandBatch.NumTimesClosedFull + 1) * 3) {
1339 QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1;
1340 if (QueueBatchSize < 1)
1342 zePrint(
"Lowering QueueBatchSize to %d\n", QueueBatchSize);
1343 CommandBatch.NumTimesClosedEarly = 0;
1344 CommandBatch.NumTimesClosedFull = 0;
1350 if (!Kernel->hasIndirectAccess())
1354 for (
auto &Ctx : Contexts) {
1355 for (
auto &Elem : Ctx->MemAllocs) {
1356 const auto &Pair = Kernel->MemAllocs.insert(&Elem);
1364 Elem.second.RefCount.increment();
1367 Kernel->SubmissionsCount++;
1374 bool OKToBatchCommand) {
1376 if (CommandList->second.IsClosed)
1379 bool UseCopyEngine = CommandList->second.isCopy(
this);
1400 if (!CommandList->second.EventList.empty() &&
1401 this->LastCommandEvent != CommandList->second.EventList.back()) {
1402 this->LastCommandEvent = CommandList->second.EventList.back();
1410 if (!
Device->ImmCommandListUsed) {
1427 CommandBatch.OpenCommandList != CommandList)
1428 die(
"executeCommandList: OpenCommandList should be equal to"
1429 "null or CommandList");
1431 if (CommandList->second.size() < CommandBatch.QueueBatchSize) {
1432 CommandBatch.OpenCommandList = CommandList;
1441 auto &ZeCommandQueue = CommandList->second.ZeQueue;
1449 std::unique_lock<pi_shared_mutex> ContextsLock(
1452 if (IndirectAccessTrackingEnabled) {
1459 ContextsLock.lock();
1463 if (!
Device->ImmCommandListUsed) {
1473 !CommandList->second.EventList.empty()) {
1477 std::find_if(CommandList->second.EventList.begin(),
1478 CommandList->second.EventList.end(),
1479 [](
pi_event E) { return E->hasExternalRefs(); });
1480 if (Result != CommandList->second.EventList.end()) {
1492 for (
auto &Event : CommandList->second.EventList) {
1493 std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
1495 if (!Event->hasExternalRefs())
1498 if (!Event->HostVisibleEvent) {
1499 Event->HostVisibleEvent = HostVisibleEvent;
1500 HostVisibleEvent->RefCount.increment();
1534 ZE_CALL(zeCommandListAppendSignalEvent,
1535 (CommandList->first, HostVisibleEvent->
ZeEvent));
1537 ZE_CALL(zeCommandListAppendBarrier,
1538 (CommandList->first, HostVisibleEvent->
ZeEvent, 0,
nullptr));
1550 ZE_CALL(zeCommandListClose, (CommandList->first));
1552 CommandList->second.IsClosed =
true;
1555 auto ZeCommandList = CommandList->first;
1557 zeCommandQueueExecuteCommandLists,
1558 (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));
1559 if (ZeResult != ZE_RESULT_SUCCESS) {
1561 if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1563 return PI_ERROR_COMMAND_EXECUTION_FAILURE;
1565 return mapError(ZeResult);
1571 if (
Device->ImmCommandListUsed) {
1575 ZE_CALL(zeHostSynchronize, (ZeCommandQueue));
1583 return (CommandBatch.QueueBatchSize > 0 &&
1590 uint32_t *QueueIndex,
1602 auto QueueType =
Type;
1603 if (QueueType != queue_type::Compute)
1604 QueueType = (CurrentIndex == 0 &&
Queue->
Device->hasMainCopyEngine())
1605 ? queue_type::MainCopy
1606 : queue_type::LinkCopy;
1608 *QueueGroupOrdinal =
Queue->
Device->QueueGroup[QueueType].ZeOrdinal;
1612 auto ZeCommandQueueIndex = CurrentIndex;
1613 if (QueueType == queue_type::LinkCopy &&
Queue->
Device->hasMainCopyEngine()) {
1614 ZeCommandQueueIndex -= 1;
1616 *QueueIndex = ZeCommandQueueIndex;
1618 return CurrentIndex;
1625 auto QueueType = Type;
1626 if (QueueType != queue_type::Compute)
1627 QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine())
1628 ? queue_type::MainCopy
1629 : queue_type::LinkCopy;
1630 return Queue->Device->QueueGroup[QueueType].ZeOrdinal;
1641 uint32_t QueueIndex;
1642 auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex);
1649 ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal;
1650 ZeCommandQueueDesc.index = QueueIndex;
1651 ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1652 const char *Priority =
"Normal";
1653 if (Queue->isPriorityLow()) {
1654 ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW;
1656 }
else if (Queue->isPriorityHigh()) {
1657 ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
1662 if (QueueIndex != 0) {
1663 ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1666 zePrint(
"[getZeQueue]: create queue ordinal = %d, index = %d "
1667 "(round robin in [%d, %d]) priority = %s\n",
1668 ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1669 UpperIndex, Priority);
1672 zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1673 &ZeCommandQueueDesc, &ZeQueue));
1675 die(
"[L0] getZeQueue: failed to create queue");
1685 uint32_t QueueIndex, QueueOrdinal;
1686 auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
1688 if (ImmCmdLists[Index] != Queue->CommandListMap.end())
1689 return ImmCmdLists[Index];
1692 ZeCommandQueueDesc.ordinal = QueueOrdinal;
1693 ZeCommandQueueDesc.index = QueueIndex;
1694 ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1695 const char *Priority =
"Normal";
1696 if (Queue->isPriorityLow()) {
1697 ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW;
1699 }
else if (Queue->isPriorityHigh()) {
1700 ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
1705 if (QueueIndex != 0) {
1706 ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
1709 zePrint(
"[getZeQueue]: create queue ordinal = %d, index = %d "
1710 "(round robin in [%d, %d]) priority = %s\n",
1711 ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
1712 UpperIndex, Priority);
1714 ze_command_list_handle_t ZeCommandList;
1716 (Queue->Context->ZeContext, Queue->Device->ZeDevice,
1717 &ZeCommandQueueDesc, &ZeCommandList));
1718 ImmCmdLists[Index] =
1719 Queue->CommandListMap
1720 .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
1721 ZeCommandList, {
nullptr,
true,
false,
nullptr, QueueOrdinal}})
1725 auto QueueType = Type;
1726 std::scoped_lock<pi_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
1727 auto &ZeCommandListCache =
1728 QueueType == queue_type::Compute
1729 ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
1730 : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
1731 ZeCommandListCache.push_back(ZeCommandList);
1733 return ImmCmdLists[Index];
1737 using IsCopy = bool;
1739 if (
Device->ImmCommandListUsed) {
1745 const auto &ComputeEventList =
1747 if (std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) !=
1748 ComputeEventList.end())
1752 const auto &CopyEventList =
1754 if (std::find(CopyEventList.begin(), CopyEventList.end(), Event) !=
1755 CopyEventList.end())
1767 ZE_CALL(zeCommandListAppendBarrier,
1775 bool UseCopyEngine) {
1791 if (ActiveBarriersWaitList.
Length == 0) {
1796 auto &Event = ActiveBarriersWaitList.
PiEventList[I];
1805 Event->
WaitList = ActiveBarriersWaitList;
1810 ZE_CALL(zeCommandListAppendBarrier,
1811 (CmdList->first,
nullptr, ActiveBarriersWaitList.
Length,
1831 const char *Ret = std::getenv(
"SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST");
1832 const bool RetVal = Ret ? std::stoi(Ret) : 0;
1838 bool UseCopyEngine) {
1840 this->ZeEventList =
nullptr;
1841 this->PiEventList =
nullptr;
1844 if (CurQueue->
Device->ImmCommandListUsed) {
1851 uint32_t QueueGroupOrdinal, QueueIndex;
1855 auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex];
1867 const auto &OpenCommandList =
1870 OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) {
1873 OpenCommandList->second.isCopy(CurQueue)))
1882 bool IncludeLastCommandEvent =
1890 IncludeLastCommandEvent =
false;
1895 if (IncludeLastCommandEvent) {
1897 this->PiEventList =
new pi_event[EventListLength + 1];
1902 }
else if (EventListLength > 0) {
1904 this->PiEventList =
new pi_event[EventListLength];
1907 if (EventListLength > 0) {
1908 for (
pi_uint32 I = 0; I < EventListLength; I++) {
1909 PI_ASSERT(EventList[I] !=
nullptr, PI_ERROR_INVALID_VALUE);
1911 std::shared_lock<pi_shared_mutex> Lock(EventList[I]->Mutex);
1912 if (EventList[I]->Completed)
1919 (HostVisibleEvent->ZeEvent));
1920 if (Res == ZE_RESULT_SUCCESS) {
1927 auto Queue = EventList[I]->
Queue;
1934 auto Lock = ((Queue == CurQueue)
1935 ? std::unique_lock<pi_shared_mutex>()
1936 : std::unique_lock<pi_shared_mutex>(Queue->Mutex));
1943 const auto &OpenCommandList =
1945 if (OpenCommandList != Queue->CommandListMap.end()) {
1947 if (Queue == CurQueue &&
1948 OpenCommandList->second.isCopy(Queue) == UseCopyEngine) {
1952 if (
auto Res = Queue->executeOpenCommandList(
1953 OpenCommandList->second.isCopy(Queue)))
1979 std::shared_lock<pi_shared_mutex> Lock(EventList[I]->Mutex);
1980 this->ZeEventList[TmpListLength] = EventList[I]->
ZeEvent;
1981 this->PiEventList[TmpListLength] = EventList[I];
1986 this->Length = TmpListLength;
1989 return PI_ERROR_OUT_OF_HOST_MEMORY;
1992 for (
pi_uint32 I = 0; I < this->Length; I++) {
1993 this->PiEventList[I]->RefCount.increment();
2010 std::list<pi_event> &EventsToBeReleased) {
2018 pi_event *LocPiEventList =
nullptr;
2023 std::scoped_lock<pi_mutex> lock(this->PiZeEventListMutex);
2026 LocZeEventList = ZeEventList;
2027 LocPiEventList = PiEventList;
2030 ZeEventList =
nullptr;
2031 PiEventList =
nullptr;
2036 for (
pi_uint32 I = 0; I < LocLength; I++) {
2038 EventsToBeReleased.push_back(LocPiEventList[I]);
2041 if (LocZeEventList !=
nullptr) {
2042 delete[] LocZeEventList;
2044 if (LocPiEventList !=
nullptr) {
2045 delete[] LocPiEventList;
2058 ze_module_build_log_handle_t *ZeBuildLog);
2066 size_t ParamValueSize,
void *ParamValue,
2067 size_t *ParamValueSizeRet) {
2068 zePrint(
"==========================\n");
2069 zePrint(
"SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str());
2070 zePrint(
"==========================\n");
2074 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2075 return ReturnValue(
"Intel(R) Level-Zero");
2078 ParamValue, ParamValueSizeRet);
2083 PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2084 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2086 auto ZeDriver = pi_cast<ze_driver_handle_t *>(NativeHandle);
2088 *ZeDriver = Platform->ZeDriver;
2094 PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM);
2095 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2097 auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
2101 if (Res != PI_SUCCESS) {
2106 std::vector<pi_platform> Platforms(NumPlatforms);
2114 for (
const pi_platform &CachedPlatform : Platforms) {
2115 if (CachedPlatform->ZeDriver == ZeDriver) {
2116 *Platform = CachedPlatform;
2122 return PI_ERROR_INVALID_VALUE;
2145 size_t ParamValueSize,
void *ParamValue,
2146 size_t *ParamValueSizeRet) {
2165 PI_ASSERT(SelectedBinaryInd, PI_ERROR_INVALID_VALUE);
2166 PI_ASSERT(NumBinaries == 0 || Binaries, PI_ERROR_INVALID_VALUE);
2189 for (
pi_uint32 i = 0; i < NumBinaries; ++i) {
2190 if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) {
2191 *SelectedBinaryInd = i;
2194 if (strcmp(Binaries[i]->DeviceTargetSpec,
2199 if ((*SelectedBinaryInd = Spirv) != InvalidInd)
2203 return PI_ERROR_INVALID_BINARY;
2209 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2211 auto ZeDevice = pi_cast<ze_device_handle_t *>(NativeHandle);
2213 *ZeDevice =
Device->ZeDevice;
2221 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2223 auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
2238 Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice);
2241 PI_ASSERT(!Platform || Platform == ThePlatform,
2242 PI_ERROR_INVALID_PLATFORM);
2248 return PI_ERROR_INVALID_VALUE;
2256 void (*PFnNotify)(
const char *ErrInfo,
2257 const void *PrivateInfo,
size_t CB,
2263 PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
2264 PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
2265 PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
2269 ContextDesc.flags = 0;
2272 ZE_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext));
2274 *RetContext =
new _pi_context(ZeContext, NumDevices, Devices,
true);
2275 (*RetContext)->initialize();
2276 if (IndirectAccessTrackingEnabled) {
2277 std::scoped_lock<pi_shared_mutex> Lock(Platform->
ContextsMutex);
2278 Platform->
Contexts.push_back(*RetContext);
2280 }
catch (
const std::bad_alloc &) {
2281 return PI_ERROR_OUT_OF_HOST_MEMORY;
2283 return PI_ERROR_UNKNOWN;
2290 size_t ParamValueSize,
void *ParamValue,
2291 size_t *ParamValueSizeRet) {
2295 std::shared_lock<pi_shared_mutex> Lock(
Context->Mutex);
2296 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2297 switch (ParamName) {
2307 return ReturnValue(
pi_bool{UseMemcpy2DOperations});
2311 return ReturnValue(
pi_bool{
false});
2319 UR_RESULT_ERROR_INVALID_VALUE);
2320 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
2324 die(
"piGetContextInfo: unsuppported ParamName.");
2337 die(
"piextContextSetExtendedDeleter: not supported");
2344 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2346 auto ZeContext = pi_cast<ze_context_handle_t *>(NativeHandle);
2355 bool OwnNativeHandle,
2357 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2358 PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE);
2359 PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE);
2360 PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE);
2363 *RetContext =
new _pi_context(pi_cast<ze_context_handle_t>(NativeHandle),
2364 NumDevices, Devices, OwnNativeHandle);
2365 (*RetContext)->initialize();
2366 }
catch (
const std::bad_alloc &) {
2367 return PI_ERROR_OUT_OF_HOST_MEMORY;
2369 return PI_ERROR_UNKNOWN;
2379 Context->RefCount.increment();
2390 if (!
Context->RefCount.decrementAndTest())
2393 if (IndirectAccessTrackingEnabled) {
2396 auto It = std::find(Contexts.begin(), Contexts.end(),
Context);
2397 if (It != Contexts.end())
2415 if (DestoryZeContext)
2416 ZE_CALL(zeContextDestroy, (DestoryZeContext));
2423 std::unique_lock<pi_shared_mutex> ContextsLock(Plt->
ContextsMutex,
2425 if (IndirectAccessTrackingEnabled)
2426 ContextsLock.lock();
2445 PI_ERROR_INVALID_VALUE);
2458 PI_ERROR_INVALID_VALUE);
2461 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2467 std::vector<ze_command_queue_handle_t> ZeComputeCommandQueues(
2468 Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues,
2474 size_t NumCopyGroups = 0;
2475 if (
Device->hasMainCopyEngine()) {
2476 NumCopyGroups +=
Device->QueueGroup[_pi_queue::queue_type::MainCopy]
2477 .ZeProperties.numQueues;
2479 if (
Device->hasLinkCopyEngine()) {
2480 NumCopyGroups +=
Device->QueueGroup[_pi_queue::queue_type::LinkCopy]
2481 .ZeProperties.numQueues;
2483 std::vector<ze_command_queue_handle_t> ZeCopyCommandQueues(NumCopyGroups,
2487 *Queue =
new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues,
Context,
2488 Device,
true, Flags, ForceComputeIndex);
2489 }
catch (
const std::bad_alloc &) {
2490 return PI_ERROR_OUT_OF_HOST_MEMORY;
2492 return PI_ERROR_UNKNOWN;
2499 auto warmupQueueGroup = [Q](
bool UseCopyEngine,
2502 while (RepeatCount--) {
2503 if (Q->
Device->ImmCommandListUsed) {
2507 for (
int I = 0; I < 10; ++I) {
2510 std::vector<pi_event> EventsUnused;
2525 PI_CALL(warmupQueueGroup(
false, InitialGroup.UpperIndex -
2526 InitialGroup.LowerIndex + 1));
2529 PI_CALL(warmupQueueGroup(
true, InitialGroup.UpperIndex -
2530 InitialGroup.LowerIndex + 1));
2538 size_t ParamValueSize,
void *ParamValue,
2539 size_t *ParamValueSizeRet) {
2541 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2543 std::shared_lock<pi_shared_mutex> Lock(Queue->Mutex);
2544 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
2546 switch (ParamName) {
2548 return ReturnValue(Queue->
Context);
2550 return ReturnValue(Queue->
Device);
2552 return ReturnValue(
pi_uint32{Queue->RefCount.load()});
2554 die(
"PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n");
2557 die(
"PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n");
2560 die(
"PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo not implemented\n");
2566 return ReturnValue(
pi_bool{
true});
2576 if (ZeResult == ZE_RESULT_NOT_READY) {
2577 return ReturnValue(
pi_bool{
false});
2578 }
else if (ZeResult != ZE_RESULT_SUCCESS) {
2579 return mapError(ZeResult);
2581 return ReturnValue(
pi_bool{
true});
2587 if (Queue->
Device->ImmCommandListUsed)
2588 return ReturnValue(
pi_bool{
false});
2594 using IsCopy = bool;
2597 return ReturnValue(
pi_bool{
false});
2599 for (
const auto &QueueMap :
2601 for (
const auto &QueueGroup : QueueMap) {
2602 if (Queue->
Device->ImmCommandListUsed) {
2607 for (
const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) {
2611 auto EventList = ImmCmdList->second.EventList;
2612 for (
auto It = EventList.crbegin(); It != EventList.crend(); It++) {
2613 ze_result_t ZeResult =
2615 if (ZeResult == ZE_RESULT_NOT_READY) {
2616 return ReturnValue(
pi_bool{
false});
2617 }
else if (ZeResult != ZE_RESULT_SUCCESS) {
2618 return mapError(ZeResult);
2623 for (
const auto &ZeQueue : QueueGroup.second.ZeQueues) {
2630 if (ZeResult == ZE_RESULT_NOT_READY) {
2631 return ReturnValue(
pi_bool{
false});
2632 }
else if (ZeResult != ZE_RESULT_SUCCESS) {
2633 return mapError(ZeResult);
2639 return ReturnValue(
pi_bool{
true});
2642 zePrint(
"Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n",
2643 ParamName, ParamName);
2644 return PI_ERROR_INVALID_VALUE;
2652 std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2655 Queue->RefCount.increment();
2660 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2661 std::vector<pi_event> EventListToCleanup;
2664 std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2692 if (it->second.ZeFence ==
nullptr || it->second.ZeFenceInUse) {
2699 if (Queue->
Healthy && it->second.ZeFence !=
nullptr)
2700 ZE_CALL(zeFenceDestroy, (it->second.ZeFence));
2705 for (
auto &Event : EventListToCleanup) {
2709 std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
2710 Event->Completed =
true;
2722 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2724 if (!Queue->RefCount.decrementAndTest())
2728 for (
auto &Event : Cache)
2732 for (
auto &QueueMap :
2734 for (
auto &QueueGroup : QueueMap)
2735 for (
auto &ZeQueue : QueueGroup.second.ZeQueues)
2737 ZE_CALL(zeCommandQueueDestroy, (ZeQueue));
2740 zePrint(
"piQueueRelease(compute) NumTimesClosedFull %d, "
2741 "NumTimesClosedEarly %d\n",
2744 zePrint(
"piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly "
2756 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2758 if (Queue->
Device->ImmCommandListUsed) {
2760 std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2764 std::unique_lock<pi_shared_mutex> Lock(Queue->Mutex);
2765 std::vector<ze_command_queue_handle_t> ZeQueues;
2772 for (
auto &QueueMap :
2774 for (
auto &QueueGroup : QueueMap)
2775 std::copy(QueueGroup.second.ZeQueues.begin(),
2776 QueueGroup.second.ZeQueues.end(),
2777 std::back_inserter(ZeQueues));
2787 static bool HoldLock =
2788 std::getenv(
"SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") !=
nullptr;
2793 for (
auto &ZeQueue : ZeQueues) {
2795 ZE_CALL(zeHostSynchronize, (ZeQueue));
2802 std::scoped_lock<pi_shared_mutex> Lock(Queue->Mutex);
2813 if (!Queue->
Device->ImmCommandListUsed)
2827 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2828 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2831 std::shared_lock<pi_shared_mutex> lock(Queue->Mutex);
2833 auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
2836 uint32_t QueueGroupOrdinalUnused;
2837 auto TID = std::this_thread::get_id();
2839 const auto &Result =
2841 auto &ComputeQueueGroupRef = Result.first->second;
2843 *ZeQueue = ComputeQueueGroupRef.getZeQueue(&QueueGroupOrdinalUnused);
2849 bool OwnNativeHandle,
2852 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
2853 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
2856 auto ZeQueue = pi_cast<ze_command_queue_handle_t>(NativeHandle);
2858 std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
2863 std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
2874 std::unique_lock<pi_shared_mutex> ContextsLock(Plt->
ContextsMutex,
2876 if (IndirectAccessTrackingEnabled) {
2881 ContextsLock.lock();
2889 ze_device_mem_alloc_desc_t ZeDesc = {};
2895 if (IndirectAccessTrackingEnabled) {
2898 std::forward_as_tuple(*ResultPtr),
2899 std::forward_as_tuple(
Context));
2909 std::unique_lock<pi_shared_mutex> ContextsLock(Plt->
ContextsMutex,
2911 if (IndirectAccessTrackingEnabled) {
2916 ContextsLock.lock();
2928 if (IndirectAccessTrackingEnabled) {
2931 std::forward_as_tuple(*ResultPtr),
2932 std::forward_as_tuple(
Context));
2938 void *HostPtr,
pi_mem *RetMem,
2944 die(
"piMemBufferCreate: Level-Zero supports read-write and read-only "
2946 "but not other accesses (such as write-only) yet.");
2950 PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE);
2952 if (properties !=
nullptr) {
2953 die(
"piMemBufferCreate: no mem properties goes to Level-Zero RT yet");
2970 bool HostPtrImported =
false;
2976 ZE_CALL(zeMemGetAllocProperties,
2981 if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
2985 HostPtrImported =
true;
2990 auto HostPtrOrNull =
2994 }
catch (
const std::bad_alloc &) {
2995 return PI_ERROR_OUT_OF_HOST_MEMORY;
2997 return PI_ERROR_UNKNOWN;
3014 if (!HostPtrImported)
3015 memcpy(ZeHandleDst, HostPtr, Size);
3021 ZE_CALL(zeCommandListAppendMemoryCopy,
3023 nullptr, 0,
nullptr));
3028 die(
"piMemBufferCreate: not implemented");
3037 void *ParamValue,
size_t *ParamValueSizeRet) {
3042 std::shared_lock<pi_shared_mutex> Lock(Mem->Mutex);
3043 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3045 switch (ParamName) {
3047 return ReturnValue(Mem->
Context);
3050 auto Buffer = pi_cast<pi_buffer>(Mem);
3051 return ReturnValue(
size_t{Buffer->Size});
3054 die(
"piMemGetInfo: Parameter is not implemented");
3061 PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3063 Mem->RefCount.increment();
3072 std::unique_lock<pi_shared_mutex> ContextsLock(Plt->
ContextsMutex,
3074 if (IndirectAccessTrackingEnabled) {
3075 ContextsLock.lock();
3078 die(
"All memory allocations must be tracked!");
3080 if (!It->second.RefCount.decrementAndTest()) {
3092 if (IndirectAccessTrackingEnabled)
3099 bool OwnZeMemHandle =
true);
3102 PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3104 if (!Mem->RefCount.decrementAndTest())
3108 char *ZeHandleImage;
3110 ZE_CALL(zeImageDestroy, (pi_cast<ze_image_handle_t>(ZeHandleImage)));
3112 auto Buffer =
static_cast<pi_buffer>(Mem);
3127 die(
"piMemImageCreate: Level-Zero implements only read-write buffer,"
3128 "no read-only or write-only yet.");
3131 PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE);
3132 PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
3134 ze_image_format_type_t ZeImageFormatType;
3135 size_t ZeImageFormatTypeSize;
3138 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
3139 ZeImageFormatTypeSize = 32;
3142 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
3143 ZeImageFormatTypeSize = 16;
3146 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3147 ZeImageFormatTypeSize = 32;
3150 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3151 ZeImageFormatTypeSize = 16;
3154 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
3155 ZeImageFormatTypeSize = 8;
3158 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
3159 ZeImageFormatTypeSize = 16;
3162 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM;
3163 ZeImageFormatTypeSize = 8;
3166 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3167 ZeImageFormatTypeSize = 32;
3170 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3171 ZeImageFormatTypeSize = 16;
3174 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT;
3175 ZeImageFormatTypeSize = 8;
3178 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
3179 ZeImageFormatTypeSize = 16;
3182 ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM;
3183 ZeImageFormatTypeSize = 8;
3186 zePrint(
"piMemImageCreate: unsupported image data type: data type = %d\n",
3188 return PI_ERROR_INVALID_VALUE;
3192 ze_image_format_layout_t ZeImageFormatLayout;
3195 switch (ZeImageFormatTypeSize) {
3197 ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8;
3200 ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16;
3203 ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
3206 zePrint(
"piMemImageCreate: unexpected data type Size\n");
3207 return PI_ERROR_INVALID_VALUE;
3212 die(
"piMemImageCreate: unsupported image format layout\n");
3216 ze_image_format_t ZeFormatDesc = {
3217 ZeImageFormatLayout, ZeImageFormatType,
3219 ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G,
3220 ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A};
3222 ze_image_type_t ZeImageType;
3225 ZeImageType = ZE_IMAGE_TYPE_1D;
3228 ZeImageType = ZE_IMAGE_TYPE_2D;
3231 ZeImageType = ZE_IMAGE_TYPE_3D;
3234 ZeImageType = ZE_IMAGE_TYPE_1DARRAY;
3237 ZeImageType = ZE_IMAGE_TYPE_2DARRAY;
3240 zePrint(
"piMemImageCreate: unsupported image type\n");
3241 return PI_ERROR_INVALID_VALUE;
3245 ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
3246 ZeImageDesc.type = ZeImageType;
3247 ZeImageDesc.format = ZeFormatDesc;
3248 ZeImageDesc.width = pi_cast<uint32_t>(ImageDesc->
image_width);
3249 ZeImageDesc.height = pi_cast<uint32_t>(ImageDesc->
image_height);
3250 ZeImageDesc.depth = pi_cast<uint32_t>(ImageDesc->
image_depth);
3254 std::shared_lock<pi_shared_mutex> Lock(
Context->Mutex);
3268 *RetImage = ZePIImage;
3271 ZePIImage->ZeImageDesc = ZeImageDesc;
3281 ZE_CALL(zeCommandListAppendImageCopyFromMemory,
3285 }
catch (
const std::bad_alloc &) {
3286 return PI_ERROR_OUT_OF_HOST_MEMORY;
3288 return PI_ERROR_UNKNOWN;
3294 PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
3295 std::shared_lock<pi_shared_mutex> Guard(Mem->Mutex);
3298 *NativeHandle = pi_cast<pi_native_handle>(ZeHandle);
3304 bool ownNativeHandle,
pi_mem *Mem) {
3306 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3309 std::shared_lock<pi_shared_mutex> Lock(
Context->Mutex);
3314 void *Ptr = pi_cast<void *>(NativeHandle);
3316 PI_ASSERT(Ptr == Base, PI_ERROR_INVALID_VALUE);
3320 ZE_CALL(zeMemGetAllocProperties,
3324 switch (ZeMemProps.type) {
3325 case ZE_MEMORY_TYPE_HOST:
3326 case ZE_MEMORY_TYPE_SHARED:
3327 case ZE_MEMORY_TYPE_DEVICE:
3329 case ZE_MEMORY_TYPE_UNKNOWN:
3331 return PI_ERROR_INVALID_CONTEXT;
3333 die(
"Unexpected memory type");
3347 std::unique_lock<pi_shared_mutex> ContextsLock(Plt->
ContextsMutex,
3353 if (IndirectAccessTrackingEnabled && ownNativeHandle) {
3355 ContextsLock.lock();
3361 std::piecewise_construct, std::forward_as_tuple(Ptr),
3362 std::forward_as_tuple(
Context, ownNativeHandle));
3364 }
catch (
const std::bad_alloc &) {
3365 return PI_ERROR_OUT_OF_HOST_MEMORY;
3367 return PI_ERROR_UNKNOWN;
3371 auto Buffer = pi_cast<pi_buffer>(*Mem);
3375 }
else if (Buffer->OnHost) {
3388 ZE_CALL(zeCommandListAppendMemoryCopy,
3400 PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE);
3401 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3408 }
catch (
const std::bad_alloc &) {
3409 return PI_ERROR_OUT_OF_HOST_MEMORY;
3411 return PI_ERROR_UNKNOWN;
3418 const size_t *Lengths,
const unsigned char **Binaries,
3422 (void)NumMetadataEntries;
3425 PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE);
3426 PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE);
3427 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3430 if (NumDevices != 1) {
3431 zePrint(
"piProgramCreateWithBinary: level_zero supports only one device.");
3432 return PI_ERROR_INVALID_VALUE;
3434 if (!Binaries[0] || !Lengths[0]) {
3436 *BinaryStatus = PI_ERROR_INVALID_VALUE;
3437 return PI_ERROR_INVALID_VALUE;
3440 size_t Length = Lengths[0];
3441 auto Binary = Binaries[0];
3456 }
catch (
const std::bad_alloc &) {
3457 return PI_ERROR_OUT_OF_HOST_MEMORY;
3459 return PI_ERROR_UNKNOWN;
3463 *BinaryStatus = PI_SUCCESS;
3468 const char **Strings,
3469 const size_t *Lengths,
3477 zePrint(
"piclProgramCreateWithSource: not supported in Level Zero\n");
3478 return PI_ERROR_INVALID_OPERATION;
3482 size_t ParamValueSize,
void *ParamValue,
3483 size_t *ParamValueSizeRet) {
3485 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3487 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3488 switch (ParamName) {
3490 return ReturnValue(
pi_uint32{Program->RefCount.load()});
3498 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3505 ZE_CALL(zeModuleGetNativeBinary, (Program->
ZeModule, &SzBinary,
nullptr));
3507 return PI_ERROR_INVALID_PROGRAM;
3510 return ReturnValue(
size_t{SzBinary});
3517 uint8_t **PBinary = pi_cast<uint8_t **>(ParamValue);
3521 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3527 size_t SzBinary = 0;
3528 ZE_CALL(zeModuleGetNativeBinary,
3529 (Program->
ZeModule, &SzBinary, PBinary[0]));
3531 return PI_ERROR_INVALID_PROGRAM;
3536 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3537 uint32_t NumKernels;
3541 return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
3544 ZE_CALL(zeModuleGetKernelNames,
3545 (Program->
ZeModule, &NumKernels,
nullptr));
3547 return PI_ERROR_INVALID_PROGRAM;
3549 return ReturnValue(
size_t{NumKernels});
3553 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3554 std::string PINames{
""};
3558 return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
3561 ZE_CALL(zeModuleGetKernelNames, (Program->
ZeModule, &Count,
nullptr));
3562 std::unique_ptr<const char *[]> PNames(
new const char *[Count]);
3563 ZE_CALL(zeModuleGetKernelNames,
3564 (Program->
ZeModule, &Count, PNames.get()));
3565 for (uint32_t I = 0; I < Count; ++I) {
3566 PINames += (I > 0 ?
";" :
"");
3567 PINames += PNames[I];
3570 return PI_ERROR_INVALID_PROGRAM;
3572 return ReturnValue(PINames.c_str());
3573 }
catch (
const std::bad_alloc &) {
3574 return PI_ERROR_OUT_OF_HOST_MEMORY;
3576 return PI_ERROR_UNKNOWN;
3579 die(
"piProgramGetInfo: not implemented");
3586 const pi_device *DeviceList,
const char *Options,
3589 void (*PFnNotify)(
pi_program Program,
void *UserData),
3592 if (NumDevices != 1) {
3593 zePrint(
"piProgramLink: level_zero supports only one device.");
3594 return PI_ERROR_INVALID_VALUE;
3599 if (Options && *Options !=
'\0') {
3601 "Level Zero does not support kernel link flags: \"");
3606 *RetProgram = Program;
3607 return PI_ERROR_LINK_PROGRAM_FAILURE;
3611 PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE);
3613 PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
3614 if (NumInputPrograms == 0 || InputPrograms ==
nullptr)
3615 return PI_ERROR_INVALID_VALUE;
3630 std::vector<std::shared_lock<pi_shared_mutex>> Guards(NumInputPrograms);
3631 for (
pi_uint32 I = 0; I < NumInputPrograms; I++) {
3632 std::shared_lock<pi_shared_mutex> Guard(InputPrograms[I]->Mutex);
3633 Guards[I].swap(Guard);
3635 return PI_ERROR_INVALID_OPERATION;
3647 std::vector<size_t> CodeSizes(NumInputPrograms);
3648 std::vector<const uint8_t *> CodeBufs(NumInputPrograms);
3649 std::vector<const char *> BuildFlagPtrs(NumInputPrograms);
3650 std::vector<const ze_module_constants_t *> SpecConstPtrs(NumInputPrograms);
3651 std::vector<_pi_program::SpecConstantShim> SpecConstShims;
3652 SpecConstShims.reserve(NumInputPrograms);
3654 for (
pi_uint32 I = 0; I < NumInputPrograms; I++) {
3657 CodeBufs[I] = Program->
Code.get();
3658 BuildFlagPtrs[I] = Program->
BuildFlags.c_str();
3659 SpecConstShims.emplace_back(Program);
3660 SpecConstPtrs[I] = SpecConstShims[I].ze();
3663 ZeExtModuleDesc.count = NumInputPrograms;
3664 ZeExtModuleDesc.inputSizes = CodeSizes.data();
3665 ZeExtModuleDesc.pInputModules = CodeBufs.data();
3666 ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data();
3667 ZeExtModuleDesc.pConstants = SpecConstPtrs.data();
3670 ZeModuleDesc.pNext = &ZeExtModuleDesc;
3671 ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
3681 ZeModuleDesc.pInputModule =
reinterpret_cast<const uint8_t *
>(1);
3682 ZeModuleDesc.inputSize = 1;
3696 if (!DeviceList[0]->Platform->ZeDriverModuleProgramExtensionFound ||
3697 (NumInputPrograms == 1)) {
3698 if (NumInputPrograms == 1) {
3699 ZeModuleDesc.pNext =
nullptr;
3700 ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0];
3701 ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0];
3702 ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0];
3703 ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0];
3705 zePrint(
"piProgramLink: level_zero driver does not have static linking "
3707 return PI_ERROR_INVALID_VALUE;
3715 ze_module_build_log_handle_t ZeBuildLog =
nullptr;
3716 ze_result_t ZeResult =
3718 &ZeModule, &ZeBuildLog));
3724 if (ZeResult != ZE_RESULT_SUCCESS &&
3725 ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) {
3736 if (ZeResult == ZE_RESULT_SUCCESS) {
3738 if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) {
3739 PiResult = PI_ERROR_LINK_PROGRAM_FAILURE;
3740 }
else if (ZeResult != ZE_RESULT_SUCCESS) {
3741 return mapError(ZeResult);
3748 }
catch (
const std::bad_alloc &) {
3749 return PI_ERROR_OUT_OF_HOST_MEMORY;
3751 return PI_ERROR_UNKNOWN;
3758 const char *Options,
pi_uint32 NumInputHeaders,
3759 const pi_program *InputHeaders,
const char **HeaderIncludeNames,
3760 void (*PFnNotify)(
pi_program Program,
void *UserData),
void *UserData) {
3761 (void)NumInputHeaders;
3763 (void)HeaderIncludeNames;
3765 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3767 if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
3768 return PI_ERROR_INVALID_VALUE;
3771 PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
3773 std::scoped_lock<pi_shared_mutex> Guard(Program->Mutex);
3781 return PI_ERROR_INVALID_OPERATION;
3796 const pi_device *DeviceList,
const char *Options,
3797 void (*PFnNotify)(
pi_program Program,
void *UserData),
3800 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3801 if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
3802 return PI_ERROR_INVALID_VALUE;
3807 if (NumDevices != 1) {
3808 zePrint(
"piProgramBuild: level_zero supports only one device.");
3809 return PI_ERROR_INVALID_VALUE;
3813 PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
3815 std::scoped_lock<pi_shared_mutex> Guard(Program->Mutex);
3819 PI_ERROR_INVALID_VALUE);
3825 return PI_ERROR_INVALID_OPERATION;
3834 ? ZE_MODULE_FORMAT_IL_SPIRV
3835 : ZE_MODULE_FORMAT_NATIVE;
3836 ZeModuleDesc.inputSize = Program->
CodeLength;
3837 ZeModuleDesc.pInputModule = Program->
Code.get();
3838 ZeModuleDesc.pBuildFlags = Options;
3839 ZeModuleDesc.pConstants = Shim.
ze();
3847 ze_result_t ZeResult =
3850 if (ZeResult != ZE_RESULT_SUCCESS) {
3854 Result = mapError(ZeResult);
3870 if (ZeResult != ZE_RESULT_SUCCESS) {
3872 Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE)
3873 ? PI_ERROR_BUILD_PROGRAM_FAILURE
3874 : mapError(ZeResult);
3883 Program->
Code.reset();
3890 size_t ParamValueSize,
void *ParamValue,
3891 size_t *ParamValueSizeRet) {
3894 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3895 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
3911 return ReturnValue(
"");
3920 size_t LogSize = ParamValueSize;
3921 ZE_CALL(zeModuleBuildLogGetString,
3922 (Program->
ZeBuildLog, &LogSize, pi_cast<char *>(ParamValue)));
3923 if (ParamValueSizeRet) {
3924 *ParamValueSizeRet = LogSize;
3932 return ReturnValue(
"");
3934 zePrint(
"piProgramGetBuildInfo: unsupported ParamName\n");
3935 return PI_ERROR_INVALID_VALUE;
3941 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3942 Program->RefCount.increment();
3947 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3949 if (!Program->RefCount.decrementAndTest())
3959 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3960 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3962 auto ZeModule = pi_cast<ze_module_handle_t *>(NativeHandle);
3964 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
3965 switch (Program->
State) {
3972 return PI_ERROR_INVALID_OPERATION;
3980 bool ownNativeHandle,
3982 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
3983 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
3986 auto ZeModule = pi_cast<ze_module_handle_t>(NativeHandle);
3995 }
catch (
const std::bad_alloc &) {
3996 return PI_ERROR_OUT_OF_HOST_MEMORY;
3998 return PI_ERROR_UNKNOWN;
4011 if (ZeModule && OwnZeModule) {
4029 ze_module_build_log_handle_t *ZeBuildLog) {
4036 ze_result_t ZeResult =
4038 if (ZeResult != ZE_RESULT_SUCCESS)
4044 if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) {
4045 return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog));
4047 return ZE_RESULT_SUCCESS;
4053 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4054 PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE);
4055 PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE);
4057 std::shared_lock<pi_shared_mutex> Guard(Program->Mutex);
4059 return PI_ERROR_INVALID_PROGRAM_EXECUTABLE;
4063 ZeKernelDesc.flags = 0;
4064 ZeKernelDesc.pKernelName = KernelName;
4067 ZE_CALL(zeKernelCreate, (Program->
ZeModule, &ZeKernelDesc, &ZeKernel));
4070 *RetKernel =
new _pi_kernel(ZeKernel,
true, Program);
4071 }
catch (
const std::bad_alloc &) {
4072 return PI_ERROR_OUT_OF_HOST_MEMORY;
4074 return PI_ERROR_UNKNOWN;
4077 PI_CALL((*RetKernel)->initialize());
4084 if (IndirectAccessTrackingEnabled)
4089 ZeKernelProperties.Compute = [
this](ze_kernel_properties_t &
Properties) {
4094 ZeKernelName.Compute = [
this](std::string &Name) {
4097 char *KernelName =
new char[Size];
4100 delete[] KernelName;
4107 const void *ArgValue) {
4116 if (ArgSize ==
sizeof(
void *) && ArgValue &&
4117 *(
void **)(
const_cast<void *
>(ArgValue)) ==
nullptr) {
4121 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4123 std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4124 ZE_CALL(zeKernelSetArgumentValue,
4125 (pi_cast<ze_kernel_handle_t>(Kernel->
ZeKernel),
4126 pi_cast<uint32_t>(ArgIndex), pi_cast<size_t>(ArgSize),
4127 pi_cast<const void *>(ArgValue)));
4134 const pi_mem *ArgValue) {
4140 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4151 std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4154 auto Arg = ArgValue ? *ArgValue :
nullptr;
4164 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4166 std::scoped_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4167 ZE_CALL(zeKernelSetArgumentValue,
4168 (pi_cast<ze_kernel_handle_t>(Kernel->
ZeKernel),
4169 pi_cast<uint32_t>(ArgIndex),
sizeof(
void *),
4170 &(*ArgValue)->ZeSampler));
4176 size_t ParamValueSize,
void *ParamValue,
4177 size_t *ParamValueSizeRet) {
4178 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4180 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4182 std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4183 switch (ParamName) {
4190 std::string &KernelName = *Kernel->
ZeKernelName.operator->();
4191 return ReturnValue(
static_cast<const char *
>(KernelName.c_str()));
4192 }
catch (
const std::bad_alloc &) {
4193 return PI_ERROR_OUT_OF_HOST_MEMORY;
4195 return PI_ERROR_UNKNOWN;
4200 return ReturnValue(
pi_uint32{Kernel->RefCount.load()});
4204 ZE_CALL(zeKernelGetSourceAttributes, (Kernel->
ZeKernel, &Size,
nullptr));
4205 char *attributes =
new char[Size];
4206 ZE_CALL(zeKernelGetSourceAttributes,
4207 (Kernel->
ZeKernel, &Size, &attributes));
4208 auto Res = ReturnValue(attributes);
4209 delete[] attributes;
4211 }
catch (
const std::bad_alloc &) {
4212 return PI_ERROR_OUT_OF_HOST_MEMORY;
4214 return PI_ERROR_UNKNOWN;
4217 zePrint(
"Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n",
4218 ParamName, ParamName);
4219 return PI_ERROR_INVALID_VALUE;
4227 size_t ParamValueSize,
void *ParamValue,
4228 size_t *ParamValueSizeRet) {
4229 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4232 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4234 std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4235 switch (ParamName) {
4240 } WorkSize = {{
Device->ZeDeviceComputeProperties->maxGroupSizeX,
4241 Device->ZeDeviceComputeProperties->maxGroupSizeY,
4242 Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
4243 return ReturnValue(WorkSize);
4257 return ReturnValue(WgSize);
4262 return ReturnValue(
size_t{
Device->ZeDeviceProperties->physicalEUSimdWidth});
4267 die(
"PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not "
4272 zePrint(
"Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n",
4273 ParamName, ParamName);
4274 return PI_ERROR_INVALID_VALUE;
4281 size_t InputValueSize,
const void *InputValue,
4282 size_t ParamValueSize,
void *ParamValue,
4283 size_t *ParamValueSizeRet) {
4285 (void)InputValueSize;
4288 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4290 std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4300 die(
"piKernelGetSubGroupInfo: parameter not implemented");
4308 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4310 Kernel->RefCount.increment();
4315 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4317 if (!Kernel->RefCount.decrementAndTest())
4320 auto KernelProgram = Kernel->
Program;
4323 if (IndirectAccessTrackingEnabled) {
4335 const size_t *GlobalWorkOffset,
4336 const size_t *GlobalWorkSize,
const size_t *LocalWorkSize,
4339 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4340 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
4341 PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);
4344 std::scoped_lock<pi_shared_mutex, pi_shared_mutex, pi_shared_mutex> Lock(
4345 Queue->Mutex, Kernel->Mutex, Kernel->
Program->Mutex);
4346 if (GlobalWorkOffset != NULL) {
4347 if (!Queue->
Device->
Platform->ZeDriverGlobalOffsetExtensionFound) {
4348 zePrint(
"No global offset extension found on this driver\n");
4349 return PI_ERROR_INVALID_VALUE;
4352 ZE_CALL(zeKernelSetGlobalOffsetExp,
4353 (Kernel->
ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
4354 GlobalWorkOffset[2]));
4361 char **ZeHandlePtr =
nullptr;
4363 PI_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
4366 ZE_CALL(zeKernelSetArgumentValue,
4367 (Kernel->
ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
4371 ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
4375 PI_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, PI_ERROR_INVALID_VALUE);
4376 PI_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, PI_ERROR_INVALID_VALUE);
4378 if (LocalWorkSize) {
4379 WG[0] = pi_cast<uint32_t>(LocalWorkSize[0]);
4380 WG[1] = pi_cast<uint32_t>(LocalWorkSize[1]);
4381 WG[2] = pi_cast<uint32_t>(LocalWorkSize[2]);
4385 bool SuggestGroupSize =
true;
4386 for (
int I : {0, 1, 2}) {
4387 if (GlobalWorkSize[I] > UINT32_MAX) {
4388 SuggestGroupSize =
false;
4391 if (SuggestGroupSize) {
4392 ZE_CALL(zeKernelSuggestGroupSize,
4393 (Kernel->
ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
4394 GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
4396 for (
int I : {0, 1, 2}) {
4400 uint32_t GroupSize[] = {
4401 Queue->
Device->ZeDeviceComputeProperties->maxGroupSizeX,
4402 Queue->
Device->ZeDeviceComputeProperties->maxGroupSizeY,
4403 Queue->
Device->ZeDeviceComputeProperties->maxGroupSizeZ};
4404 GroupSize[I] = std::min(
size_t(GroupSize[I]), GlobalWorkSize[I]);
4405 while (GlobalWorkSize[I] % GroupSize[I]) {
4408 if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) {
4409 zePrint(
"piEnqueueKernelLaunch: can't find a WG size "
4410 "suitable for global work size > UINT32_MAX\n");
4411 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4413 WG[I] = GroupSize[I];
4415 zePrint(
"piEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n",
4416 WG[0], WG[1], WG[2]);
4423 ZeThreadGroupDimensions.groupCountX =
4424 pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
4425 ZeThreadGroupDimensions.groupCountY =
4426 pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
4427 ZeThreadGroupDimensions.groupCountZ =
4428 pi_cast<uint32_t>(GlobalWorkSize[2] / WG[2]);
4431 ZeThreadGroupDimensions.groupCountX =
4432 pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
4433 ZeThreadGroupDimensions.groupCountY =
4434 pi_cast<uint32_t>(GlobalWorkSize[1] / WG[1]);
4438 ZeThreadGroupDimensions.groupCountX =
4439 pi_cast<uint32_t>(GlobalWorkSize[0] / WG[0]);
4444 zePrint(
"piEnqueueKernelLaunch: unsupported work_dim\n");
4445 return PI_ERROR_INVALID_VALUE;
4449 if (GlobalWorkSize[0] !=
4450 size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
4451 zePrint(
"piEnqueueKernelLaunch: invalid work_dim. The range is not a "
4452 "multiple of the group size in the 1st dimension\n");
4453 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4455 if (GlobalWorkSize[1] !=
4456 size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
4457 zePrint(
"piEnqueueKernelLaunch: invalid work_dim. The range is not a "
4458 "multiple of the group size in the 2nd dimension\n");
4459 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4461 if (GlobalWorkSize[2] !=
4462 size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
4463 zePrint(
"piEnqueueKernelLaunch: invalid work_dim. The range is not a "
4464 "multiple of the group size in the 3rd dimension\n");
4465 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
4468 ZE_CALL(zeKernelSetGroupSize, (Kernel->
ZeKernel, WG[0], WG[1], WG[2]));
4470 bool UseCopyEngine =
false;
4473 NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
4479 Queue, CommandList, UseCopyEngine,
true ))
4484 bool IsInternal = OutEvent ==
nullptr;
4485 pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
4488 if (Res != PI_SUCCESS)
4490 ZeEvent = (*Event)->ZeEvent;
4491 (*Event)->WaitList = TmpWaitList;
4495 (*Event)->CommandData = (
void *)Kernel;
4504 if (IndirectAccessTrackingEnabled)
4507 if (Queue->
Device->ImmCommandListUsed && IndirectAccessTrackingEnabled) {
4511 std::unique_lock<pi_shared_mutex> ContextsLock(
4519 ContextsLock.lock();
4522 ZE_CALL(zeCommandListAppendLaunchKernel,
4523 (CommandList->first, Kernel->
ZeKernel, &ZeThreadGroupDimensions,
4524 ZeEvent, (*Event)->WaitList.Length,
4525 (*Event)->WaitList.ZeEventList));
4531 ZE_CALL(zeCommandListAppendLaunchKernel,
4532 (CommandList->first, Kernel->
ZeKernel, &ZeThreadGroupDimensions,
4533 ZeEvent, (*Event)->WaitList.Length,
4534 (*Event)->WaitList.ZeEventList));
4537 zePrint(
"calling zeCommandListAppendLaunchKernel() with"
4539 pi_cast<std::uintptr_t>(ZeEvent));
4553 bool OwnNativeHandle,
4556 PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM);
4557 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4558 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4560 auto ZeKernel = pi_cast<ze_kernel_handle_t>(NativeHandle);
4561 *Kernel =
new _pi_kernel(ZeKernel, OwnNativeHandle, Program);
4562 PI_CALL((*Kernel)->initialize());
4568 PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
4569 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
4571 std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
4572 auto *ZeKernel = pi_cast<ze_kernel_handle_t *>(NativeHandle);
4582 PI_ASSERT(Queue, PI_ERROR_INVALID_EVENT);
4584 std::scoped_lock<pi_shared_mutex, pi_shared_mutex> Lock(Queue->Mutex,
4587 if (!HostVisibleEvent) {
4589 die(
"getOrCreateHostVisibleEvent: missing host-visible event");
4598 bool OkToBatch =
true;
4601 if (
auto Res = Queue->Context->getAvailableCommandList(
4602 Queue, CommandList,
false , OkToBatch))
4609 if (Res != PI_SUCCESS)
4612 ZE_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent));
4613 ZE_CALL(zeCommandListAppendSignalEvent,
4614 (CommandList->first, HostVisibleEvent->ZeEvent));
4616 if (
auto Res = Queue->executeCommandList(CommandList,
false, OkToBatch))
4620 ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
4628 CommandData =
nullptr;
4633 CommandList = std::nullopt;
4635 if (!isHostVisible())
4636 HostVisibleEvent =
nullptr;
4638 ZE_CALL(zeEventHostReset, (ZeEvent));
4643 bool WithProfiling) {
4644 std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
4645 auto Cache = getEventCache(HostVisible, WithProfiling);
4649 auto It = Cache->begin();
4658 std::scoped_lock<pi_mutex> Lock(EventCacheMutex);
4661 Cache->emplace_back(Event);
4670 bool HostVisible,
pi_event *RetEvent) {
4671 bool ProfilingEnabled =
4674 if (
auto CachedEvent =
4676 *RetEvent = CachedEvent;
4681 ze_event_pool_handle_t ZeEventPool = {};
4686 ZeEventPool, Index, HostVisible, ProfilingEnabled))
4690 ZeEventDesc.index = Index;
4691 ZeEventDesc.wait = 0;
4694 ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4704 ZeEventDesc.signal = 0;
4707 ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
4710 PI_ASSERT(RetEvent, PI_ERROR_INVALID_VALUE);
4714 }
catch (
const std::bad_alloc &) {
4715 return PI_ERROR_OUT_OF_HOST_MEMORY;
4717 return PI_ERROR_UNKNOWN;
4721 (*RetEvent)->HostVisibleEvent = *RetEvent;
4729 (*RetEvent)->RefCountExternal++;
4730 if (Result != PI_SUCCESS)
4732 ZE_CALL(zeEventHostSignal, ((*RetEvent)->ZeEvent));
4737 size_t ParamValueSize,
void *ParamValue,
4738 size_t *ParamValueSizeRet) {
4740 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
4742 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4743 switch (ParamName) {
4745 std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4749 std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4753 std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4754 return ReturnValue(pi_cast<pi_uint64>(Event->
CommandType));
4762 auto Queue = Event->
Queue;
4765 std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
4766 const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
4767 if (OpenCommandList != Queue->CommandListMap.end()) {
4768 if (
auto Res = Queue->executeOpenCommandList(
4769 OpenCommandList->second.isCopy(Queue)))
4782 std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4786 }
else if (HostVisibleEvent) {
4787 ze_result_t ZeResult;
4790 if (ZeResult == ZE_RESULT_SUCCESS) {
4794 return ReturnValue(pi_cast<pi_int32>(Result));
4797 return ReturnValue(
pi_uint32{Event->RefCount.load()});
4799 zePrint(
"Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n",
4800 ParamName, ParamName);
4801 return PI_ERROR_INVALID_VALUE;
4808 size_t ParamValueSize,
void *ParamValue,
4809 size_t *ParamValueSizeRet) {
4811 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
4813 std::shared_lock<pi_shared_mutex> EventLock(Event->Mutex);
4816 return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
4822 uint64_t ZeTimerResolution =
Device->ZeDeviceProperties->timerResolution;
4823 const uint64_t TimestampMaxValue =
4824 ((1ULL <<
Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL);
4826 ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
4828 ze_kernel_timestamp_result_t tsResult;
4830 switch (ParamName) {
4832 ZE_CALL(zeEventQueryKernelTimestamp, (Event->
ZeEvent, &tsResult));
4833 uint64_t ContextStartTime =
4834 (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution;
4835 return ReturnValue(ContextStartTime);
4838 ZE_CALL(zeEventQueryKernelTimestamp, (Event->
ZeEvent, &tsResult));
4840 uint64_t ContextStartTime =
4841 (tsResult.global.kernelStart & TimestampMaxValue);
4842 uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue);
4850 if (ContextEndTime <= ContextStartTime) {
4851 ContextEndTime += TimestampMaxValue;
4853 ContextEndTime *= ZeTimerResolution;
4854 return ReturnValue(ContextEndTime);
4862 return ReturnValue(uint64_t{0});
4864 zePrint(
"piEventGetProfilingInfo: not supported ParamName\n");
4865 return PI_ERROR_INVALID_VALUE;
4881 std::list<pi_event> EventsToBeReleased;
4882 pi_queue AssociatedQueue =
nullptr;
4884 std::scoped_lock<pi_shared_mutex> EventLock(Event->Mutex);
4889 AssociatedQueue = Event->
Queue;
4895 AssociatedKernel = pi_cast<pi_kernel>(Event->
CommandData);
4902 EventsToBeReleased);
4907 auto ReleaseIndirectMem = [](
pi_kernel Kernel) {
4908 if (IndirectAccessTrackingEnabled) {
4915 pi_platform Plt = Kernel->Program->Context->getPlatform();
4916 std::scoped_lock<pi_shared_mutex> ContextsLock(Plt->
ContextsMutex);
4918 if (--Kernel->SubmissionsCount == 0) {
4921 for (
auto &MemAlloc : Kernel->MemAllocs) {
4924 MemAlloc->second.OwnZeMemHandle);
4926 Kernel->MemAllocs.clear();
4932 if (AssociatedKernel) {
4933 ReleaseIndirectMem(AssociatedKernel);
4937 if (AssociatedQueue) {
4940 std::unique_lock<pi_shared_mutex> QueueLock(AssociatedQueue->Mutex,
4967 while (!EventsToBeReleased.empty()) {
4968 pi_event DepEvent = EventsToBeReleased.front();
4970 EventsToBeReleased.pop_front();
4974 std::scoped_lock<pi_shared_mutex> DepEventLock(DepEvent->Mutex);
4976 EventsToBeReleased);
4977 if (IndirectAccessTrackingEnabled) {
4985 DepEventKernel = pi_cast<pi_kernel>(DepEvent->
CommandData);
4990 if (DepEventKernel) {
4991 ReleaseIndirectMem(DepEventKernel);
5004 if (NumEvents && !EventList) {
5005 return PI_ERROR_INVALID_EVENT;
5007 for (uint32_t I = 0; I < NumEvents; I++) {
5008 if (EventList[I]->Queue->Device->ZeEventsScope ==
5014 if (!EventList[I]->hasExternalRefs())
5015 die(
"piEventsWait must not be called for an internal event");
5019 EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
5024 for (uint32_t I = 0; I < NumEvents; I++) {
5025 auto Queue = EventList[I]->
Queue;
5028 std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5030 if (
auto Res = Queue->executeAllOpenCommandLists())
5034 std::unordered_set<pi_queue> Queues;
5035 for (uint32_t I = 0; I < NumEvents; I++) {
5038 std::shared_lock<pi_shared_mutex> EventLock(EventList[I]->Mutex);
5039 if (!EventList[I]->hasExternalRefs())
5040 die(
"piEventsWait must not be called for an internal event");
5042 if (!EventList[I]->Completed) {
5044 if (!HostVisibleEvent)
5045 die(
"The host-visible proxy event missing");
5048 zePrint(
"ZeEvent = %#llx\n", pi_cast<std::uintptr_t>(ZeEvent));
5049 ZE_CALL(zeHostSynchronize, (ZeEvent));
5053 if (
auto Q = EventList[I]->Queue) {
5054 if (Q->Device->ImmCommandListUsed && Q->isInOrderQueue())
5059 false, EventList[I]);
5075 for (
auto &Q : Queues)
5087 (void)CommandExecCallbackType;
5090 die(
"piEventSetCallback: deprecated, to be removed");
5096 (void)ExecutionStatus;
5097 die(
"piEventSetStatus: deprecated, to be removed");
5102 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5104 Event->RefCount.increment();
5109 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5116 Event->RefCount.increment();
5117 Events.push_back(Event);
5121 for (
const auto &Event : Events)
5128 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5130 if (!Event->RefCount.decrementAndTest())
5141 if (DisableEventsCaching) {
5160 auto Queue = Event->
Queue;
5161 if (DisableEventsCaching || !Event->
OwnZeEvent) {
5180 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5181 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5184 std::shared_lock<pi_shared_mutex> Lock(Event->Mutex);
5185 auto *ZeEvent = pi_cast<ze_event_handle_t *>(NativeHandle);
5191 auto Queue = Event->
Queue;
5193 std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5194 const auto &OpenCommandList = Queue->eventOpenCommandList(Event);
5195 if (OpenCommandList != Queue->CommandListMap.end()) {
5196 if (
auto Res = Queue->executeOpenCommandList(
5197 OpenCommandList->second.isCopy(Queue)))
5206 bool OwnNativeHandle,
5209 PI_ASSERT(Event, PI_ERROR_INVALID_EVENT);
5210 PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
5212 auto ZeEvent = pi_cast<ze_event_handle_t>(NativeHandle);
5218 (*Event)->HostVisibleEvent = *Event;
5238 PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE);
5240 std::shared_lock<pi_shared_mutex> Lock(
Context->Mutex);
5251 ze_sampler_handle_t ZeSampler;
5255 ZeSamplerDesc.isNormalized =
PI_TRUE;
5256 ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP;
5257 ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
5264 if (SamplerProperties) {
5267 while (*CurProperty != 0) {
5268 switch (*CurProperty) {
5270 pi_bool CurValueBool = pi_cast<pi_bool>(*(++CurProperty));
5273 ZeSamplerDesc.isNormalized =
PI_TRUE;
5275 ZeSamplerDesc.isNormalized =
PI_FALSE;
5277 zePrint(
"piSamplerCreate: unsupported "
5278 "PI_SAMPLER_NORMALIZED_COORDS value\n");
5279 return PI_ERROR_INVALID_VALUE;
5285 pi_cast<pi_sampler_addressing_mode>(
5286 pi_cast<pi_uint32>(*(++CurProperty)));
5296 switch (CurValueAddressingMode) {
5298 ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE;
5301 ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT;
5304 ZeSamplerDesc.addressMode =
5305 ZeApiVersion < ZE_MAKE_VERSION(1, 3)
5306 ? ZE_SAMPLER_ADDRESS_MODE_CLAMP
5307 : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER;
5310 ZeSamplerDesc.addressMode =
5311 ZeApiVersion < ZE_MAKE_VERSION(1, 3)
5312 ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER
5313 : ZE_SAMPLER_ADDRESS_MODE_CLAMP;
5316 ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR;
5319 zePrint(
"piSamplerCreate: unsupported PI_SAMPLER_ADDRESSING_MODE "
5321 zePrint(
"PI_SAMPLER_ADDRESSING_MODE=%d\n", CurValueAddressingMode);
5322 return PI_ERROR_INVALID_VALUE;
5328 pi_cast<pi_sampler_filter_mode>(
5329 pi_cast<pi_uint32>(*(++CurProperty)));
5332 ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
5334 ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR;
5336 zePrint(
"PI_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode);
5338 "piSamplerCreate: unsupported PI_SAMPLER_FILTER_MODE value\n");
5339 return PI_ERROR_INVALID_VALUE;
5356 }
catch (
const std::bad_alloc &) {
5357 return PI_ERROR_OUT_OF_HOST_MEMORY;
5359 return PI_ERROR_UNKNOWN;
5365 size_t ParamValueSize,
void *ParamValue,
5366 size_t *ParamValueSizeRet) {
5369 (void)ParamValueSize;
5371 (void)ParamValueSizeRet;
5373 die(
"piSamplerGetInfo: not implemented");
5378 PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER);
5380 Sampler->RefCount.increment();
5385 PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER);
5387 if (!Sampler->RefCount.decrementAndTest())
5403 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5405 if (EventWaitList) {
5406 PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE);
5408 bool UseCopyEngine =
false;
5411 std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5415 NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
5426 bool IsInternal = OutEvent ==
nullptr;
5427 pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5429 CommandList, IsInternal);
5430 if (Res != PI_SUCCESS)
5433 ZeEvent = (*Event)->ZeEvent;
5434 (*Event)->WaitList = TmpWaitList;
5436 const auto &WaitList = (*Event)->WaitList;
5437 auto ZeCommandList = CommandList->first;
5438 ZE_CALL(zeCommandListAppendWaitOnEvents,
5439 (ZeCommandList, WaitList.Length, WaitList.ZeEventList));
5441 ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));
5455 std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5460 if (Res != PI_SUCCESS)
5469 ZE_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent));
5470 (*OutEvent)->Completed =
true;
5474 if (!Queue->
Device->ImmCommandListUsed)
5484 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5487 std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
5490 auto insertBarrierIntoCmdList =
5498 Event->WaitList = EventWaitList;
5499 ZE_CALL(zeCommandListAppendBarrier,
5500 (CmdList->first, Event->ZeEvent, EventWaitList.Length,
5501 EventWaitList.ZeEventList));
5506 bool IsInternal = OutEvent ==
nullptr;
5507 pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5511 bool OkToBatch =
true;
5521 if (NumEventsInWaitList || !UseMultipleCmdlistBarriers ||
5526 NumEventsInWaitList, EventWaitList, Queue,
5539 insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal))
5557 std::vector<pi_command_list_ptr_t> CmdLists;
5562 PI_ASSERT(!InitialComputeGroup.ZeQueues.empty() ||
5563 !InitialCopyGroup.ZeQueues.empty(),
5564 PI_ERROR_INVALID_QUEUE);
5566 size_t NumQueues = 0;
5567 for (
auto &QueueMap :
5569 for (
auto &QueueGroup : QueueMap)
5570 NumQueues += QueueGroup.second.ZeQueues.size();
5576 CmdLists.reserve(NumQueues);
5577 for (
auto &QueueMap :
5579 for (
auto &QueueGroup : QueueMap) {
5580 bool UseCopyEngine =
5581 QueueGroup.second.Type != _pi_queue::queue_type::Compute;
5582 if (Queue->
Device->ImmCommandListUsed) {
5585 for (
auto &ImmCmdList : QueueGroup.second.ImmCmdLists)
5587 CmdLists.push_back(ImmCmdList);
5589 for (
auto ZeQueue : QueueGroup.second.ZeQueues) {
5593 Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue))
5595 CmdLists.push_back(CmdList);
5603 if (CmdLists.size() == 0) {
5610 CmdLists.push_back(CmdList);
5613 if (CmdLists.size() > 1) {
5616 std::vector<pi_event> EventWaitVector(CmdLists.size());
5617 for (
size_t I = 0; I < CmdLists.size(); ++I) {
5620 EventWaitVector[I],
true))
5633 EventWaitVector.size(), EventWaitVector.data(), Queue,
5634 ConvergenceCmdList->second.isCopy(Queue)))
5640 if (
auto Res = insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList,
5641 *Event, IsInternal))
5648 *Event, IsInternal))
5664 pi_bool BlockingRead,
size_t Offset,
5665 size_t Size,
void *Dst,
5669 PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT);
5670 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5672 std::shared_lock<pi_shared_mutex> SrcLock(Src->Mutex, std::defer_lock);
5673 std::scoped_lock<std::shared_lock<pi_shared_mutex>,
pi_shared_mutex> LockAll(
5674 SrcLock, Queue->Mutex);
5679 BlockingRead, Size, ZeHandleSrc + Offset,
5680 NumEventsInWaitList, EventWaitList, Event,
5688 size_t HostRowPitch,
size_t HostSlicePitch,
void *Ptr,
5692 PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT);
5693 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5695 std::shared_lock<pi_shared_mutex> SrcLock(Buffer->Mutex, std::defer_lock);
5696 std::scoped_lock<std::shared_lock<pi_shared_mutex>,
pi_shared_mutex> LockAll(
5697 SrcLock, Queue->Mutex);
5703 static_cast<char *
>(Ptr), BufferOffset, HostOffset, Region,
5704 BufferRowPitch, HostRowPitch, BufferSlicePitch, HostSlicePitch,
5705 BlockingRead, NumEventsInWaitList, EventWaitList, Event);
5736 if (Res != PI_SUCCESS)
5738 auto zeEvent = Event->
ZeEvent;
5739 ZE_CALL(zeCommandListAppendBarrier,
5740 (ImmCmdList->first, zeEvent, 0,
nullptr));
5741 ZE_CALL(zeHostSynchronize, (zeEvent));
5746 auto EventListToCleanup = std::move(ImmCmdList->second.EventList);
5747 ImmCmdList->second.EventList.clear();
5762 for (
auto &QueueGroup : QueueMap) {
5763 if (
Device->ImmCommandListUsed) {
5764 for (
auto ImmCmdList : QueueGroup.second.ImmCmdLists)
5765 syncImmCmdList(
this, ImmCmdList);
5767 for (
auto &ZeQueue : QueueGroup.second.ZeQueues)
5769 ZE_CALL(zeHostSynchronize, (ZeQueue));
5789 pi_bool BlockingWrite,
size_t Size,
const void *Src,
5793 PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
5799 NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
5803 bool OkToBatch =
true;
5808 Queue, CommandList, UseCopyEngine, OkToBatch))
5813 bool IsInternal = OutEvent ==
nullptr;
5814 pi_event *Event = OutEvent ? OutEvent : &InternalEvent;
5816 CommandList, IsInternal);
5817 if (Res != PI_SUCCESS)
5819 ZeEvent = (*Event)->ZeEvent;
5820 (*Event)->WaitList = TmpWaitList;
5822 const auto &ZeCommandList = CommandList->first;
5823 const auto &WaitList = (*Event)->WaitList;
5825 zePrint(
"calling zeCommandListAppendMemoryCopy() with\n"
5827 pi_cast<std::uintptr_t>(ZeEvent));
5830 ZE_CALL(zeCommandListAppendMemoryCopy,
5831 (ZeCommandList, Dst, Src, Size, ZeEvent, WaitList.Length,
5832 WaitList.ZeEventList));