8 #include <sycl/feature_test.hpp>
9 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
10 #include <KernelFusion.h>
22 inline namespace _V1 {
25 static inline void printPerformanceWarning(
const std::string &Message) {
27 std::cerr <<
"WARNING: " << Message <<
"\n";
31 jit_compiler::jit_compiler() {
32 auto checkJITLibrary = [
this]() ->
bool {
33 static const std::string JITLibraryName =
"libsycl-fusion.so";
36 if (LibraryPtr ==
nullptr) {
37 printPerformanceWarning(
"Could not find JIT library " + JITLibraryName);
41 this->AddToConfigHandle =
reinterpret_cast<AddToConfigFuncT
>(
43 "addToJITConfiguration"));
44 if (!this->AddToConfigHandle) {
45 printPerformanceWarning(
46 "Cannot resolve JIT library function entry point");
50 this->ResetConfigHandle =
reinterpret_cast<ResetConfigFuncT
>(
52 "resetJITConfiguration"));
53 if (!this->ResetConfigHandle) {
54 printPerformanceWarning(
55 "Cannot resolve JIT library function entry point");
59 this->FuseKernelsHandle =
reinterpret_cast<FuseKernelsFuncT
>(
61 if (!this->FuseKernelsHandle) {
62 printPerformanceWarning(
63 "Cannot resolve JIT library function entry point");
67 this->MaterializeSpecConstHandle =
68 reinterpret_cast<MaterializeSpecConstFuncT
>(
70 LibraryPtr,
"materializeSpecConstants"));
71 if (!this->MaterializeSpecConstHandle) {
72 printPerformanceWarning(
73 "Cannot resolve JIT library function entry point");
79 Available = checkJITLibrary();
82 static ::jit_compiler::BinaryFormat
86 return ::jit_compiler::BinaryFormat::SPIRV;
88 return ::jit_compiler::BinaryFormat::LLVM;
91 "Format unsupported for JIT compiler");
95 ::jit_compiler::BinaryFormat getTargetFormat(
QueueImplPtr &Queue) {
96 auto Backend = Queue->getDeviceImplPtr()->getBackend();
100 return ::jit_compiler::BinaryFormat::SPIRV;
102 return ::jit_compiler::BinaryFormat::PTX;
104 return ::jit_compiler::BinaryFormat::AMDGCN;
108 "Backend unsupported by kernel fusion");
112 ::jit_compiler::TargetInfo getTargetInfo(
QueueImplPtr &Queue) {
113 ::jit_compiler::BinaryFormat Format = getTargetFormat(Queue);
115 Format,
static_cast<::jit_compiler::DeviceArchitecture
>(
116 Queue->getDeviceImplPtr()->getDeviceArch()));
119 static ::jit_compiler::ParameterKind
121 using PK = ::jit_compiler::ParameterKind;
124 case kind::kind_accessor:
126 case kind::kind_std_layout:
127 return PK::StdLayout;
128 case kind::kind_sampler:
130 case kind::kind_pointer:
132 case kind::kind_specialization_constants_buffer:
133 return PK::SpecConstBuffer;
134 case kind::kind_stream:
136 case kind::kind_invalid:
142 enum class Promotion {
None, Private, Local };
144 struct PromotionInformation {
145 Promotion PromotionTarget;
146 unsigned KernelIndex;
152 std::vector<bool> UsedParams;
155 using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
157 template <
typename Obj> Promotion getPromotionTarget(
const Obj &
obj) {
158 auto Result = Promotion::None;
160 ext::codeplay::experimental::property::promote_private>()) {
161 Result = Promotion::Private;
164 ext::codeplay::experimental::property::promote_local>()) {
165 if (Result != Promotion::None) {
167 "Two contradicting promotion properties on the "
168 "same buffer/accessor are not allowed.");
170 Result = Promotion::Local;
175 static Promotion getInternalizationInfo(
Requirement *Req) {
176 auto AccPromotion = getPromotionTarget(Req->MPropertyList);
178 auto *MemObj =
static_cast<sycl::detail::SYCLMemObjT *
>(Req->MSYCLMemObj);
179 if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
182 return Promotion::None;
184 Promotion BuffPromotion = getPromotionTarget(*MemObj);
185 if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
186 AccPromotion != BuffPromotion) {
188 "Contradicting promotion properties on accessor and "
189 "underlying buffer are not allowed");
191 return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
194 static std::optional<size_t> getLocalSize(NDRDescT NDRange,
195 std::optional<size_t> UserGlobalSize,
197 assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
198 "Unexpected range rounding");
199 auto NumElementsMem =
static_cast<SYCLMemObjT *
>(Req->MSYCLMemObj)->size();
200 if (Target == Promotion::Private) {
201 if (UserGlobalSize.has_value()) {
203 NDRange.GlobalSize[0] = *UserGlobalSize;
205 auto NumWorkItems = NDRange.GlobalSize.size();
208 return NumElementsMem / NumWorkItems;
209 }
else if (Target == Promotion::Local) {
210 if (NDRange.LocalSize.size() == 0) {
215 auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
218 return NumElementsMem / NumWorkGroups;
224 return Req->MOffset == Other->MOffset &&
225 Req->MAccessRange == Other->MAccessRange &&
226 Req->MMemoryRange == Other->MMemoryRange &&
227 Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
228 Req->MElemSize == Other->MElemSize &&
229 Req->MOffsetInBytes == Other->MOffsetInBytes &&
230 Req->MIsSubBuffer == Other->MIsSubBuffer;
233 static void resolveInternalization(ArgDesc &Arg,
unsigned KernelIndex,
234 unsigned ArgFunctionIndex, NDRDescT NDRange,
235 std::optional<size_t> UserGlobalSize,
236 PromotionMap &Promotions) {
241 auto ThisPromotionTarget = getInternalizationInfo(Req);
243 getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
245 if (Promotions.count(Req->MSYCLMemObj)) {
247 auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj);
249 switch (ThisPromotionTarget) {
250 case Promotion::None: {
251 if (PreviousDefinition.PromotionTarget != Promotion::None) {
252 printPerformanceWarning(
253 "Deactivating previously specified promotion, because this "
254 "accessor does not specify promotion");
255 PreviousDefinition.PromotionTarget = Promotion::None;
259 case Promotion::Local: {
260 if (PreviousDefinition.PromotionTarget == Promotion::None) {
261 printPerformanceWarning(
262 "Not performing specified local promotion, due to previous "
263 "mismatch or because previous accessor specified no promotion");
266 if (!ThisLocalSize.has_value()) {
267 printPerformanceWarning(
"Work-group size for local promotion not "
268 "specified, not performing internalization");
269 PreviousDefinition.PromotionTarget = Promotion::None;
272 if (PreviousDefinition.PromotionTarget == Promotion::Private) {
273 printPerformanceWarning(
274 "Overriding previous private promotion with local promotion");
277 auto NewPrevLocalSize =
278 getLocalSize(PreviousDefinition.NDRange, std::nullopt,
279 PreviousDefinition.Definition, Promotion::Local);
281 if (!NewPrevLocalSize.has_value()) {
282 printPerformanceWarning(
283 "Not performing specified local promotion because previous "
284 "kernels did not specify a local size");
285 PreviousDefinition.PromotionTarget = Promotion::None;
289 PreviousDefinition.LocalSize = NewPrevLocalSize.value();
290 PreviousDefinition.PromotionTarget = Promotion::Local;
292 if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
293 printPerformanceWarning(
"Not performing specified local promotion due "
294 "to work-group size mismatch");
295 PreviousDefinition.PromotionTarget = Promotion::None;
298 if (!accessorEquals(Req, PreviousDefinition.Definition)) {
299 printPerformanceWarning(
"Not performing specified promotion, due to "
300 "accessor parameter mismatch");
301 PreviousDefinition.PromotionTarget = Promotion::None;
306 case Promotion::Private: {
307 if (PreviousDefinition.PromotionTarget == Promotion::None) {
308 printPerformanceWarning(
309 "Not performing specified private promotion, due to previous "
310 "mismatch or because previous accessor specified no promotion");
314 if (PreviousDefinition.PromotionTarget == Promotion::Local) {
317 getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
318 if (!ThisLocalSize.has_value()) {
319 printPerformanceWarning(
"Work-group size for local promotion not "
320 "specified, not performing internalization");
321 PreviousDefinition.PromotionTarget = Promotion::None;
325 if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
326 printPerformanceWarning(
327 "Not performing specified local promotion due "
328 "to work-group size mismatch");
329 PreviousDefinition.PromotionTarget = Promotion::None;
333 if (!accessorEquals(Req, PreviousDefinition.Definition)) {
334 printPerformanceWarning(
"Not performing local promotion, due to "
335 "accessor parameter mismatch");
336 PreviousDefinition.PromotionTarget = Promotion::None;
340 printPerformanceWarning(
341 "Performing local internalization instead, because previous "
342 "accessor specified local promotion");
347 if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
348 printPerformanceWarning(
349 "Not performing specified private promotion due "
350 "to work-group size mismatch");
351 PreviousDefinition.PromotionTarget = Promotion::None;
354 if (!accessorEquals(Req, PreviousDefinition.Definition)) {
355 printPerformanceWarning(
"Not performing specified promotion, due to "
356 "accessor parameter mismatch");
357 PreviousDefinition.PromotionTarget = Promotion::None;
364 if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
365 printPerformanceWarning(
"Work-group size for local promotion not "
366 "specified, not performing internalization");
367 ThisPromotionTarget = Promotion::None;
370 assert(ThisLocalSize.has_value());
373 PromotionInformation{ThisPromotionTarget, KernelIndex, ArgFunctionIndex,
374 Req, NDRange, ThisLocalSize.value(),
375 Req->MElemSize, std::vector<bool>()});
383 unsigned KernelIndex;
386 Param(ArgDesc Argument,
unsigned KernelIdx,
unsigned ArgIdx,
bool InUse)
387 : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
390 using ParamList = std::vector<Param>;
392 using ParamIterator = std::vector<Param>::iterator;
394 std::vector<Param>::const_iterator
395 detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
396 for (
auto I = Params.begin(); I < Params.end(); ++I) {
398 if (I->Arg.MType == Arg.MType) {
402 if ((Arg.MSize == I->Arg.MSize) &&
403 std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
409 if (accessorEquals(Req, Other)) {
418 void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage,
void *ArgPtr,
420 ArgStorage.emplace_back(ArgSize);
421 void *Storage = ArgStorage.back().data();
422 std::memcpy(Storage, ArgPtr, ArgSize);
426 template <
typename T>
427 void *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
428 return storePlainArgRaw(ArgStorage, &Arg,
sizeof(T));
431 static ParamIterator preProcessArguments(
432 std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
433 PromotionMap &PromotedAccs,
434 std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
435 std::vector<::jit_compiler::JITConstant> &JITConstants,
436 ParamList &NonIdenticalParams,
437 std::vector<::jit_compiler::ParameterIdentity> &ParamIdentities) {
452 storePlainArg(ArgStorage, *
static_cast<void **
>(Arg->Arg.MPtr));
461 storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
464 JITConstants.emplace_back(
465 ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
466 Arg->Arg.MPtr, Arg->Arg.MSize);
476 NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
482 auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
483 if (Identical != NonIdenticalParams.end()) {
484 ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
485 ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
486 Identical->ArgIndex};
487 ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
488 ParamIdentities.push_back(Identity);
495 auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
496 auto PromotionTarget = Internalization.PromotionTarget;
497 if (PromotionTarget == Promotion::Private ||
498 PromotionTarget == Promotion::Local) {
500 if (Internalization.KernelIndex == Arg->KernelIndex &&
501 Internalization.ArgIndex == Arg->ArgIndex) {
504 InternalizeParams.emplace_back(
505 ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
506 (PromotionTarget == Promotion::Private)
507 ? ::jit_compiler::Internalization::Private
508 : ::jit_compiler::Internalization::Local,
509 Internalization.LocalSize, Internalization.ElemSize);
517 for (
unsigned I = 0; I < 4; ++I) {
518 Internalization.UsedParams.push_back(Arg->Used);
526 unsigned Increment = 0;
527 for (
unsigned I = 0; I < 4; ++I) {
531 if (Arg->Used && Internalization.UsedParams[I]) {
532 ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
534 ::jit_compiler::Parameter IdenticalParam{
535 Internalization.KernelIndex,
536 Internalization.ArgIndex + Increment};
537 ::jit_compiler::ParameterIdentity Identity{ThisParam,
539 ParamIdentities.push_back(Identity);
541 if (Internalization.UsedParams[I]) {
551 NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
557 NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
565 updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
566 NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
567 std::vector<std::vector<char>> &FusedArgStorage) {
568 auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
569 assert(ArgUsageInfo.size() == FusedArgs.size());
570 for (
size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
571 bool PromotedToPrivate =
572 (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
573 bool PromotedToLocal =
574 (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
575 if (PromotedToLocal || PromotedToPrivate) {
579 auto &OldArgDesc = FusedArgs[ArgIndex];
581 auto *Req =
static_cast<Requirement *
>(OldArgDesc.MPtr);
586 const size_t SizeAccField =
587 sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
590 size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
591 Req, Promotion::Local)
593 range<3> AccessRange{1, 1, LocalSize};
594 void *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
596 id<3> AcessOffset{0, 0, 0};
597 void *OffsetArg = storePlainArg(FusedArgStorage, AcessOffset);
602 int SizeInBytes = Req->MElemSize * LocalSize;
603 FusedArgs[ArgIndex] =
605 static_cast<int>(ArgIndex)};
608 FusedArgs[ArgIndex] =
610 static_cast<int>(SizeAccField),
static_cast<int>(ArgIndex)};
613 FusedArgs[ArgIndex] =
615 static_cast<int>(SizeAccField),
static_cast<int>(ArgIndex)};
618 FusedArgs[ArgIndex] =
620 static_cast<int>(SizeAccField),
static_cast<int>(ArgIndex)};
629 QueueImplPtr Queue,
const RTDeviceBinaryImage *BinImage,
630 const std::string &KernelName,
631 const std::vector<unsigned char> &SpecConstBlob) {
634 "No suitable IR available for materializing");
636 if (KernelName.empty()) {
639 "Cannot jit kernel with invalid kernel function name");
642 if (
auto CachedKernel =
643 PM.getCachedMaterializedKernel(KernelName, SpecConstBlob))
646 auto &RawDeviceImage = BinImage->getRawData();
647 auto DeviceImageSize =
static_cast<size_t>(RawDeviceImage.BinaryEnd -
648 RawDeviceImage.BinaryStart);
651 auto BinaryImageFormat = translateBinaryImageFormat(BinImage->getFormat());
652 if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
654 "No suitable IR available for materializing");
656 ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
657 BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
659 ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
661 ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo)));
665 ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
667 std::string TargetCPU =
669 std::string TargetFeatures =
672 auto MaterializerResult =
673 MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob,
674 TargetCPU.c_str(), TargetFeatures.c_str());
675 if (MaterializerResult.failed()) {
676 std::string Message{
"Compilation for kernel failed with message:\n"};
677 Message.append(MaterializerResult.getErrorMessage());
684 auto &MaterializerKernelInfo = MaterializerResult.getKernelInfo();
687 MaterializerKernelInfo.BinaryInfo.BinaryStart;
688 MaterializedRawDeviceImage.BinaryEnd =
689 MaterializerKernelInfo.BinaryInfo.BinaryStart +
690 MaterializerKernelInfo.BinaryInfo.BinarySize;
694 if (0 != setenv(
"SYCL_CACHE_IN_MEM",
"0",
true)) {
697 "Failed to set env variable in materialize spec constel.");
702 RTDeviceBinaryImage MaterializedRTDevBinImage{&MaterializedRawDeviceImage};
703 const auto &Context = Queue->get_context();
704 const auto &Device = Queue->get_device();
705 auto NewKernel = PM.getOrCreateMaterializedKernel(
706 MaterializedRTDevBinImage, Context, Device, KernelName, SpecConstBlob);
709 if (0 != setenv(
"SYCL_CACHE_IN_MEM",
"1",
true)) {
712 "Failed to set env variable in materialize spec const.");
720 std::unique_ptr<detail::CG>
722 std::vector<ExecCGCommand *> &InputKernels,
723 const property_list &PropList) {
725 printPerformanceWarning(
"JIT library not available");
728 if (InputKernels.empty()) {
729 printPerformanceWarning(
"Fusion list is empty");
735 std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
736 std::vector<std::string> InputKernelNames;
739 detail::CG::StorageInitHelper CGData;
740 std::vector<std::vector<char>> &ArgsStorage = CGData.MArgsStorage;
741 std::vector<detail::AccessorImplPtr> &AccStorage = CGData.MAccStorage;
742 std::vector<Requirement *> &Requirements = CGData.MRequirements;
743 std::vector<detail::EventImplPtr> &Events = CGData.MEvents;
744 std::vector<::jit_compiler::NDRange> Ranges;
745 ur_kernel_cache_config_t KernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT;
746 unsigned KernelIndex = 0;
747 ParamList FusedParams;
748 PromotionMap PromotedAccs;
751 for (
auto &RawCmd : InputKernels) {
752 auto *KernelCmd =
static_cast<ExecCGCommand *
>(RawCmd);
753 auto &CG = KernelCmd->getCG();
754 assert(KernelCmd->isFusable());
755 auto *KernelCG =
static_cast<CGExecKernel *
>(&CG);
757 auto KernelName = KernelCG->MKernelName;
758 if (KernelName.empty()) {
759 printPerformanceWarning(
760 "Cannot fuse kernel with invalid kernel function name");
764 auto [DeviceImage, Program] =
766 if (!DeviceImage || !Program) {
767 printPerformanceWarning(
"No suitable IR available for fusion");
771 if (Program && (KernelCG->MSyclKernel ==
nullptr ||
772 !KernelCG->MSyclKernel->isCreatedFromSource())) {
775 Program, KernelName);
782 auto Args = KernelCG->MArgs;
783 std::sort(Args.begin(), Args.end(), [](
const ArgDesc &A,
const ArgDesc &B) {
784 return A.MIndex < B.MIndex;
789 std::optional<size_t> UserGlobalSize;
790 if ((KernelName.find(
"_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
791 KernelName.find(
"_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
794 [[maybe_unused]]
auto Dims = KernelCG->MNDRDesc.Dims;
795 assert(A0.MPtr && A0.MSize ==
static_cast<int>(Dims *
sizeof(
size_t)) &&
797 "Unexpected signature for rounded range kernel");
799 size_t *UGS =
reinterpret_cast<size_t *
>(A0.MPtr);
801 assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
802 assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
803 assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
804 UserGlobalSize = UGS[0];
807 ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
811 unsigned ArgFunctionIndex = 0;
812 auto KindIt = ArgDescriptor.Kinds.begin();
813 auto UsageMaskIt = ArgDescriptor.UsageMask.begin();
814 for (
auto &Arg : Args) {
815 *KindIt = translateArgType(Arg.MType);
821 bool Eliminated = EliminatedArgs && !EliminatedArgs->empty() &&
822 (*EliminatedArgs)[ArgIndex++];
823 *UsageMaskIt = !Eliminated;
831 resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
832 KernelCG->MNDRDesc, UserGlobalSize,
835 FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex,
true);
838 FusedParams.emplace_back(Arg, KernelIndex, 0,
false);
843 auto &RawDeviceImage = DeviceImage->getRawData();
844 auto DeviceImageSize =
static_cast<size_t>(RawDeviceImage.BinaryEnd -
845 RawDeviceImage.BinaryStart);
848 auto BinaryImageFormat =
849 translateBinaryImageFormat(DeviceImage->getFormat());
850 if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
851 printPerformanceWarning(
"No suitable IR available for fusion");
854 ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
855 BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
857 constexpr
auto SYCLTypeToIndices = [](
auto Val) -> ::jit_compiler::Indices {
858 return {Val.get(0), Val.get(1), Val.get(2)};
861 auto &CurrentNDR = KernelCG->MNDRDesc;
862 const ::jit_compiler::NDRange JITCompilerNDR{
863 static_cast<int>(CurrentNDR.Dims),
864 SYCLTypeToIndices(CurrentNDR.GlobalSize),
865 SYCLTypeToIndices(CurrentNDR.LocalSize),
866 SYCLTypeToIndices(CurrentNDR.GlobalOffset)};
868 Ranges.push_back(JITCompilerNDR);
869 InputKernelInfo.emplace_back(KernelName.c_str(), ArgDescriptor,
870 JITCompilerNDR, BinInfo);
874 if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
878 printPerformanceWarning(
879 "Cannot fuse kernel with hierarchical parallelism");
891 ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
892 KernelCG->getArgsStorage().end());
893 AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
894 KernelCG->getAccStorage().end());
898 Requirements.insert(Requirements.end(), KernelCG->getRequirements().begin(),
899 KernelCG->getRequirements().end());
900 Events.insert(Events.end(), KernelCG->getEvents().begin(),
901 KernelCG->getEvents().end());
905 if (KernelIndex == 0) {
906 KernelCacheConfig = KernelCG->MKernelCacheConfig;
907 }
else if (KernelCG->MKernelCacheConfig != KernelCacheConfig) {
908 KernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT;
916 std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
917 std::vector<::jit_compiler::JITConstant> JITConstants;
918 std::vector<::jit_compiler::ParameterIdentity> ParamIdentities;
919 ParamList NonIdenticalParameters;
920 for (
auto UR = FusedParams.begin(); UR != FusedParams.end();) {
921 UR = preProcessArguments(ArgsStorage, UR, PromotedAccs, InternalizeParams,
922 JITConstants, NonIdenticalParameters,
927 ::jit_compiler::BarrierFlags BarrierFlags =
929 .has_property<ext::codeplay::experimental::property::no_barriers>())
930 ? ::jit_compiler::getNoBarrierFlag()
933 static size_t FusedKernelNameIndex = 0;
934 auto FusedKernelName =
"fused_" + std::to_string(FusedKernelNameIndex++);
939 ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
940 AddToConfigHandle(::jit_compiler::option::JITEnableCaching::set(
943 ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
944 ::jit_compiler::BinaryFormat TargetFormat = TargetInfo.getFormat();
946 ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo)));
948 auto FusionResult = FuseKernelsHandle(
949 InputKernelInfo, FusedKernelName.c_str(), ParamIdentities, BarrierFlags,
950 InternalizeParams, JITConstants);
952 if (FusionResult.failed()) {
955 <<
"ERROR: JIT compilation for kernel fusion failed with message:\n"
956 << FusionResult.getErrorMessage() <<
"\n";
961 auto &FusedKernelInfo = FusionResult.getKernelInfo();
962 std::string FusedOrCachedKernelName{FusedKernelInfo.Name.c_str()};
964 std::vector<ArgDesc> FusedArgs;
965 int FusedArgIndex = 0;
966 for (
auto &Param : FusedParams) {
969 auto &Arg = Param.Arg;
970 FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
974 const auto NDRDesc = [](
const auto &ND) -> NDRDescT {
975 constexpr
auto ToSYCLType = [](
const auto &Indices) ->
sycl::range<3> {
976 return {Indices[0], Indices[1], Indices[2]};
979 NDRDesc.Dims = ND.getDimensions();
980 NDRDesc.GlobalSize = ToSYCLType(ND.getGlobalSize());
981 NDRDesc.LocalSize = ToSYCLType(ND.getLocalSize());
982 NDRDesc.GlobalOffset = ToSYCLType(ND.getOffset());
984 }(FusedKernelInfo.NDR);
985 updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
987 if (!FusionResult.cached()) {
988 auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
992 std::cerr <<
"INFO: Re-using existing device binary for fused kernel\n";
999 FusedOrCachedKernelName);
1002 if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) {
1004 Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
1007 std::unique_ptr<detail::CG> FusedCG;
1008 FusedCG.reset(
new detail::CGExecKernel(
1010 std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {},
1017 const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
1018 ::jit_compiler::BinaryFormat Format) {
1020 const char *TargetSpec =
nullptr;
1023 case ::jit_compiler::BinaryFormat::PTX: {
1028 case ::jit_compiler::BinaryFormat::AMDGCN: {
1033 case ::jit_compiler::BinaryFormat::SPIRV: {
1034 TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
1042 "Invalid output format");
1045 std::string FusedKernelName{FusedKernelInfo.Name.c_str()};
1046 DeviceBinaryContainer Binary;
1051 OffloadEntryContainer Entry{FusedKernelName,
nullptr, 0, 0, 0};
1052 Binary.addOffloadEntry(std::move(Entry));
1055 auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
1056 PropertyContainer ArgMaskProp{
1057 FusedKernelName, ArgMask.data(), ArgMask.size(),
1062 PropertySetContainer ArgMaskPropSet{
1065 ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
1067 Binary.addProperty(std::move(ArgMaskPropSet));
1069 if (Format == ::jit_compiler::BinaryFormat::PTX ||
1070 Format == ::jit_compiler::BinaryFormat::AMDGCN) {
1073 auto ReqdWGS = std::find_if(
1074 FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(),
1075 [](const ::jit_compiler::SYCLKernelAttribute &Attr) {
1076 return Attr.Kind == ::jit_compiler::SYCLKernelAttribute::AttrKind::
1079 if (ReqdWGS != FusedKernelInfo.Attributes.end()) {
1080 auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS);
1081 std::stringstream PropName;
1082 PropName << FusedKernelInfo.Name.c_str();
1084 PropertyContainer ReqdWorkGroupSizeProp{
1085 PropName.str(), Encoded.data(), Encoded.size(),
1087 PropertySetContainer ProgramMetadata{
1089 ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp));
1090 Binary.addProperty(std::move(ProgramMetadata));
1093 if (Format == ::jit_compiler::BinaryFormat::AMDGCN) {
1094 PropertyContainer NeedFinalization{
1097 ProgramMetadata.addProperty(std::move(NeedFinalization));
1098 Binary.addProperty(std::move(ProgramMetadata));
1101 DeviceBinariesCollection Collection;
1102 Collection.addDeviceBinary(
1103 std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
1104 FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
1106 JITDeviceBinaries.push_back(std::move(Collection));
1107 return JITDeviceBinaries.back().getPIDeviceStruct();
1110 std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
1113 constexpr uint64_t NBytesForSize = 8;
1114 constexpr uint64_t NBitsInElement = 8;
1115 uint64_t Size =
static_cast<uint64_t
>(Mask.size());
1117 uint64_t RoundedSize =
1118 ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
1119 std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
1122 for (
size_t i = 0; i < NBytesForSize; ++i) {
1124 static_cast<uint8_t
>((RoundedSize >> i * NBitsInElement) & 0xFF);
1128 for (
size_t i = 0; i < Size; ++i) {
1132 if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
1133 uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
1134 Byte |=
static_cast<uint8_t
>((1 << (i % NBitsInElement)));
1140 std::vector<uint8_t> jit_compiler::encodeReqdWorkGroupSize(
1141 const ::jit_compiler::SYCLKernelAttribute &Attr)
const {
1143 ::jit_compiler::SYCLKernelAttribute::AttrKind::ReqdWorkGroupSize);
1144 size_t NumBytes =
sizeof(uint64_t) + (Attr.Values.size() *
sizeof(uint32_t));
1145 std::vector<uint8_t> Encoded(NumBytes, 0u);
1146 uint8_t *Ptr = Encoded.data();
1149 Ptr +=
sizeof(uint64_t);
1150 for (
const auto &Val : Attr.Values) {
1151 auto UVal =
static_cast<uint32_t
>(Val);
1152 std::memcpy(Ptr, &UVal,
sizeof(uint32_t));
1153 Ptr +=
sizeof(uint32_t);
void addImages(sycl_device_binaries DeviceImages)
static ProgramManager & getInstance()
const KernelArgMask * getEliminatedKernelArgMask(ur_program_handle_t NativePrg, const std::string &KernelName)
Returns the mask for eliminated kernel arguments for the requested kernel within the native program.
kernel_id getSYCLKernelID(const std::string &KernelName)
static const char * get()
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
ur_kernel_handle_t materializeSpecConstants(QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, const std::string &KernelName, const std::vector< unsigned char > &SpecConstBlob)
Defines the iteration domain of either a single work-group in a parallel dispatch,...
#define __SYCL_PROPERTY_SET_KERNEL_PARAM_OPT_INFO
PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO defined in PropertySetIO.h.
#define __SYCL_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE
Program metadata tags recognized by the PI backends.
#define __SYCL_PROPERTY_SET_PROGRAM_METADATA
PropertySetRegistry::SYCL_KERNEL_PROGRAM_METADATA defined in PropertySetIO.h.
@ SYCL_PROPERTY_TYPE_BYTE_ARRAY
#define __SYCL_DEVICE_BINARY_TARGET_AMDGCN
#define __SYCL_DEVICE_BINARY_TARGET_NVPTX64
PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device.
sycl_device_binary_type
Types of device binary.
@ SYCL_DEVICE_BINARY_TYPE_SPIRV
@ SYCL_DEVICE_BINARY_TYPE_LLVMIR_BITCODE
@ SYCL_DEVICE_BINARY_TYPE_NONE
@ SYCL_DEVICE_BINARY_TYPE_NATIVE
#define __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION
#define __SYCL_DEVICE_BINARY_TARGET_SPIRV64
SPIR-V 64-bit image <-> "spir64", 64-bit OpenCL device.
#define __SYCL_DEVICE_BINARY_TARGET_SPIRV32
SPIR-V 32-bit image <-> "spir", 32-bit OpenCL device.
DynArray< uint8_t > ArgUsageMask
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
void * getOsLibraryFuncAddress(void *Library, const std::string &FunctionName)
::sycl_device_binary_type DeviceBinaryType
void * loadOsLibrary(const std::string &Library)
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
std::vector< bool > KernelArgMask
AccessorImplHost Requirement
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
std::tuple< const RTDeviceBinaryImage *, ur_program_handle_t > retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, CGExecKernel *CGKernel=nullptr)
static constexpr bool has_property()
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
This struct is a record of all the device code that may be offloaded.
This struct is a record of the device binary information.
const unsigned char * BinaryStart
Pointer to the target code start.
C++ utilities for Unified Runtime integration.