8 #include <sycl/feature_test.hpp>
9 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
10 #include <KernelFusion.h>
25 jit_compiler::jit_compiler() : MJITContext{new ::jit_compiler::JITContext{}} {}
27 jit_compiler::~jit_compiler() =
default;
29 static ::jit_compiler::BinaryFormat
33 return ::jit_compiler::BinaryFormat::SPIRV;
35 return ::jit_compiler::BinaryFormat::LLVM;
38 "Format unsupported for JIT compiler");
42 static ::jit_compiler::ParameterKind
44 using PK = ::jit_compiler::ParameterKind;
47 case kind::kind_accessor:
49 case kind::kind_std_layout:
51 case kind::kind_sampler:
53 case kind::kind_pointer:
55 case kind::kind_specialization_constants_buffer:
56 return PK::SpecConstBuffer;
57 case kind::kind_stream:
59 case kind::kind_invalid:
65 enum class Promotion {
None, Private, Local };
67 struct PromotionInformation {
68 Promotion PromotionTarget;
74 std::vector<bool> UsedParams;
77 using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
79 static inline void printPerformanceWarning(
const std::string &Message) {
81 std::cerr <<
"WARNING: " << Message <<
"\n";
85 template <
typename Obj> Promotion getPromotionTarget(
const Obj &
obj) {
86 auto Result = Promotion::None;
88 ext::codeplay::experimental::property::promote_private>()) {
89 Result = Promotion::Private;
92 ext::codeplay::experimental::property::promote_local>()) {
93 if (Result != Promotion::None) {
95 "Two contradicting promotion properties on the "
96 "same buffer/accessor are not allowed.");
98 Result = Promotion::Local;
103 static Promotion getInternalizationInfo(
Requirement *Req) {
104 auto AccPromotion = getPromotionTarget(Req->MPropertyList);
106 auto *MemObj =
static_cast<sycl::detail::SYCLMemObjT *
>(Req->MSYCLMemObj);
107 if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
110 return Promotion::None;
112 Promotion BuffPromotion = getPromotionTarget(*MemObj);
113 if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
114 AccPromotion != BuffPromotion) {
116 "Contradicting promotion properties on accessor and "
117 "underlying buffer are not allowed");
119 return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
122 static std::optional<size_t> getLocalSize(NDRDescT NDRange,
Requirement *Req,
124 auto NumElementsMem =
static_cast<SYCLMemObjT *
>(Req->MSYCLMemObj)->size();
125 if (Target == Promotion::Private) {
126 auto NumWorkItems = NDRange.GlobalSize.size();
129 return NumElementsMem / NumWorkItems;
130 }
else if (Target == Promotion::Local) {
131 if (NDRange.LocalSize.size() == 0) {
136 auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
139 return NumElementsMem / NumWorkGroups;
145 return Req->MOffset == Other->MOffset &&
146 Req->MAccessRange == Other->MAccessRange &&
147 Req->MMemoryRange == Other->MMemoryRange &&
148 Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
149 Req->MElemSize == Other->MElemSize &&
150 Req->MOffsetInBytes == Other->MOffsetInBytes &&
151 Req->MIsSubBuffer == Other->MIsSubBuffer;
154 static void resolveInternalization(ArgDesc &Arg,
unsigned KernelIndex,
155 unsigned ArgFunctionIndex, NDRDescT NDRange,
156 PromotionMap &Promotions) {
161 auto ThisPromotionTarget = getInternalizationInfo(Req);
162 auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
164 if (Promotions.count(Req->MSYCLMemObj)) {
166 auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj);
168 switch (ThisPromotionTarget) {
169 case Promotion::None: {
170 if (PreviousDefinition.PromotionTarget != Promotion::None) {
171 printPerformanceWarning(
172 "Deactivating previously specified promotion, because this "
173 "accessor does not specify promotion");
174 PreviousDefinition.PromotionTarget = Promotion::None;
178 case Promotion::Local: {
179 if (PreviousDefinition.PromotionTarget == Promotion::None) {
180 printPerformanceWarning(
181 "Not performing specified local promotion, due to previous "
182 "mismatch or because previous accessor specified no promotion");
185 if (!ThisLocalSize.has_value()) {
186 printPerformanceWarning(
"Work-group size for local promotion not "
187 "specified, not performing internalization");
188 PreviousDefinition.PromotionTarget = Promotion::None;
191 if (PreviousDefinition.PromotionTarget == Promotion::Private) {
192 printPerformanceWarning(
193 "Overriding previous private promotion with local promotion");
196 auto NewPrevLocalSize =
197 getLocalSize(PreviousDefinition.NDRange,
198 PreviousDefinition.Definition, Promotion::Local);
200 if (!NewPrevLocalSize.has_value()) {
201 printPerformanceWarning(
202 "Not performing specified local promotion because previous "
203 "kernels did not specify a local size");
204 PreviousDefinition.PromotionTarget = Promotion::None;
208 PreviousDefinition.LocalSize = NewPrevLocalSize.value();
209 PreviousDefinition.PromotionTarget = Promotion::Local;
211 if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
212 printPerformanceWarning(
"Not performing specified local promotion due "
213 "to work-group size mismatch");
214 PreviousDefinition.PromotionTarget = Promotion::None;
217 if (!accessorEquals(Req, PreviousDefinition.Definition)) {
218 printPerformanceWarning(
"Not performing specified promotion, due to "
219 "accessor parameter mismatch");
220 PreviousDefinition.PromotionTarget = Promotion::None;
225 case Promotion::Private: {
226 if (PreviousDefinition.PromotionTarget == Promotion::None) {
227 printPerformanceWarning(
228 "Not performing specified private promotion, due to previous "
229 "mismatch or because previous accessor specified no promotion");
233 if (PreviousDefinition.PromotionTarget == Promotion::Local) {
235 auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
236 if (!ThisLocalSize.has_value()) {
237 printPerformanceWarning(
"Work-group size for local promotion not "
238 "specified, not performing internalization");
239 PreviousDefinition.PromotionTarget = Promotion::None;
243 if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
244 printPerformanceWarning(
245 "Not performing specified local promotion due "
246 "to work-group size mismatch");
247 PreviousDefinition.PromotionTarget = Promotion::None;
251 if (!accessorEquals(Req, PreviousDefinition.Definition)) {
252 printPerformanceWarning(
"Not performing local promotion, due to "
253 "accessor parameter mismatch");
254 PreviousDefinition.PromotionTarget = Promotion::None;
258 printPerformanceWarning(
259 "Performing local internalization instead, because previous "
260 "accessor specified local promotion");
265 if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
266 printPerformanceWarning(
267 "Not performing specified private promotion due "
268 "to work-group size mismatch");
269 PreviousDefinition.PromotionTarget = Promotion::None;
272 if (!accessorEquals(Req, PreviousDefinition.Definition)) {
273 printPerformanceWarning(
"Not performing specified promotion, due to "
274 "accessor parameter mismatch");
275 PreviousDefinition.PromotionTarget = Promotion::None;
282 if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
283 printPerformanceWarning(
"Work-group size for local promotion not "
284 "specified, not performing internalization");
285 ThisPromotionTarget = Promotion::None;
288 assert(ThisLocalSize.has_value());
289 Promotions.emplace(Req->MSYCLMemObj,
290 PromotionInformation{ThisPromotionTarget, KernelIndex,
291 ArgFunctionIndex, Req, NDRange,
292 ThisLocalSize.value(),
293 std::vector<bool>()});
301 unsigned KernelIndex;
304 Param(ArgDesc Argument,
unsigned KernelIdx,
unsigned ArgIdx,
bool InUse)
305 : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
308 using ParamList = std::vector<Param>;
310 using ParamIterator = std::vector<Param>::iterator;
312 std::vector<Param>::const_iterator
313 detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
314 for (
auto I = Params.begin(); I < Params.end(); ++I) {
316 if (I->Arg.MType == Arg.MType) {
320 if ((Arg.MSize == I->Arg.MSize) &&
321 std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
327 if (accessorEquals(Req, Other)) {
337 typename std::remove_reference_t<T>>>
338 F *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
339 ArgStorage.emplace_back(
sizeof(T));
340 auto Storage =
reinterpret_cast<F *
>(ArgStorage.back().data());
345 void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage,
void *ArgPtr,
347 ArgStorage.emplace_back(ArgSize);
348 void *Storage = ArgStorage.back().data();
353 static ParamIterator preProcessArguments(
354 std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
355 PromotionMap &PromotedAccs,
356 std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
357 std::vector<::jit_compiler::JITConstant> &JITConstants,
358 ParamList &NonIdenticalParams,
359 ::jit_compiler::ParamIdentList &ParamIdentities) {
374 storePlainArg(ArgStorage, *
static_cast<void **
>(Arg->Arg.MPtr));
381 Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
390 NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
394 JITConstants.emplace_back(
395 ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
396 Arg->Arg.MPtr, Arg->Arg.MSize);
401 auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
402 if (Identical != NonIdenticalParams.end()) {
403 ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
404 ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
405 Identical->ArgIndex};
406 ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
407 ParamIdentities.push_back(Identity);
414 auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
415 auto PromotionTarget = Internalization.PromotionTarget;
416 if (PromotionTarget == Promotion::Private ||
417 PromotionTarget == Promotion::Local) {
419 if (Internalization.KernelIndex == Arg->KernelIndex &&
420 Internalization.ArgIndex == Arg->ArgIndex) {
423 InternalizeParams.emplace_back(
424 ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
425 (PromotionTarget == Promotion::Private)
426 ? ::jit_compiler::Internalization::Private
427 : ::jit_compiler::Internalization::Local,
428 Internalization.LocalSize);
436 for (
unsigned I = 0; I < 4; ++I) {
437 Internalization.UsedParams.push_back(Arg->Used);
445 unsigned Increment = 0;
446 for (
unsigned I = 0; I < 4; ++I) {
450 if (Arg->Used && Internalization.UsedParams[I]) {
451 ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
453 ::jit_compiler::Parameter IdenticalParam{
454 Internalization.KernelIndex,
455 Internalization.ArgIndex + Increment};
456 ::jit_compiler::ParameterIdentity Identity{ThisParam,
458 ParamIdentities.push_back(Identity);
460 if (Internalization.UsedParams[I]) {
470 NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
476 NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
484 updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
485 NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
486 std::vector<std::vector<char>> &FusedArgStorage) {
487 auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
488 assert(ArgUsageInfo.size() == FusedArgs.size());
489 for (
size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
490 bool PromotedToPrivate =
491 (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
492 bool PromotedToLocal =
493 (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
494 if (PromotedToLocal || PromotedToPrivate) {
498 auto &OldArgDesc = FusedArgs[ArgIndex];
500 auto *Req =
static_cast<Requirement *
>(OldArgDesc.MPtr);
505 const size_t SizeAccField =
506 sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
508 auto LocalSize = getLocalSize(NDRange, Req,
509 (PromotedToPrivate) ? Promotion::Private
511 range<3> AccessRange{1, 1, LocalSize.value()};
512 auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
514 id<3> AcessOffset{0, 0, 0};
515 auto *OffsetArg = storePlainArg(FusedArgStorage, AcessOffset);
520 int SizeInBytes = Req->MElemSize * LocalSize.value();
521 FusedArgs[ArgIndex] =
523 static_cast<int>(ArgIndex)};
526 FusedArgs[ArgIndex] =
528 static_cast<int>(SizeAccField),
static_cast<int>(ArgIndex)};
531 FusedArgs[ArgIndex] =
533 static_cast<int>(SizeAccField),
static_cast<int>(ArgIndex)};
536 FusedArgs[ArgIndex] =
538 static_cast<int>(SizeAccField),
static_cast<int>(ArgIndex)};
546 std::unique_ptr<detail::CG>
548 std::vector<ExecCGCommand *> &InputKernels,
549 const property_list &PropList) {
552 std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
553 std::vector<std::string> InputKernelNames;
555 std::vector<std::vector<char>> ArgsStorage;
556 std::vector<detail::AccessorImplPtr> AccStorage;
557 std::vector<Requirement *> Requirements;
558 std::vector<detail::EventImplPtr> Events;
559 std::vector<::jit_compiler::NDRange> Ranges;
562 unsigned KernelIndex = 0;
563 ParamList FusedParams;
564 PromotionMap PromotedAccs;
567 for (
auto &RawCmd : InputKernels) {
568 auto *KernelCmd =
static_cast<ExecCGCommand *
>(RawCmd);
569 auto &CG = KernelCmd->getCG();
571 auto *KernelCG =
static_cast<CGExecKernel *
>(&CG);
573 auto KernelName = KernelCG->MKernelName;
574 if (KernelName.empty()) {
575 printPerformanceWarning(
576 "Cannot fuse kernel with invalid kernel function name");
579 const RTDeviceBinaryImage *DeviceImage =
nullptr;
581 if (KernelCG->getKernelBundle() !=
nullptr) {
583 auto KernelBundle = KernelCG->getKernelBundle();
588 KernelBundle->get_kernel(KernelID, KernelBundle));
590 DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
591 Program = SyclKernel->getDeviceImage()->get_program_ref();
592 }
else if (KernelCG->MSyclKernel !=
nullptr) {
594 KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
595 Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
597 auto ContextImpl = Queue->getContextImplPtr();
598 auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
599 auto DeviceImpl = Queue->getDeviceImplPtr();
600 auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
602 KernelCG->MOSModuleHandle, KernelName, Context, Device);
604 *DeviceImage, Context, Device);
606 if (!DeviceImage || !Program) {
607 printPerformanceWarning(
"No suitable IR available for fusion");
611 if (Program && (KernelCG->MSyclKernel ==
nullptr ||
612 !KernelCG->MSyclKernel->isCreatedFromSource())) {
615 KernelCG->MOSModuleHandle, Program, KernelName);
622 auto Args = KernelCG->MArgs;
623 std::sort(Args.begin(), Args.end(), [](
const ArgDesc &A,
const ArgDesc &B) {
624 return A.MIndex < B.MIndex;
627 ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor;
631 unsigned ArgFunctionIndex = 0;
632 for (
auto &Arg : Args) {
633 ArgDescriptor.Kinds.push_back(translateArgType(Arg.MType));
637 bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++];
638 ArgDescriptor.UsageMask.emplace_back(!Eliminated);
645 resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
646 KernelCG->MNDRDesc, PromotedAccs);
648 FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex,
true);
651 FusedParams.emplace_back(Arg, KernelIndex, 0,
false);
657 auto &RawDeviceImage = DeviceImage->getRawData();
658 auto DeviceImageSize =
static_cast<size_t>(RawDeviceImage.BinaryEnd -
659 RawDeviceImage.BinaryStart);
662 auto BinaryImageFormat =
663 translateBinaryImageFormat(DeviceImage->getFormat());
664 if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
665 printPerformanceWarning(
"No suitable IR available for fusion");
668 ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
669 translateBinaryImageFormat(DeviceImage->getFormat()), 0,
670 RawDeviceImage.BinaryStart, DeviceImageSize};
672 constexpr
auto SYCLTypeToIndices = [](
auto Val) -> ::jit_compiler::Indices {
673 return {Val.get(0), Val.get(1), Val.get(2)};
676 auto &CurrentNDR = KernelCG->MNDRDesc;
677 const ::jit_compiler::NDRange JITCompilerNDR{
678 static_cast<int>(CurrentNDR.Dims),
679 SYCLTypeToIndices(CurrentNDR.GlobalSize),
680 SYCLTypeToIndices(CurrentNDR.LocalSize),
681 SYCLTypeToIndices(CurrentNDR.GlobalOffset)};
683 Ranges.push_back(JITCompilerNDR);
684 InputKernelInfo.emplace_back(KernelName, ArgDescriptor, JITCompilerNDR,
686 InputKernelNames.push_back(KernelName);
690 if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
694 printPerformanceWarning(
695 "Cannot fuse kernel with hierarchical parallelism");
708 ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
709 KernelCG->getArgsStorage().end());
710 AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
711 KernelCG->getAccStorage().end());
715 Requirements.insert(Requirements.end(), KernelCG->MRequirements.begin(),
716 KernelCG->MRequirements.end());
717 Events.insert(Events.end(), KernelCG->MEvents.begin(),
718 KernelCG->MEvents.end());
722 if (KernelIndex == 0) {
723 KernelCacheConfig = KernelCG->MKernelCacheConfig;
724 }
else if (KernelCG->MKernelCacheConfig != KernelCacheConfig) {
733 std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
734 std::vector<::jit_compiler::JITConstant> JITConstants;
735 ::jit_compiler::ParamIdentList ParamIdentities;
736 ParamList NonIdenticalParameters;
737 for (
auto PI = FusedParams.begin(); PI != FusedParams.end();) {
738 PI = preProcessArguments(ArgsStorage, PI, PromotedAccs, InternalizeParams,
739 JITConstants, NonIdenticalParameters,
746 .has_property<ext::codeplay::experimental::property::no_barriers>())
750 static size_t FusedKernelNameIndex = 0;
751 std::stringstream FusedKernelName;
752 FusedKernelName <<
"fused_" << FusedKernelNameIndex++;
753 ::jit_compiler::Config JITConfig;
756 JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
757 JITConfig.set<::jit_compiler::option::JITEnableCaching>(
760 auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
761 *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
762 FusedKernelName.str(), ParamIdentities, BarrierFlags, InternalizeParams,
765 if (FusionResult.failed()) {
768 <<
"ERROR: JIT compilation for kernel fusion failed with message:\n"
769 << FusionResult.getErrorMessage() <<
"\n";
774 auto &FusedKernelInfo = FusionResult.getKernelInfo();
776 std::vector<ArgDesc> FusedArgs;
777 int FusedArgIndex = 0;
778 for (
auto &Param : FusedParams) {
781 auto &Arg = Param.Arg;
782 FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
786 const auto NDRDesc = [](
const auto &ND) -> NDRDescT {
787 constexpr
auto ToSYCLType = [](
const auto &Indices) -> sycl::range<3> {
788 return {Indices[0], Indices[1], Indices[2]};
791 NDRDesc.Dims = ND.getDimensions();
792 NDRDesc.GlobalSize = ToSYCLType(ND.getGlobalSize());
793 NDRDesc.LocalSize = ToSYCLType(ND.getLocalSize());
794 NDRDesc.GlobalOffset = ToSYCLType(ND.getOffset());
796 }(FusedKernelInfo.NDR);
797 updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
799 if (!FusionResult.cached()) {
800 auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
802 }
else if (DebugEnabled) {
803 std::cerr <<
"INFO: Re-using existing device binary for fused kernel\n";
809 FusedKernelInfo.Name);
810 std::vector<std::shared_ptr<const void>> RawExtendedMembers;
814 Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
816 std::unique_ptr<detail::CG> FusedCG;
817 FusedCG.reset(
new detail::CGExecKernel(
819 std::move(ArgsStorage), std::move(AccStorage),
820 std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
822 {}, CG::CGTYPE::Kernel, KernelCacheConfig));
827 const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
829 DeviceBinaryContainer Binary;
834 OffloadEntryContainer Entry{FusedKernelInfo.Name,
nullptr, 0, 0, 0};
835 Binary.addOffloadEntry(std::move(Entry));
838 auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
839 PropertyContainer ArgMaskProp{FusedKernelInfo.Name, ArgMask.data(),
845 PropertySetContainer ArgMaskPropSet{
848 ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
850 Binary.addProperty(std::move(ArgMaskPropSet));
852 DeviceBinariesCollection Collection;
853 Collection.addDeviceBinary(std::move(Binary),
854 FusedKernelInfo.BinaryInfo.BinaryStart,
855 FusedKernelInfo.BinaryInfo.BinarySize,
856 FusedKernelInfo.BinaryInfo.AddressBits);
858 JITDeviceBinaries.push_back(std::move(Collection));
859 return JITDeviceBinaries.back().getPIDeviceStruct();
862 std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
865 constexpr uint64_t NBytesForSize = 8;
866 constexpr uint64_t NBitsInElement = 8;
867 uint64_t Size =
static_cast<uint64_t
>(Mask.size());
869 uint64_t RoundedSize =
870 ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
871 std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
874 for (
size_t i = 0; i < NBytesForSize; ++i) {
876 static_cast<uint8_t
>((RoundedSize >> i * NBitsInElement) & 0xFF);
880 for (
size_t i = 0; i < Size; ++i) {
884 if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
885 uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
886 Byte |=
static_cast<uint8_t
>((1 << (i % NBitsInElement)));
896 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION