DPC++ Runtime
Runtime libraries for oneAPI DPC++
jit_compiler.cpp
Go to the documentation of this file.
1 //==--- jit_compiler.cpp - SYCL runtime JIT compiler for kernel fusion -----==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include <sycl/feature_test.hpp>
9 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
10 #include <KernelFusion.h>
12 #include <detail/jit_compiler.hpp>
14 #include <detail/kernel_impl.hpp>
15 #include <detail/queue_impl.hpp>
17 #include <sycl/detail/ur.hpp>
19 #include <sycl/kernel_bundle.hpp>
20 
21 namespace sycl {
22 inline namespace _V1 {
23 namespace detail {
24 
25 static inline void printPerformanceWarning(const std::string &Message) {
27  std::cerr << "WARNING: " << Message << "\n";
28  }
29 }
30 
31 jit_compiler::jit_compiler() {
32  auto checkJITLibrary = [this]() -> bool {
33  static const std::string JITLibraryName = "libsycl-fusion.so";
34 
35  void *LibraryPtr = sycl::detail::ur::loadOsLibrary(JITLibraryName);
36  if (LibraryPtr == nullptr) {
37  printPerformanceWarning("Could not find JIT library " + JITLibraryName);
38  return false;
39  }
40 
41  this->AddToConfigHandle = reinterpret_cast<AddToConfigFuncT>(
43  "addToJITConfiguration"));
44  if (!this->AddToConfigHandle) {
45  printPerformanceWarning(
46  "Cannot resolve JIT library function entry point");
47  return false;
48  }
49 
50  this->ResetConfigHandle = reinterpret_cast<ResetConfigFuncT>(
52  "resetJITConfiguration"));
53  if (!this->ResetConfigHandle) {
54  printPerformanceWarning(
55  "Cannot resolve JIT library function entry point");
56  return false;
57  }
58 
59  this->FuseKernelsHandle = reinterpret_cast<FuseKernelsFuncT>(
60  sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr, "fuseKernels"));
61  if (!this->FuseKernelsHandle) {
62  printPerformanceWarning(
63  "Cannot resolve JIT library function entry point");
64  return false;
65  }
66 
67  this->MaterializeSpecConstHandle =
68  reinterpret_cast<MaterializeSpecConstFuncT>(
70  LibraryPtr, "materializeSpecConstants"));
71  if (!this->MaterializeSpecConstHandle) {
72  printPerformanceWarning(
73  "Cannot resolve JIT library function entry point");
74  return false;
75  }
76 
77  return true;
78  };
79  Available = checkJITLibrary();
80 }
81 
82 static ::jit_compiler::BinaryFormat
83 translateBinaryImageFormat(ur::DeviceBinaryType Type) {
84  switch (Type) {
86  return ::jit_compiler::BinaryFormat::SPIRV;
88  return ::jit_compiler::BinaryFormat::LLVM;
89  default:
90  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
91  "Format unsupported for JIT compiler");
92  }
93 }
94 
95 ::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) {
96  auto Backend = Queue->getDeviceImplPtr()->getBackend();
97  switch (Backend) {
99  case backend::opencl:
100  return ::jit_compiler::BinaryFormat::SPIRV;
102  return ::jit_compiler::BinaryFormat::PTX;
104  return ::jit_compiler::BinaryFormat::AMDGCN;
105  default:
106  throw sycl::exception(
107  sycl::make_error_code(sycl::errc::feature_not_supported),
108  "Backend unsupported by kernel fusion");
109  }
110 }
111 
112 ::jit_compiler::TargetInfo getTargetInfo(QueueImplPtr &Queue) {
113  ::jit_compiler::BinaryFormat Format = getTargetFormat(Queue);
115  Format, static_cast<::jit_compiler::DeviceArchitecture>(
116  Queue->getDeviceImplPtr()->getDeviceArch()));
117 }
118 
119 static ::jit_compiler::ParameterKind
120 translateArgType(kernel_param_kind_t Kind) {
121  using PK = ::jit_compiler::ParameterKind;
122  using kind = kernel_param_kind_t;
123  switch (Kind) {
124  case kind::kind_accessor:
125  return PK::Accessor;
126  case kind::kind_std_layout:
127  return PK::StdLayout;
128  case kind::kind_sampler:
129  return PK::Sampler;
130  case kind::kind_pointer:
131  return PK::Pointer;
132  case kind::kind_specialization_constants_buffer:
133  return PK::SpecConstBuffer;
134  case kind::kind_stream:
135  return PK::Stream;
136  case kind::kind_invalid:
137  return PK::Invalid;
138  }
139  return PK::Invalid;
140 }
141 
142 enum class Promotion { None, Private, Local };
143 
144 struct PromotionInformation {
145  Promotion PromotionTarget;
146  unsigned KernelIndex;
147  unsigned ArgIndex;
148  Requirement *Definition;
149  NDRDescT NDRange;
150  size_t LocalSize;
151  size_t ElemSize;
152  std::vector<bool> UsedParams;
153 };
154 
155 using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
156 
157 template <typename Obj> Promotion getPromotionTarget(const Obj &obj) {
158  auto Result = Promotion::None;
159  if (obj.template has_property<
160  ext::codeplay::experimental::property::promote_private>()) {
161  Result = Promotion::Private;
162  }
163  if (obj.template has_property<
164  ext::codeplay::experimental::property::promote_local>()) {
165  if (Result != Promotion::None) {
166  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
167  "Two contradicting promotion properties on the "
168  "same buffer/accessor are not allowed.");
169  }
170  Result = Promotion::Local;
171  }
172  return Result;
173 }
174 
175 static Promotion getInternalizationInfo(Requirement *Req) {
176  auto AccPromotion = getPromotionTarget(Req->MPropertyList);
177 
178  auto *MemObj = static_cast<sycl::detail::SYCLMemObjT *>(Req->MSYCLMemObj);
179  if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
180  // We currently do not support promotion on non-buffer memory objects (e.g.,
181  // images).
182  return Promotion::None;
183  }
184  Promotion BuffPromotion = getPromotionTarget(*MemObj);
185  if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
186  AccPromotion != BuffPromotion) {
187  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
188  "Contradicting promotion properties on accessor and "
189  "underlying buffer are not allowed");
190  }
191  return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
192 }
193 
194 static std::optional<size_t> getLocalSize(NDRDescT NDRange,
195  std::optional<size_t> UserGlobalSize,
196  Requirement *Req, Promotion Target) {
197  assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
198  "Unexpected range rounding");
199  auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
200  if (Target == Promotion::Private) {
201  if (UserGlobalSize.has_value()) {
202  // Only the first dimension is affected by range rounding.
203  NDRange.GlobalSize[0] = *UserGlobalSize;
204  }
205  auto NumWorkItems = NDRange.GlobalSize.size();
206  // For private internalization, the local size is
207  // (Number of elements in buffer)/(number of work-items)
208  return NumElementsMem / NumWorkItems;
209  } else if (Target == Promotion::Local) {
210  if (NDRange.LocalSize.size() == 0) {
211  // No work-group size provided, cannot calculate the local size
212  // and need to bail out.
213  return {};
214  }
215  auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
216  // For local internalization, the local size is
217  // (Number of elements in buffer)/(number of work-groups)
218  return NumElementsMem / NumWorkGroups;
219  }
220  return 0;
221 }
222 
223 static bool accessorEquals(Requirement *Req, Requirement *Other) {
224  return Req->MOffset == Other->MOffset &&
225  Req->MAccessRange == Other->MAccessRange &&
226  Req->MMemoryRange == Other->MMemoryRange &&
227  Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
228  Req->MElemSize == Other->MElemSize &&
229  Req->MOffsetInBytes == Other->MOffsetInBytes &&
230  Req->MIsSubBuffer == Other->MIsSubBuffer;
231 }
232 
233 static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
234  unsigned ArgFunctionIndex, NDRDescT NDRange,
235  std::optional<size_t> UserGlobalSize,
236  PromotionMap &Promotions) {
237  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
238 
239  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
240 
241  auto ThisPromotionTarget = getInternalizationInfo(Req);
242  auto ThisLocalSize =
243  getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
244 
245  if (Promotions.count(Req->MSYCLMemObj)) {
246  // We previously encountered an accessor for the same buffer.
247  auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj);
248 
249  switch (ThisPromotionTarget) {
250  case Promotion::None: {
251  if (PreviousDefinition.PromotionTarget != Promotion::None) {
252  printPerformanceWarning(
253  "Deactivating previously specified promotion, because this "
254  "accessor does not specify promotion");
255  PreviousDefinition.PromotionTarget = Promotion::None;
256  }
257  return;
258  }
259  case Promotion::Local: {
260  if (PreviousDefinition.PromotionTarget == Promotion::None) {
261  printPerformanceWarning(
262  "Not performing specified local promotion, due to previous "
263  "mismatch or because previous accessor specified no promotion");
264  return;
265  }
266  if (!ThisLocalSize.has_value()) {
267  printPerformanceWarning("Work-group size for local promotion not "
268  "specified, not performing internalization");
269  PreviousDefinition.PromotionTarget = Promotion::None;
270  return;
271  }
272  if (PreviousDefinition.PromotionTarget == Promotion::Private) {
273  printPerformanceWarning(
274  "Overriding previous private promotion with local promotion");
275  // Recompute the local size for the previous definition with adapted
276  // promotion target.
277  auto NewPrevLocalSize =
278  getLocalSize(PreviousDefinition.NDRange, std::nullopt,
279  PreviousDefinition.Definition, Promotion::Local);
280 
281  if (!NewPrevLocalSize.has_value()) {
282  printPerformanceWarning(
283  "Not performing specified local promotion because previous "
284  "kernels did not specify a local size");
285  PreviousDefinition.PromotionTarget = Promotion::None;
286  return;
287  }
288 
289  PreviousDefinition.LocalSize = NewPrevLocalSize.value();
290  PreviousDefinition.PromotionTarget = Promotion::Local;
291  }
292  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
293  printPerformanceWarning("Not performing specified local promotion due "
294  "to work-group size mismatch");
295  PreviousDefinition.PromotionTarget = Promotion::None;
296  return;
297  }
298  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
299  printPerformanceWarning("Not performing specified promotion, due to "
300  "accessor parameter mismatch");
301  PreviousDefinition.PromotionTarget = Promotion::None;
302  return;
303  }
304  return;
305  }
306  case Promotion::Private: {
307  if (PreviousDefinition.PromotionTarget == Promotion::None) {
308  printPerformanceWarning(
309  "Not performing specified private promotion, due to previous "
310  "mismatch or because previous accessor specified no promotion");
311  return;
312  }
313 
314  if (PreviousDefinition.PromotionTarget == Promotion::Local) {
315  // Recompute the local size with adapted promotion target.
316  auto ThisLocalSize =
317  getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
318  if (!ThisLocalSize.has_value()) {
319  printPerformanceWarning("Work-group size for local promotion not "
320  "specified, not performing internalization");
321  PreviousDefinition.PromotionTarget = Promotion::None;
322  return;
323  }
324 
325  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
326  printPerformanceWarning(
327  "Not performing specified local promotion due "
328  "to work-group size mismatch");
329  PreviousDefinition.PromotionTarget = Promotion::None;
330  return;
331  }
332 
333  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
334  printPerformanceWarning("Not performing local promotion, due to "
335  "accessor parameter mismatch");
336  PreviousDefinition.PromotionTarget = Promotion::None;
337  return;
338  }
339 
340  printPerformanceWarning(
341  "Performing local internalization instead, because previous "
342  "accessor specified local promotion");
343  return;
344  }
345 
346  // Previous accessors also specified private promotion.
347  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
348  printPerformanceWarning(
349  "Not performing specified private promotion due "
350  "to work-group size mismatch");
351  PreviousDefinition.PromotionTarget = Promotion::None;
352  return;
353  }
354  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
355  printPerformanceWarning("Not performing specified promotion, due to "
356  "accessor parameter mismatch");
357  PreviousDefinition.PromotionTarget = Promotion::None;
358  return;
359  }
360  return;
361  }
362  }
363  } else {
364  if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
365  printPerformanceWarning("Work-group size for local promotion not "
366  "specified, not performing internalization");
367  ThisPromotionTarget = Promotion::None;
368  ThisLocalSize = 0;
369  }
370  assert(ThisLocalSize.has_value());
371  Promotions.emplace(
372  Req->MSYCLMemObj,
373  PromotionInformation{ThisPromotionTarget, KernelIndex, ArgFunctionIndex,
374  Req, NDRange, ThisLocalSize.value(),
375  Req->MElemSize, std::vector<bool>()});
376  }
377 }
378 
379 // Identify a parameter by the argument description, the kernel index and the
380 // parameter index in that kernel.
381 struct Param {
382  ArgDesc Arg;
383  unsigned KernelIndex;
384  unsigned ArgIndex;
385  bool Used;
386  Param(ArgDesc Argument, unsigned KernelIdx, unsigned ArgIdx, bool InUse)
387  : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
388 };
389 
390 using ParamList = std::vector<Param>;
391 
392 using ParamIterator = std::vector<Param>::iterator;
393 
394 std::vector<Param>::const_iterator
395 detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
396  for (auto I = Params.begin(); I < Params.end(); ++I) {
397  // Two arguments of different type can never be identical.
398  if (I->Arg.MType == Arg.MType) {
399  if (Arg.MType == kernel_param_kind_t::kind_pointer ||
401  // Compare size and, if the size is identical, the content byte-by-byte.
402  if ((Arg.MSize == I->Arg.MSize) &&
403  std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
404  return I;
405  }
406  } else if (Arg.MType == kernel_param_kind_t::kind_accessor) {
407  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
408  Requirement *Other = static_cast<Requirement *>(I->Arg.MPtr);
409  if (accessorEquals(Req, Other)) {
410  return I;
411  }
412  }
413  }
414  }
415  return Params.end();
416 }
417 
418 void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage, void *ArgPtr,
419  size_t ArgSize) {
420  ArgStorage.emplace_back(ArgSize);
421  void *Storage = ArgStorage.back().data();
422  std::memcpy(Storage, ArgPtr, ArgSize);
423  return Storage;
424 }
425 
426 template <typename T>
427 void *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
428  return storePlainArgRaw(ArgStorage, &Arg, sizeof(T));
429 }
430 
431 static ParamIterator preProcessArguments(
432  std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
433  PromotionMap &PromotedAccs,
434  std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
435  std::vector<::jit_compiler::JITConstant> &JITConstants,
436  ParamList &NonIdenticalParams,
437  std::vector<::jit_compiler::ParameterIdentity> &ParamIdentities) {
438 
439  // Unused arguments are still in the list at this point (because we
440  // need them for accessor handling), but there's not pre-processing
441  // that needs to be done.
442  if (!Arg->Used) {
443  return ++Arg;
444  }
445 
446  if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
447  // Pointer arguments are only stored in the kernel functor object, which
448  // will go out-of-scope before we execute the fused kernel. Therefore, we
449  // need to copy the pointer (not the memory it's pointing to) to a permanent
450  // location and update the argument.
451  Arg->Arg.MPtr =
452  storePlainArg(ArgStorage, *static_cast<void **>(Arg->Arg.MPtr));
453  }
454  if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
455  // Standard layout arguments are only stored in the kernel functor object,
456  // which will go out-of-scope before we execute the fused kernel. Therefore,
457  // we need to copy the argument to a permant location and update the
458  // argument.
459  if (Arg->Arg.MPtr) {
460  Arg->Arg.MPtr =
461  storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
462  // Propagate values of scalar parameters as constants to the JIT
463  // compiler.
464  JITConstants.emplace_back(
465  ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
466  Arg->Arg.MPtr, Arg->Arg.MSize);
467  }
468  // Standard layout arguments do not participate in identical argument
469  // detection, but we still add it to the list here. As the SYCL runtime can
470  // only check the raw bytes for identical content, but is unaware of the
471  // underlying datatype, some identities that would be detected here could
472  // not be materialized by the JIT compiler. Instead of removing some
473  // standard layout arguments due to identity and missing some in case the
474  // materialization is not possible, we rely on constant propagation to
475  // replace standard layout arguments by constants.
476  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
477  true);
478  return ++Arg;
479  }
480  // First check if there's already another parameter with identical
481  // value.
482  auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
483  if (Identical != NonIdenticalParams.end()) {
484  ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
485  ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
486  Identical->ArgIndex};
487  ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
488  ParamIdentities.push_back(Identity);
489  return ++Arg;
490  }
491 
492  if (Arg->Arg.MType == kernel_param_kind_t::kind_accessor) {
493  // Get local and private promotion information from accessors.
494  Requirement *Req = static_cast<Requirement *>(Arg->Arg.MPtr);
495  auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
496  auto PromotionTarget = Internalization.PromotionTarget;
497  if (PromotionTarget == Promotion::Private ||
498  PromotionTarget == Promotion::Local) {
499  // The accessor should be promoted.
500  if (Internalization.KernelIndex == Arg->KernelIndex &&
501  Internalization.ArgIndex == Arg->ArgIndex) {
502  // This is the first accessor for this buffer that should be
503  // internalized.
504  InternalizeParams.emplace_back(
505  ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
506  (PromotionTarget == Promotion::Private)
507  ? ::jit_compiler::Internalization::Private
508  : ::jit_compiler::Internalization::Local,
509  Internalization.LocalSize, Internalization.ElemSize);
510  // If an accessor will be promoted, i.e., if it has the promotion
511  // property attached to it, the next three arguments, that are
512  // associated with the accessor (access range, memory range, offset),
513  // must not participate in identical parameter detection or constant
514  // propagation, because their values will change if promotion happens.
515  // Therefore, we can just skip them here, but we need to remember which
516  // of them are used.
517  for (unsigned I = 0; I < 4; ++I) {
518  Internalization.UsedParams.push_back(Arg->Used);
519  ++Arg;
520  }
521  } else {
522  // We have previously encountered an accessor the same buffer, which
523  // should be internalized. We can add parameter identities for the
524  // accessor argument and the next three arguments (range, memory range
525  // and offset, if they are used).
526  unsigned Increment = 0;
527  for (unsigned I = 0; I < 4; ++I) {
528  // If the argument is used in both cases, i.e., on the original
529  // accessor to be internalized, and this one, we can insert a
530  // parameter identity.
531  if (Arg->Used && Internalization.UsedParams[I]) {
532  ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
533  Arg->ArgIndex};
534  ::jit_compiler::Parameter IdenticalParam{
535  Internalization.KernelIndex,
536  Internalization.ArgIndex + Increment};
537  ::jit_compiler::ParameterIdentity Identity{ThisParam,
538  IdenticalParam};
539  ParamIdentities.push_back(Identity);
540  }
541  if (Internalization.UsedParams[I]) {
542  ++Increment;
543  }
544  ++Arg;
545  }
546  }
547  return Arg;
548  } else {
549  // The accessor will not be promoted, so it can participate in identical
550  // parameter detection.
551  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
552  true);
553  return ++Arg;
554  }
555  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
556  // No identical parameter exists, so add this to the list.
557  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
558  true);
559  return ++Arg;
560  }
561  return ++Arg;
562 }
563 
564 static void
565 updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
566  NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
567  std::vector<std::vector<char>> &FusedArgStorage) {
568  auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
569  assert(ArgUsageInfo.size() == FusedArgs.size());
570  for (size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
571  bool PromotedToPrivate =
572  (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
573  bool PromotedToLocal =
574  (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
575  if (PromotedToLocal || PromotedToPrivate) {
576  // For each internalized accessor, we need to override four arguments
577  // (see 'addArgsForGlobalAccessor' in handler.cpp for reference), i.e.,
578  // the pointer itself, plus twice the range and the offset.
579  auto &OldArgDesc = FusedArgs[ArgIndex];
580  assert(OldArgDesc.MType == kernel_param_kind_t::kind_accessor);
581  auto *Req = static_cast<Requirement *>(OldArgDesc.MPtr);
582 
583  // The stored args are all three-dimensional, but depending on the
584  // actual number of dimensions of the accessor, only a part of that
585  // argument is later on passed to the kernel.
586  const size_t SizeAccField =
587  sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
588  // Compute the local size and use it for the range parameters (only
589  // relevant for local promotion).
590  size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
591  Req, Promotion::Local)
592  : 0;
593  range<3> AccessRange{1, 1, LocalSize};
594  void *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
595  // Use all-zero as the offset
596  id<3> AcessOffset{0, 0, 0};
597  void *OffsetArg = storePlainArg(FusedArgStorage, AcessOffset);
598 
599  // Override the arguments.
600  // 1. Override the pointer with a std-layout argument with 'nullptr' as
601  // value. handler.cpp does the same for local accessors.
602  int SizeInBytes = Req->MElemSize * LocalSize;
603  FusedArgs[ArgIndex] =
604  ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
605  static_cast<int>(ArgIndex)};
606  ++ArgIndex;
607  // 2. Access Range
608  FusedArgs[ArgIndex] =
609  ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
610  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
611  ++ArgIndex;
612  // 3. Memory Range
613  FusedArgs[ArgIndex] =
614  ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
615  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
616  ++ArgIndex;
617  // 4. Offset
618  FusedArgs[ArgIndex] =
619  ArgDesc{kernel_param_kind_t::kind_std_layout, OffsetArg,
620  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
621  ++ArgIndex;
622  } else {
623  ++ArgIndex;
624  }
625  }
626 }
627 
628 ur_kernel_handle_t jit_compiler::materializeSpecConstants(
629  QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage,
630  const std::string &KernelName,
631  const std::vector<unsigned char> &SpecConstBlob) {
632  if (!BinImage) {
633  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
634  "No suitable IR available for materializing");
635  }
636  if (KernelName.empty()) {
637  throw sycl::exception(
638  sycl::make_error_code(sycl::errc::invalid),
639  "Cannot jit kernel with invalid kernel function name");
640  }
642  if (auto CachedKernel =
643  PM.getCachedMaterializedKernel(KernelName, SpecConstBlob))
644  return CachedKernel;
645 
646  auto &RawDeviceImage = BinImage->getRawData();
647  auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
648  RawDeviceImage.BinaryStart);
649  // Set 0 as the number of address bits, because the JIT compiler can set this
650  // field based on information from LLVM module's data-layout.
651  auto BinaryImageFormat = translateBinaryImageFormat(BinImage->getFormat());
652  if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
653  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
654  "No suitable IR available for materializing");
655  }
656  ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
657  BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
658 
659  ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
660  AddToConfigHandle(
661  ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo)));
662  bool DebugEnabled =
664  AddToConfigHandle(
665  ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
666 
667  std::string TargetCPU =
669  std::string TargetFeatures =
671 
672  auto MaterializerResult =
673  MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob,
674  TargetCPU.c_str(), TargetFeatures.c_str());
675  if (MaterializerResult.failed()) {
676  std::string Message{"Compilation for kernel failed with message:\n"};
677  Message.append(MaterializerResult.getErrorMessage());
678  if (DebugEnabled) {
679  std::cerr << Message << "\n";
680  }
681  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), Message);
682  }
683 
684  auto &MaterializerKernelInfo = MaterializerResult.getKernelInfo();
685  sycl_device_binary_struct MaterializedRawDeviceImage{RawDeviceImage};
686  MaterializedRawDeviceImage.BinaryStart =
687  MaterializerKernelInfo.BinaryInfo.BinaryStart;
688  MaterializedRawDeviceImage.BinaryEnd =
689  MaterializerKernelInfo.BinaryInfo.BinaryStart +
690  MaterializerKernelInfo.BinaryInfo.BinarySize;
691 
692  const bool OrigCacheCfg = SYCLConfig<SYCL_CACHE_IN_MEM>::get();
693  if (OrigCacheCfg) {
694  if (0 != setenv("SYCL_CACHE_IN_MEM", "0", true)) {
695  throw sycl::exception(
696  sycl::make_error_code(sycl::errc::invalid),
697  "Failed to set env variable in materialize spec constel.");
698  }
700  }
701 
702  RTDeviceBinaryImage MaterializedRTDevBinImage{&MaterializedRawDeviceImage};
703  const auto &Context = Queue->get_context();
704  const auto &Device = Queue->get_device();
705  auto NewKernel = PM.getOrCreateMaterializedKernel(
706  MaterializedRTDevBinImage, Context, Device, KernelName, SpecConstBlob);
707 
708  if (OrigCacheCfg) {
709  if (0 != setenv("SYCL_CACHE_IN_MEM", "1", true)) {
710  throw sycl::exception(
711  sycl::make_error_code(sycl::errc::invalid),
712  "Failed to set env variable in materialize spec const.");
713  }
715  }
716 
717  return NewKernel;
718 }
719 
720 std::unique_ptr<detail::CG>
722  std::vector<ExecCGCommand *> &InputKernels,
723  const property_list &PropList) {
724  if (!isAvailable()) {
725  printPerformanceWarning("JIT library not available");
726  return nullptr;
727  }
728  if (InputKernels.empty()) {
729  printPerformanceWarning("Fusion list is empty");
730  return nullptr;
731  }
732 
733  // Retrieve the device binary from each of the input
734  // kernels to hand them over to the JIT compiler.
735  std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
736  std::vector<std::string> InputKernelNames;
737  // Collect argument information from all input kernels.
738 
739  detail::CG::StorageInitHelper CGData;
740  std::vector<std::vector<char>> &ArgsStorage = CGData.MArgsStorage;
741  std::vector<detail::AccessorImplPtr> &AccStorage = CGData.MAccStorage;
742  std::vector<Requirement *> &Requirements = CGData.MRequirements;
743  std::vector<detail::EventImplPtr> &Events = CGData.MEvents;
744  std::vector<::jit_compiler::NDRange> Ranges;
745  ur_kernel_cache_config_t KernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT;
746  unsigned KernelIndex = 0;
747  ParamList FusedParams;
748  PromotionMap PromotedAccs;
749  // TODO: Collect information about streams and figure out how
750  // to fuse them.
751  for (auto &RawCmd : InputKernels) {
752  auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
753  auto &CG = KernelCmd->getCG();
754  assert(KernelCmd->isFusable());
755  auto *KernelCG = static_cast<CGExecKernel *>(&CG);
756 
757  auto KernelName = KernelCG->MKernelName;
758  if (KernelName.empty()) {
759  printPerformanceWarning(
760  "Cannot fuse kernel with invalid kernel function name");
761  return nullptr;
762  }
763 
764  auto [DeviceImage, Program] =
765  retrieveKernelBinary(Queue, KernelName.c_str(), KernelCG);
766  if (!DeviceImage || !Program) {
767  printPerformanceWarning("No suitable IR available for fusion");
768  return nullptr;
769  }
770  const KernelArgMask *EliminatedArgs = nullptr;
771  if (Program && (KernelCG->MSyclKernel == nullptr ||
772  !KernelCG->MSyclKernel->isCreatedFromSource())) {
773  EliminatedArgs =
775  Program, KernelName);
776  }
777 
778  // Collect information about the arguments of this kernel.
779 
780  // Might need to sort the arguments in case they are not already sorted,
781  // see also the similar code in commands.cpp.
782  auto Args = KernelCG->MArgs;
783  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
784  return A.MIndex < B.MIndex;
785  });
786 
787  // Determine whether the kernel has been subject to DPCPP's range rounding.
788  // If so, the first argument will be the original ("user") range.
789  std::optional<size_t> UserGlobalSize;
790  if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
791  KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
792  !Args.empty()) {
793  auto &A0 = Args[0];
794  [[maybe_unused]] auto Dims = KernelCG->MNDRDesc.Dims;
795  assert(A0.MPtr && A0.MSize == static_cast<int>(Dims * sizeof(size_t)) &&
797  "Unexpected signature for rounded range kernel");
798 
799  size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
800  // Range-rounding only applies to the first dimension.
801  assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
802  assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
803  assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
804  UserGlobalSize = UGS[0];
805  }
806 
807  ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
808  size_t ArgIndex = 0;
809  // The kernel function in SPIR-V will only have the non-eliminated
810  // arguments, so keep track of this "actual" argument index.
811  unsigned ArgFunctionIndex = 0;
812  auto KindIt = ArgDescriptor.Kinds.begin();
813  auto UsageMaskIt = ArgDescriptor.UsageMask.begin();
814  for (auto &Arg : Args) {
815  *KindIt = translateArgType(Arg.MType);
816  ++KindIt;
817 
818  // DPC++ internally uses 'true' to indicate that an argument has been
819  // eliminated, while the JIT compiler uses 'true' to indicate an
820  // argument is used. Translate this here.
821  bool Eliminated = EliminatedArgs && !EliminatedArgs->empty() &&
822  (*EliminatedArgs)[ArgIndex++];
823  *UsageMaskIt = !Eliminated;
824  ++UsageMaskIt;
825 
826  // If the argument has not been eliminated, i.e., is still present on
827  // the kernel function in LLVM-IR/SPIR-V, collect information about the
828  // argument for performance optimizations in the JIT compiler.
829  if (!Eliminated) {
830  if (Arg.MType == kernel_param_kind_t::kind_accessor) {
831  resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
832  KernelCG->MNDRDesc, UserGlobalSize,
833  PromotedAccs);
834  }
835  FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
836  ++ArgFunctionIndex;
837  } else {
838  FusedParams.emplace_back(Arg, KernelIndex, 0, false);
839  }
840  }
841 
842  // TODO: Check for the correct kernel bundle state of the device image?
843  auto &RawDeviceImage = DeviceImage->getRawData();
844  auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
845  RawDeviceImage.BinaryStart);
846  // Set 0 as the number of address bits, because the JIT compiler can set
847  // this field based on information from SPIR-V/LLVM module's data-layout.
848  auto BinaryImageFormat =
849  translateBinaryImageFormat(DeviceImage->getFormat());
850  if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
851  printPerformanceWarning("No suitable IR available for fusion");
852  return nullptr;
853  }
854  ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
855  BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
856 
857  constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices {
858  return {Val.get(0), Val.get(1), Val.get(2)};
859  };
860 
861  auto &CurrentNDR = KernelCG->MNDRDesc;
862  const ::jit_compiler::NDRange JITCompilerNDR{
863  static_cast<int>(CurrentNDR.Dims),
864  SYCLTypeToIndices(CurrentNDR.GlobalSize),
865  SYCLTypeToIndices(CurrentNDR.LocalSize),
866  SYCLTypeToIndices(CurrentNDR.GlobalOffset)};
867 
868  Ranges.push_back(JITCompilerNDR);
869  InputKernelInfo.emplace_back(KernelName.c_str(), ArgDescriptor,
870  JITCompilerNDR, BinInfo);
871 
872  // Collect information for the fused kernel
873 
874  if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
875  // Some overloads of parallel_for_work_group only specify the number of
876  // work-groups, so this can be used to identify hierarchical parallel
877  // kernels, which are not supported by fusion for now.
878  printPerformanceWarning(
879  "Cannot fuse kernel with hierarchical parallelism");
880  return nullptr;
881  // Not all overloads of parallel_for_work_group only specify the number of
882  // work-groups, so the above mechanism might not detect all hierarchical
883  // parallelism.
884  // TODO: Find a more reliable way to detect hierarchical parallelism.
885  }
886 
887  // We need to copy the storages here. The input CGs might be eliminated
888  // before the fused kernel gets executed, so we need to copy the storages
889  // here to make sure the arguments don't die on us before executing the
890  // fused kernel.
891  ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
892  KernelCG->getArgsStorage().end());
893  AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
894  KernelCG->getAccStorage().end());
895  // TODO: Does the MSharedPtrStorage contain any information about actual
896  // shared pointers beside the kernel bundle and handler impl? If yes, we
897  // might need to copy it here.
898  Requirements.insert(Requirements.end(), KernelCG->getRequirements().begin(),
899  KernelCG->getRequirements().end());
900  Events.insert(Events.end(), KernelCG->getEvents().begin(),
901  KernelCG->getEvents().end());
902 
903  // If all kernels have the same cache config then use it for the merged
904  // kernel, otherwise use default configuration.
905  if (KernelIndex == 0) {
906  KernelCacheConfig = KernelCG->MKernelCacheConfig;
907  } else if (KernelCG->MKernelCacheConfig != KernelCacheConfig) {
908  KernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT;
909  }
910 
911  ++KernelIndex;
912  }
913 
914  // Pre-process the arguments, to detect identical parameters or arguments that
915  // can be constant-propagated by the JIT compiler.
916  std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
917  std::vector<::jit_compiler::JITConstant> JITConstants;
918  std::vector<::jit_compiler::ParameterIdentity> ParamIdentities;
919  ParamList NonIdenticalParameters;
920  for (auto UR = FusedParams.begin(); UR != FusedParams.end();) {
921  UR = preProcessArguments(ArgsStorage, UR, PromotedAccs, InternalizeParams,
922  JITConstants, NonIdenticalParameters,
923  ParamIdentities);
924  }
925 
926  // Retrieve barrier flags.
927  ::jit_compiler::BarrierFlags BarrierFlags =
928  (PropList
929  .has_property<ext::codeplay::experimental::property::no_barriers>())
930  ? ::jit_compiler::getNoBarrierFlag()
931  : ::jit_compiler::getLocalAndGlobalBarrierFlag();
932 
933  static size_t FusedKernelNameIndex = 0;
934  auto FusedKernelName = "fused_" + std::to_string(FusedKernelNameIndex++);
935  ResetConfigHandle();
936  bool DebugEnabled =
938  AddToConfigHandle(
939  ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
940  AddToConfigHandle(::jit_compiler::option::JITEnableCaching::set(
942 
943  ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
944  ::jit_compiler::BinaryFormat TargetFormat = TargetInfo.getFormat();
945  AddToConfigHandle(
946  ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo)));
947 
948  auto FusionResult = FuseKernelsHandle(
949  InputKernelInfo, FusedKernelName.c_str(), ParamIdentities, BarrierFlags,
950  InternalizeParams, JITConstants);
951 
952  if (FusionResult.failed()) {
953  if (DebugEnabled) {
954  std::cerr
955  << "ERROR: JIT compilation for kernel fusion failed with message:\n"
956  << FusionResult.getErrorMessage() << "\n";
957  }
958  return nullptr;
959  }
960 
961  auto &FusedKernelInfo = FusionResult.getKernelInfo();
962  std::string FusedOrCachedKernelName{FusedKernelInfo.Name.c_str()};
963 
964  std::vector<ArgDesc> FusedArgs;
965  int FusedArgIndex = 0;
966  for (auto &Param : FusedParams) {
967  // Add to the argument list of the fused kernel, but with the correct
968  // new index in the fused kernel.
969  auto &Arg = Param.Arg;
970  FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
971  }
972 
973  // Update the kernel arguments for internalized accessors.
974  const auto NDRDesc = [](const auto &ND) -> NDRDescT {
975  constexpr auto ToSYCLType = [](const auto &Indices) -> sycl::range<3> {
976  return {Indices[0], Indices[1], Indices[2]};
977  };
978  NDRDescT NDRDesc;
979  NDRDesc.Dims = ND.getDimensions();
980  NDRDesc.GlobalSize = ToSYCLType(ND.getGlobalSize());
981  NDRDesc.LocalSize = ToSYCLType(ND.getLocalSize());
982  NDRDesc.GlobalOffset = ToSYCLType(ND.getOffset());
983  return NDRDesc;
984  }(FusedKernelInfo.NDR);
985  updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
986 
987  if (!FusionResult.cached()) {
988  auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
990  } else {
991  if (DebugEnabled) {
992  std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
993  }
994  }
995 
996  // Create a kernel bundle for the fused kernel.
997  // Kernel bundles are stored in the CG as one of the "extended" members.
999  FusedOrCachedKernelName);
1000 
1001  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr;
1002  if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) {
1003  detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
1004  Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
1005  }
1006 
1007  std::unique_ptr<detail::CG> FusedCG;
1008  FusedCG.reset(new detail::CGExecKernel(
1009  NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
1010  std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {},
1011  CGType::Kernel, KernelCacheConfig, false /* KernelIsCooperative */,
1012  false /* KernelUsesClusterLaunch*/));
1013  return FusedCG;
1014 }
1015 
1016 sycl_device_binaries jit_compiler::createPIDeviceBinary(
1017  const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
1018  ::jit_compiler::BinaryFormat Format) {
1019 
1020  const char *TargetSpec = nullptr;
1022  switch (Format) {
1023  case ::jit_compiler::BinaryFormat::PTX: {
1025  BinFormat = SYCL_DEVICE_BINARY_TYPE_NONE;
1026  break;
1027  }
1028  case ::jit_compiler::BinaryFormat::AMDGCN: {
1030  BinFormat = SYCL_DEVICE_BINARY_TYPE_NONE;
1031  break;
1032  }
1033  case ::jit_compiler::BinaryFormat::SPIRV: {
1034  TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
1037  BinFormat = SYCL_DEVICE_BINARY_TYPE_SPIRV;
1038  break;
1039  }
1040  default:
1041  sycl::exception(sycl::make_error_code(sycl::errc::invalid),
1042  "Invalid output format");
1043  }
1044 
1045  std::string FusedKernelName{FusedKernelInfo.Name.c_str()};
1046  DeviceBinaryContainer Binary;
1047 
1048  // Create an offload entry for the fused kernel.
1049  // It seems to be OK to set zero for most of the information here, at least
1050  // that is the case for compiled SPIR-V binaries.
1051  OffloadEntryContainer Entry{FusedKernelName, nullptr, 0, 0, 0};
1052  Binary.addOffloadEntry(std::move(Entry));
1053 
1054  // Create a property entry for the argument usage mask for the fused kernel.
1055  auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
1056  PropertyContainer ArgMaskProp{
1057  FusedKernelName, ArgMask.data(), ArgMask.size(),
1059 
1060  // Create a property set for the argument usage masks of all kernels
1061  // (currently only one).
1062  PropertySetContainer ArgMaskPropSet{
1064 
1065  ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
1066 
1067  Binary.addProperty(std::move(ArgMaskPropSet));
1068 
1069  if (Format == ::jit_compiler::BinaryFormat::PTX ||
1070  Format == ::jit_compiler::BinaryFormat::AMDGCN) {
1071  // Add a program metadata property with the reqd_work_group_size attribute.
1072  // See CUDA UR (ur_cuda.cpp) _ur_program::set_metadata for reference.
1073  auto ReqdWGS = std::find_if(
1074  FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(),
1075  [](const ::jit_compiler::SYCLKernelAttribute &Attr) {
1076  return Attr.Kind == ::jit_compiler::SYCLKernelAttribute::AttrKind::
1077  ReqdWorkGroupSize;
1078  });
1079  if (ReqdWGS != FusedKernelInfo.Attributes.end()) {
1080  auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS);
1081  std::stringstream PropName;
1082  PropName << FusedKernelInfo.Name.c_str();
1084  PropertyContainer ReqdWorkGroupSizeProp{
1085  PropName.str(), Encoded.data(), Encoded.size(),
1087  PropertySetContainer ProgramMetadata{
1089  ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp));
1090  Binary.addProperty(std::move(ProgramMetadata));
1091  }
1092  }
1093  if (Format == ::jit_compiler::BinaryFormat::AMDGCN) {
1094  PropertyContainer NeedFinalization{
1096  PropertySetContainer ProgramMetadata{__SYCL_PROPERTY_SET_PROGRAM_METADATA};
1097  ProgramMetadata.addProperty(std::move(NeedFinalization));
1098  Binary.addProperty(std::move(ProgramMetadata));
1099  }
1100 
1101  DeviceBinariesCollection Collection;
1102  Collection.addDeviceBinary(
1103  std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
1104  FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
1105 
1106  JITDeviceBinaries.push_back(std::move(Collection));
1107  return JITDeviceBinaries.back().getPIDeviceStruct();
1108 }
1109 
1110 std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
1111  const ::jit_compiler::ArgUsageMask &Mask) const {
1112  // This must match the decoding logic in program_manager.cpp.
1113  constexpr uint64_t NBytesForSize = 8;
1114  constexpr uint64_t NBitsInElement = 8;
1115  uint64_t Size = static_cast<uint64_t>(Mask.size());
1116  // Round the size to the next multiple of 8
1117  uint64_t RoundedSize =
1118  ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
1119  std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
1120  0u);
1121  // First encode the size of the actual mask
1122  for (size_t i = 0; i < NBytesForSize; ++i) {
1123  uint8_t Byte =
1124  static_cast<uint8_t>((RoundedSize >> i * NBitsInElement) & 0xFF);
1125  Encoded[i] = Byte;
1126  }
1127  // Encode the actual mask bit-wise
1128  for (size_t i = 0; i < Size; ++i) {
1129  // DPC++ internally uses 'true' to indicate that an argument has been
1130  // eliminated, while the JIT compiler uses 'true' to indicate an argument
1131  // is used. Translate this here.
1132  if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
1133  uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
1134  Byte |= static_cast<uint8_t>((1 << (i % NBitsInElement)));
1135  }
1136  }
1137  return Encoded;
1138 }
1139 
1140 std::vector<uint8_t> jit_compiler::encodeReqdWorkGroupSize(
1141  const ::jit_compiler::SYCLKernelAttribute &Attr) const {
1142  assert(Attr.Kind ==
1143  ::jit_compiler::SYCLKernelAttribute::AttrKind::ReqdWorkGroupSize);
1144  size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t));
1145  std::vector<uint8_t> Encoded(NumBytes, 0u);
1146  uint8_t *Ptr = Encoded.data();
1147  // Skip 64-bit wide size argument with value 0 at the start of the data.
1148  // See CUDA UR (ur_cuda.cpp) _ur_program::set_metadata for reference.
1149  Ptr += sizeof(uint64_t);
1150  for (const auto &Val : Attr.Values) {
1151  auto UVal = static_cast<uint32_t>(Val);
1152  std::memcpy(Ptr, &UVal, sizeof(uint32_t));
1153  Ptr += sizeof(uint32_t);
1154  }
1155  return Encoded;
1156 }
1157 
1158 } // namespace detail
1159 } // namespace _V1
1160 } // namespace sycl
1161 
1162 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
void addImages(sycl_device_binaries DeviceImages)
static ProgramManager & getInstance()
const KernelArgMask * getEliminatedKernelArgMask(ur_program_handle_t NativePrg, const std::string &KernelName)
Returns the mask for eliminated kernel arguments for the requested kernel within the native program.
kernel_id getSYCLKernelID(const std::string &KernelName)
static const char * get()
Definition: config.hpp:115
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
ur_kernel_handle_t materializeSpecConstants(QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, const std::string &KernelName, const std::vector< unsigned char > &SpecConstBlob)
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: range.hpp:26
#define __SYCL_PROPERTY_SET_KERNEL_PARAM_OPT_INFO
PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO defined in PropertySetIO.h.
Definition: compiler.hpp:47
#define __SYCL_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE
Program metadata tags recognized by the PI backends.
Definition: compiler.hpp:69
#define __SYCL_PROPERTY_SET_PROGRAM_METADATA
PropertySetRegistry::SYCL_KERNEL_PROGRAM_METADATA defined in PropertySetIO.h.
Definition: compiler.hpp:49
@ SYCL_PROPERTY_TYPE_BYTE_ARRAY
Definition: compiler.hpp:88
#define __SYCL_DEVICE_BINARY_TARGET_AMDGCN
Definition: compiler.hpp:32
#define __SYCL_DEVICE_BINARY_TARGET_NVPTX64
PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device.
Definition: compiler.hpp:31
sycl_device_binary_type
Types of device binary.
Definition: compiler.hpp:114
@ SYCL_DEVICE_BINARY_TYPE_SPIRV
Definition: compiler.hpp:117
@ SYCL_DEVICE_BINARY_TYPE_LLVMIR_BITCODE
Definition: compiler.hpp:118
@ SYCL_DEVICE_BINARY_TYPE_NONE
Definition: compiler.hpp:115
@ SYCL_DEVICE_BINARY_TYPE_NATIVE
Definition: compiler.hpp:116
#define __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION
Definition: compiler.hpp:72
#define __SYCL_DEVICE_BINARY_TARGET_SPIRV64
SPIR-V 64-bit image <-> "spir64", 64-bit OpenCL device.
Definition: compiler.hpp:24
#define __SYCL_DEVICE_BINARY_TARGET_SPIRV32
SPIR-V 32-bit image <-> "spir", 32-bit OpenCL device.
Definition: compiler.hpp:22
DynArray< uint8_t > ArgUsageMask
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:198
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
void * getOsLibraryFuncAddress(void *Library, const std::string &FunctionName)
Definition: posix_ur.cpp:34
::sycl_device_binary_type DeviceBinaryType
Definition: ur_utils.hpp:72
void * loadOsLibrary(const std::string &Library)
Definition: posix_ur.cpp:20
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:31
std::vector< bool > KernelArgMask
AccessorImplHost Requirement
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: helpers.hpp:45
std::tuple< const RTDeviceBinaryImage *, ur_program_handle_t > retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, CGExecKernel *CGKernel=nullptr)
Definition: helpers.cpp:38
static constexpr bool has_property()
PropertyListT Accessor
Definition: multi_ptr.hpp:510
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:65
Definition: access.hpp:18
This struct is a record of all the device code that may be offloaded.
Definition: compiler.hpp:186
This struct is a record of the device binary information.
Definition: compiler.hpp:132
const unsigned char * BinaryStart
Pointer to the target code start.
Definition: compiler.hpp:164
C++ utilities for Unified Runtime integration.