DPC++ Runtime
Runtime libraries for oneAPI DPC++
jit_compiler.cpp
Go to the documentation of this file.
1 //==--- jit_compiler.cpp - SYCL runtime JIT compiler for kernel fusion -----==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include <sycl/feature_test.hpp>
9 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
10 #include <KernelFusion.h>
12 #include <detail/jit_compiler.hpp>
14 #include <detail/kernel_impl.hpp>
15 #include <detail/queue_impl.hpp>
17 #include <sycl/detail/pi.hpp>
19 #include <sycl/kernel_bundle.hpp>
20 
21 namespace sycl {
23 namespace detail {
24 
25 jit_compiler::jit_compiler() : MJITContext{new ::jit_compiler::JITContext{}} {}
26 
27 jit_compiler::~jit_compiler() = default;
28 
29 static ::jit_compiler::BinaryFormat
30 translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
31  switch (Type) {
33  return ::jit_compiler::BinaryFormat::SPIRV;
35  return ::jit_compiler::BinaryFormat::LLVM;
36  default:
37  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
38  "Format unsupported for JIT compiler");
39  }
40 }
41 
42 static ::jit_compiler::ParameterKind
43 translateArgType(kernel_param_kind_t Kind) {
44  using PK = ::jit_compiler::ParameterKind;
45  using kind = kernel_param_kind_t;
46  switch (Kind) {
47  case kind::kind_accessor:
48  return PK::Accessor;
49  case kind::kind_std_layout:
50  return PK::StdLayout;
51  case kind::kind_sampler:
52  return PK::Sampler;
53  case kind::kind_pointer:
54  return PK::Pointer;
55  case kind::kind_specialization_constants_buffer:
56  return PK::SpecConstBuffer;
57  case kind::kind_stream:
58  return PK::Stream;
59  case kind::kind_invalid:
60  return PK::Invalid;
61  }
62  return PK::Invalid;
63 }
64 
65 enum class Promotion { None, Private, Local };
66 
67 struct PromotionInformation {
68  Promotion PromotionTarget;
69  unsigned KernelIndex;
70  unsigned ArgIndex;
71  Requirement *Definition;
72  NDRDescT NDRange;
73  size_t LocalSize;
74  std::vector<bool> UsedParams;
75 };
76 
77 using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
78 
79 static inline void printPerformanceWarning(const std::string &Message) {
81  std::cerr << "WARNING: " << Message << "\n";
82  }
83 }
84 
85 template <typename Obj> Promotion getPromotionTarget(const Obj &obj) {
86  auto Result = Promotion::None;
87  if (obj.template has_property<
88  ext::codeplay::experimental::property::promote_private>()) {
89  Result = Promotion::Private;
90  }
91  if (obj.template has_property<
92  ext::codeplay::experimental::property::promote_local>()) {
93  if (Result != Promotion::None) {
94  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
95  "Two contradicting promotion properties on the "
96  "same buffer/accessor are not allowed.");
97  }
98  Result = Promotion::Local;
99  }
100  return Result;
101 }
102 
103 static Promotion getInternalizationInfo(Requirement *Req) {
104  auto AccPromotion = getPromotionTarget(Req->MPropertyList);
105 
106  auto *MemObj = static_cast<sycl::detail::SYCLMemObjT *>(Req->MSYCLMemObj);
107  if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
108  // We currently do not support promotion on non-buffer memory objects (e.g.,
109  // images).
110  return Promotion::None;
111  }
112  Promotion BuffPromotion = getPromotionTarget(*MemObj);
113  if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
114  AccPromotion != BuffPromotion) {
115  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
116  "Contradicting promotion properties on accessor and "
117  "underlying buffer are not allowed");
118  }
119  return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
120 }
121 
122 static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
123  Promotion Target) {
124  auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
125  if (Target == Promotion::Private) {
126  auto NumWorkItems = NDRange.GlobalSize.size();
127  // For private internalization, the local size is
128  // (Number of elements in buffer)/(number of work-items)
129  return NumElementsMem / NumWorkItems;
130  } else if (Target == Promotion::Local) {
131  if (NDRange.LocalSize.size() == 0) {
132  // No work-group size provided, cannot calculate the local size
133  // and need to bail out.
134  return {};
135  }
136  auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
137  // For local internalization, the local size is
138  // (Number of elements in buffer)/(number of work-groups)
139  return NumElementsMem / NumWorkGroups;
140  }
141  return 0;
142 }
143 
144 static bool accessorEquals(Requirement *Req, Requirement *Other) {
145  return Req->MOffset == Other->MOffset &&
146  Req->MAccessRange == Other->MAccessRange &&
147  Req->MMemoryRange == Other->MMemoryRange &&
148  Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
149  Req->MElemSize == Other->MElemSize &&
150  Req->MOffsetInBytes == Other->MOffsetInBytes &&
151  Req->MIsSubBuffer == Other->MIsSubBuffer;
152 }
153 
154 static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
155  unsigned ArgFunctionIndex, NDRDescT NDRange,
156  PromotionMap &Promotions) {
157  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
158 
159  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
160 
161  auto ThisPromotionTarget = getInternalizationInfo(Req);
162  auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
163 
164  if (Promotions.count(Req->MSYCLMemObj)) {
165  // We previously encountered an accessor for the same buffer.
166  auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj);
167 
168  switch (ThisPromotionTarget) {
169  case Promotion::None: {
170  if (PreviousDefinition.PromotionTarget != Promotion::None) {
171  printPerformanceWarning(
172  "Deactivating previously specified promotion, because this "
173  "accessor does not specify promotion");
174  PreviousDefinition.PromotionTarget = Promotion::None;
175  }
176  return;
177  }
178  case Promotion::Local: {
179  if (PreviousDefinition.PromotionTarget == Promotion::None) {
180  printPerformanceWarning(
181  "Not performing specified local promotion, due to previous "
182  "mismatch or because previous accessor specified no promotion");
183  return;
184  }
185  if (!ThisLocalSize.has_value()) {
186  printPerformanceWarning("Work-group size for local promotion not "
187  "specified, not performing internalization");
188  PreviousDefinition.PromotionTarget = Promotion::None;
189  return;
190  }
191  if (PreviousDefinition.PromotionTarget == Promotion::Private) {
192  printPerformanceWarning(
193  "Overriding previous private promotion with local promotion");
194  // Recompute the local size for the previous definition with adapted
195  // promotion target.
196  auto NewPrevLocalSize =
197  getLocalSize(PreviousDefinition.NDRange,
198  PreviousDefinition.Definition, Promotion::Local);
199 
200  if (!NewPrevLocalSize.has_value()) {
201  printPerformanceWarning(
202  "Not performing specified local promotion because previous "
203  "kernels did not specify a local size");
204  PreviousDefinition.PromotionTarget = Promotion::None;
205  return;
206  }
207 
208  PreviousDefinition.LocalSize = NewPrevLocalSize.value();
209  PreviousDefinition.PromotionTarget = Promotion::Local;
210  }
211  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
212  printPerformanceWarning("Not performing specified local promotion due "
213  "to work-group size mismatch");
214  PreviousDefinition.PromotionTarget = Promotion::None;
215  return;
216  }
217  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
218  printPerformanceWarning("Not performing specified promotion, due to "
219  "accessor parameter mismatch");
220  PreviousDefinition.PromotionTarget = Promotion::None;
221  return;
222  }
223  return;
224  }
225  case Promotion::Private: {
226  if (PreviousDefinition.PromotionTarget == Promotion::None) {
227  printPerformanceWarning(
228  "Not performing specified private promotion, due to previous "
229  "mismatch or because previous accessor specified no promotion");
230  return;
231  }
232 
233  if (PreviousDefinition.PromotionTarget == Promotion::Local) {
234  // Recompute the local size with adapted promotion target.
235  auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
236  if (!ThisLocalSize.has_value()) {
237  printPerformanceWarning("Work-group size for local promotion not "
238  "specified, not performing internalization");
239  PreviousDefinition.PromotionTarget = Promotion::None;
240  return;
241  }
242 
243  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
244  printPerformanceWarning(
245  "Not performing specified local promotion due "
246  "to work-group size mismatch");
247  PreviousDefinition.PromotionTarget = Promotion::None;
248  return;
249  }
250 
251  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
252  printPerformanceWarning("Not performing local promotion, due to "
253  "accessor parameter mismatch");
254  PreviousDefinition.PromotionTarget = Promotion::None;
255  return;
256  }
257 
258  printPerformanceWarning(
259  "Performing local internalization instead, because previous "
260  "accessor specified local promotion");
261  return;
262  }
263 
264  // Previous accessors also specified private promotion.
265  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
266  printPerformanceWarning(
267  "Not performing specified private promotion due "
268  "to work-group size mismatch");
269  PreviousDefinition.PromotionTarget = Promotion::None;
270  return;
271  }
272  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
273  printPerformanceWarning("Not performing specified promotion, due to "
274  "accessor parameter mismatch");
275  PreviousDefinition.PromotionTarget = Promotion::None;
276  return;
277  }
278  return;
279  }
280  }
281  } else {
282  if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
283  printPerformanceWarning("Work-group size for local promotion not "
284  "specified, not performing internalization");
285  ThisPromotionTarget = Promotion::None;
286  ThisLocalSize = 0;
287  }
288  assert(ThisLocalSize.has_value());
289  Promotions.emplace(Req->MSYCLMemObj,
290  PromotionInformation{ThisPromotionTarget, KernelIndex,
291  ArgFunctionIndex, Req, NDRange,
292  ThisLocalSize.value(),
293  std::vector<bool>()});
294  }
295 }
296 
297 // Identify a parameter by the argument description, the kernel index and the
298 // parameter index in that kernel.
299 struct Param {
300  ArgDesc Arg;
301  unsigned KernelIndex;
302  unsigned ArgIndex;
303  bool Used;
304  Param(ArgDesc Argument, unsigned KernelIdx, unsigned ArgIdx, bool InUse)
305  : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
306 };
307 
308 using ParamList = std::vector<Param>;
309 
310 using ParamIterator = std::vector<Param>::iterator;
311 
312 std::vector<Param>::const_iterator
313 detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
314  for (auto I = Params.begin(); I < Params.end(); ++I) {
315  // Two arguments of different type can never be identical.
316  if (I->Arg.MType == Arg.MType) {
317  if (Arg.MType == kernel_param_kind_t::kind_pointer ||
319  // Compare size and, if the size is identical, the content byte-by-byte.
320  if ((Arg.MSize == I->Arg.MSize) &&
321  std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
322  return I;
323  }
324  } else if (Arg.MType == kernel_param_kind_t::kind_accessor) {
325  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
326  Requirement *Other = static_cast<Requirement *>(I->Arg.MPtr);
327  if (accessorEquals(Req, Other)) {
328  return I;
329  }
330  }
331  }
332  }
333  return Params.end();
334 }
335 
336 template <typename T, typename F = typename std::remove_const_t<
337  typename std::remove_reference_t<T>>>
338 F *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
339  ArgStorage.emplace_back(sizeof(T));
340  auto Storage = reinterpret_cast<F *>(ArgStorage.back().data());
341  *Storage = Arg;
342  return Storage;
343 }
344 
345 void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage, void *ArgPtr,
346  size_t ArgSize) {
347  ArgStorage.emplace_back(ArgSize);
348  void *Storage = ArgStorage.back().data();
349  std::memcpy(Storage, ArgPtr, ArgSize);
350  return Storage;
351 }
352 
353 static ParamIterator preProcessArguments(
354  std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
355  PromotionMap &PromotedAccs,
356  std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
357  std::vector<::jit_compiler::JITConstant> &JITConstants,
358  ParamList &NonIdenticalParams,
359  ::jit_compiler::ParamIdentList &ParamIdentities) {
360 
361  // Unused arguments are still in the list at this point (because we
362  // need them for accessor handling), but there's not pre-processing
363  // that needs to be done.
364  if (!Arg->Used) {
365  return ++Arg;
366  }
367 
368  if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
369  // Pointer arguments are only stored in the kernel functor object, which
370  // will go out-of-scope before we execute the fused kernel. Therefore, we
371  // need to copy the pointer (not the memory it's pointing to) to a permanent
372  // location and update the argument.
373  Arg->Arg.MPtr =
374  storePlainArg(ArgStorage, *static_cast<void **>(Arg->Arg.MPtr));
375  }
376  if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
377  // Standard layout arguments are only stored in the kernel functor object,
378  // which will go out-of-scope before we execute the fused kernel. Therefore,
379  // we need to copy the argument to a permant location and update the
380  // argument.
381  Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
382  // Standard layout arguments do not participate in identical argument
383  // detection, but we still add it to the list here. As the SYCL runtime can
384  // only check the raw bytes for identical content, but is unaware of the
385  // underlying datatype, some identities that would be detected here could
386  // not be materialized by the JIT compiler. Instead of removing some
387  // standard layout arguments due to identity and missing some in case the
388  // materialization is not possible, we rely on constant propagation to
389  // replace standard layout arguments by constants (see below).
390  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
391  true);
392  // Propagate values of scalar parameters as constants to the JIT
393  // compiler.
394  JITConstants.emplace_back(
395  ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
396  Arg->Arg.MPtr, Arg->Arg.MSize);
397  return ++Arg;
398  }
399  // First check if there's already another parameter with identical
400  // value.
401  auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
402  if (Identical != NonIdenticalParams.end()) {
403  ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
404  ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
405  Identical->ArgIndex};
406  ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
407  ParamIdentities.push_back(Identity);
408  return ++Arg;
409  }
410 
411  if (Arg->Arg.MType == kernel_param_kind_t::kind_accessor) {
412  // Get local and private promotion information from accessors.
413  Requirement *Req = static_cast<Requirement *>(Arg->Arg.MPtr);
414  auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
415  auto PromotionTarget = Internalization.PromotionTarget;
416  if (PromotionTarget == Promotion::Private ||
417  PromotionTarget == Promotion::Local) {
418  // The accessor should be promoted.
419  if (Internalization.KernelIndex == Arg->KernelIndex &&
420  Internalization.ArgIndex == Arg->ArgIndex) {
421  // This is the first accessor for this buffer that should be
422  // internalized.
423  InternalizeParams.emplace_back(
424  ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
425  (PromotionTarget == Promotion::Private)
426  ? ::jit_compiler::Internalization::Private
427  : ::jit_compiler::Internalization::Local,
428  Internalization.LocalSize);
429  // If an accessor will be promoted, i.e., if it has the promotion
430  // property attached to it, the next three arguments, that are
431  // associated with the accessor (access range, memory range, offset),
432  // must not participate in identical parameter detection or constant
433  // propagation, because their values will change if promotion happens.
434  // Therefore, we can just skip them here, but we need to remember which
435  // of them are used.
436  for (unsigned I = 0; I < 4; ++I) {
437  Internalization.UsedParams.push_back(Arg->Used);
438  ++Arg;
439  }
440  } else {
441  // We have previously encountered an accessor the same buffer, which
442  // should be internalized. We can add parameter identities for the
443  // accessor argument and the next three arguments (range, memory range
444  // and offset, if they are used).
445  unsigned Increment = 0;
446  for (unsigned I = 0; I < 4; ++I) {
447  // If the argument is used in both cases, i.e., on the original
448  // accessor to be internalized, and this one, we can insert a
449  // parameter identity.
450  if (Arg->Used && Internalization.UsedParams[I]) {
451  ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
452  Arg->ArgIndex};
453  ::jit_compiler::Parameter IdenticalParam{
454  Internalization.KernelIndex,
455  Internalization.ArgIndex + Increment};
456  ::jit_compiler::ParameterIdentity Identity{ThisParam,
457  IdenticalParam};
458  ParamIdentities.push_back(Identity);
459  }
460  if (Internalization.UsedParams[I]) {
461  ++Increment;
462  }
463  ++Arg;
464  }
465  }
466  return Arg;
467  } else {
468  // The accessor will not be promoted, so it can participate in identical
469  // parameter detection.
470  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
471  true);
472  return ++Arg;
473  }
474  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
475  // No identical parameter exists, so add this to the list.
476  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
477  true);
478  return ++Arg;
479  }
480  return ++Arg;
481 }
482 
483 static void
484 updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
485  NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
486  std::vector<std::vector<char>> &FusedArgStorage) {
487  auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
488  assert(ArgUsageInfo.size() == FusedArgs.size());
489  for (size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
490  bool PromotedToPrivate =
491  (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
492  bool PromotedToLocal =
493  (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
494  if (PromotedToLocal || PromotedToPrivate) {
495  // For each internalized accessor, we need to override four arguments
496  // (see 'addArgsForGlobalAccessor' in handler.cpp for reference), i.e.,
497  // the pointer itself, plus twice the range and the offset.
498  auto &OldArgDesc = FusedArgs[ArgIndex];
499  assert(OldArgDesc.MType == kernel_param_kind_t::kind_accessor);
500  auto *Req = static_cast<Requirement *>(OldArgDesc.MPtr);
501 
502  // The stored args are all three-dimensional, but depending on the
503  // actual number of dimensions of the accessor, only a part of that
504  // argument is later on passed to the kernel.
505  const size_t SizeAccField =
506  sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
507  // Compute the local size and use it for the range parameters.
508  auto LocalSize = getLocalSize(NDRange, Req,
509  (PromotedToPrivate) ? Promotion::Private
510  : Promotion::Local);
511  range<3> AccessRange{1, 1, LocalSize.value()};
512  auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
513  // Use all-zero as the offset
514  id<3> AcessOffset{0, 0, 0};
515  auto *OffsetArg = storePlainArg(FusedArgStorage, AcessOffset);
516 
517  // Override the arguments.
518  // 1. Override the pointer with a std-layout argument with 'nullptr' as
519  // value. handler.cpp does the same for local accessors.
520  int SizeInBytes = Req->MElemSize * LocalSize.value();
521  FusedArgs[ArgIndex] =
522  ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
523  static_cast<int>(ArgIndex)};
524  ++ArgIndex;
525  // 2. Access Range
526  FusedArgs[ArgIndex] =
527  ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
528  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
529  ++ArgIndex;
530  // 3. Memory Range
531  FusedArgs[ArgIndex] =
532  ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
533  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
534  ++ArgIndex;
535  // 4. Offset
536  FusedArgs[ArgIndex] =
537  ArgDesc{kernel_param_kind_t::kind_std_layout, OffsetArg,
538  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
539  ++ArgIndex;
540  } else {
541  ++ArgIndex;
542  }
543  }
544 }
545 
546 std::unique_ptr<detail::CG>
548  std::vector<ExecCGCommand *> &InputKernels,
549  const property_list &PropList) {
550  // Retrieve the device binary from each of the input
551  // kernels to hand them over to the JIT compiler.
552  std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
553  std::vector<std::string> InputKernelNames;
554  // Collect argument information from all input kernels.
555  std::vector<std::vector<char>> ArgsStorage;
556  std::vector<detail::AccessorImplPtr> AccStorage;
557  std::vector<Requirement *> Requirements;
558  std::vector<detail::EventImplPtr> Events;
559  std::vector<::jit_compiler::NDRange> Ranges;
560  RT::PiKernelCacheConfig KernelCacheConfig =
562  unsigned KernelIndex = 0;
563  ParamList FusedParams;
564  PromotionMap PromotedAccs;
565  // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary
566  // resources (which contain reductions) and figure out how to fuse them.
567  for (auto &RawCmd : InputKernels) {
568  auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
569  auto &CG = KernelCmd->getCG();
570  assert(CG.getType() == CG::Kernel);
571  auto *KernelCG = static_cast<CGExecKernel *>(&CG);
572 
573  auto KernelName = KernelCG->MKernelName;
574  if (KernelName.empty()) {
575  printPerformanceWarning(
576  "Cannot fuse kernel with invalid kernel function name");
577  return nullptr;
578  }
579  const RTDeviceBinaryImage *DeviceImage = nullptr;
580  RT::PiProgram Program = nullptr;
581  if (KernelCG->getKernelBundle() != nullptr) {
582  // Retrieve the device image from the kernel bundle.
583  auto KernelBundle = KernelCG->getKernelBundle();
584  kernel_id KernelID =
586 
587  auto SyclKernel = detail::getSyclObjImpl(
588  KernelBundle->get_kernel(KernelID, KernelBundle));
589 
590  DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
591  Program = SyclKernel->getDeviceImage()->get_program_ref();
592  } else if (KernelCG->MSyclKernel != nullptr) {
593  DeviceImage =
594  KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
595  Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
596  } else {
597  auto ContextImpl = Queue->getContextImplPtr();
598  auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
599  auto DeviceImpl = Queue->getDeviceImplPtr();
600  auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
602  KernelCG->MOSModuleHandle, KernelName, Context, Device);
604  *DeviceImage, Context, Device);
605  }
606  if (!DeviceImage || !Program) {
607  printPerformanceWarning("No suitable IR available for fusion");
608  return nullptr;
609  }
610  ProgramManager::KernelArgMask EliminatedArgs;
611  if (Program && (KernelCG->MSyclKernel == nullptr ||
612  !KernelCG->MSyclKernel->isCreatedFromSource())) {
613  EliminatedArgs =
615  KernelCG->MOSModuleHandle, Program, KernelName);
616  }
617 
618  // Collect information about the arguments of this kernel.
619 
620  // Might need to sort the arguments in case they are not already sorted,
621  // see also the similar code in commands.cpp.
622  auto Args = KernelCG->MArgs;
623  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
624  return A.MIndex < B.MIndex;
625  });
626 
627  ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor;
628  size_t ArgIndex = 0;
629  // The kernel function in SPIR-V will only have the non-eliminated
630  // arguments, so keep track of this "actual" argument index.
631  unsigned ArgFunctionIndex = 0;
632  for (auto &Arg : Args) {
633  ArgDescriptor.Kinds.push_back(translateArgType(Arg.MType));
634  // DPC++ internally uses 'true' to indicate that an argument has been
635  // eliminated, while the JIT compiler uses 'true' to indicate an
636  // argument is used. Translate this here.
637  bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++];
638  ArgDescriptor.UsageMask.emplace_back(!Eliminated);
639 
640  // If the argument has not been eliminated, i.e., is still present on
641  // the kernel function in LLVM-IR/SPIR-V, collect information about the
642  // argument for performance optimizations in the JIT compiler.
643  if (!Eliminated) {
644  if (Arg.MType == kernel_param_kind_t::kind_accessor) {
645  resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
646  KernelCG->MNDRDesc, PromotedAccs);
647  }
648  FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
649  ++ArgFunctionIndex;
650  } else {
651  FusedParams.emplace_back(Arg, KernelIndex, 0, false);
652  }
653  }
654 
655  // TODO(Lukas, ONNX-399): Check for the correct kernel bundle state of the
656  // device image?
657  auto &RawDeviceImage = DeviceImage->getRawData();
658  auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
659  RawDeviceImage.BinaryStart);
660  // Set 0 as the number of address bits, because the JIT compiler can set
661  // this field based on information from SPIR-V/LLVM module's data-layout.
662  auto BinaryImageFormat =
663  translateBinaryImageFormat(DeviceImage->getFormat());
664  if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
665  printPerformanceWarning("No suitable IR available for fusion");
666  return nullptr;
667  }
668  ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
669  translateBinaryImageFormat(DeviceImage->getFormat()), 0,
670  RawDeviceImage.BinaryStart, DeviceImageSize};
671 
672  constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices {
673  return {Val.get(0), Val.get(1), Val.get(2)};
674  };
675 
676  auto &CurrentNDR = KernelCG->MNDRDesc;
677  const ::jit_compiler::NDRange JITCompilerNDR{
678  static_cast<int>(CurrentNDR.Dims),
679  SYCLTypeToIndices(CurrentNDR.GlobalSize),
680  SYCLTypeToIndices(CurrentNDR.LocalSize),
681  SYCLTypeToIndices(CurrentNDR.GlobalOffset)};
682 
683  Ranges.push_back(JITCompilerNDR);
684  InputKernelInfo.emplace_back(KernelName, ArgDescriptor, JITCompilerNDR,
685  BinInfo);
686  InputKernelNames.push_back(KernelName);
687 
688  // Collect information for the fused kernel
689 
690  if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
691  // Some overloads of parallel_for_work_group only specify the number of
692  // work-groups, so this can be used to identify hierarchical parallel
693  // kernels, which are not supported by fusion for now.
694  printPerformanceWarning(
695  "Cannot fuse kernel with hierarchical parallelism");
696  return nullptr;
697  // Not all overloads of parallel_for_work_group only specify the number of
698  // work-groups, so the above mechanism might not detect all hierarchical
699  // parallelism.
700  // TODO(Lukas, CRD-6): Find a more reliable way to detect hierarchical
701  // parallelism.
702  }
703 
704  // We need to copy the storages here. The input CGs might be eliminated
705  // before the fused kernel gets executed, so we need to copy the storages
706  // here to make sure the arguments don't die on us before executing the
707  // fused kernel.
708  ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
709  KernelCG->getArgsStorage().end());
710  AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
711  KernelCG->getAccStorage().end());
712  // TODO(Lukas, ONNX-399): Does the MSharedPtrStorage contain any
713  // information about actual shared pointers beside the kernel bundle and
714  // handler impl? If yes, we might need to copy it here.
715  Requirements.insert(Requirements.end(), KernelCG->MRequirements.begin(),
716  KernelCG->MRequirements.end());
717  Events.insert(Events.end(), KernelCG->MEvents.begin(),
718  KernelCG->MEvents.end());
719 
720  // If all kernels have the same cache config then use it for the merged
721  // kernel, otherwise use default configuration.
722  if (KernelIndex == 0) {
723  KernelCacheConfig = KernelCG->MKernelCacheConfig;
724  } else if (KernelCG->MKernelCacheConfig != KernelCacheConfig) {
725  KernelCacheConfig = PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT;
726  }
727 
728  ++KernelIndex;
729  }
730 
731  // Pre-process the arguments, to detect identical parameters or arguments that
732  // can be constant-propagated by the JIT compiler.
733  std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
734  std::vector<::jit_compiler::JITConstant> JITConstants;
735  ::jit_compiler::ParamIdentList ParamIdentities;
736  ParamList NonIdenticalParameters;
737  for (auto PI = FusedParams.begin(); PI != FusedParams.end();) {
738  PI = preProcessArguments(ArgsStorage, PI, PromotedAccs, InternalizeParams,
739  JITConstants, NonIdenticalParameters,
740  ParamIdentities);
741  }
742 
743  // Retrieve barrier flags.
744  int BarrierFlags =
745  (PropList
746  .has_property<ext::codeplay::experimental::property::no_barriers>())
747  ? -1
748  : 3;
749 
750  static size_t FusedKernelNameIndex = 0;
751  std::stringstream FusedKernelName;
752  FusedKernelName << "fused_" << FusedKernelNameIndex++;
753  ::jit_compiler::Config JITConfig;
754  bool DebugEnabled =
756  JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
757  JITConfig.set<::jit_compiler::option::JITEnableCaching>(
759 
760  auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
761  *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
762  FusedKernelName.str(), ParamIdentities, BarrierFlags, InternalizeParams,
763  JITConstants);
764 
765  if (FusionResult.failed()) {
766  if (DebugEnabled) {
767  std::cerr
768  << "ERROR: JIT compilation for kernel fusion failed with message:\n"
769  << FusionResult.getErrorMessage() << "\n";
770  }
771  return nullptr;
772  }
773 
774  auto &FusedKernelInfo = FusionResult.getKernelInfo();
775 
776  std::vector<ArgDesc> FusedArgs;
777  int FusedArgIndex = 0;
778  for (auto &Param : FusedParams) {
779  // Add to the argument list of the fused kernel, but with the correct
780  // new index in the fused kernel.
781  auto &Arg = Param.Arg;
782  FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
783  }
784 
785  // Update the kernel arguments for internalized accessors.
786  const auto NDRDesc = [](const auto &ND) -> NDRDescT {
787  constexpr auto ToSYCLType = [](const auto &Indices) -> sycl::range<3> {
788  return {Indices[0], Indices[1], Indices[2]};
789  };
790  NDRDescT NDRDesc;
791  NDRDesc.Dims = ND.getDimensions();
792  NDRDesc.GlobalSize = ToSYCLType(ND.getGlobalSize());
793  NDRDesc.LocalSize = ToSYCLType(ND.getLocalSize());
794  NDRDesc.GlobalOffset = ToSYCLType(ND.getOffset());
795  return NDRDesc;
796  }(FusedKernelInfo.NDR);
797  updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
798 
799  if (!FusionResult.cached()) {
800  auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
802  } else if (DebugEnabled) {
803  std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
804  }
805 
806  // Create a kernel bundle for the fused kernel.
807  // Kernel bundles are stored in the CG as one of the "extended" members.
809  FusedKernelInfo.Name);
810  std::vector<std::shared_ptr<const void>> RawExtendedMembers;
811 
812  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
813  detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
814  Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
815 
816  std::unique_ptr<detail::CG> FusedCG;
817  FusedCG.reset(new detail::CGExecKernel(
818  NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
819  std::move(ArgsStorage), std::move(AccStorage),
820  std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
821  std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {},
822  {}, CG::CGTYPE::Kernel, KernelCacheConfig));
823  return FusedCG;
824 }
825 
826 pi_device_binaries jit_compiler::createPIDeviceBinary(
827  const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
828 
829  DeviceBinaryContainer Binary;
830 
831  // Create an offload entry for the fused kernel.
832  // It seems to be OK to set zero for most of the information here, at least
833  // that is the case for compiled SPIR-V binaries.
834  OffloadEntryContainer Entry{FusedKernelInfo.Name, nullptr, 0, 0, 0};
835  Binary.addOffloadEntry(std::move(Entry));
836 
837  // Create a property entry for the argument usage mask for the fused kernel.
838  auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
839  PropertyContainer ArgMaskProp{FusedKernelInfo.Name, ArgMask.data(),
840  ArgMask.size(),
842 
843  // Create a property set for the argument usage masks of all kernels
844  // (currently only one).
845  PropertySetContainer ArgMaskPropSet{
847 
848  ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
849 
850  Binary.addProperty(std::move(ArgMaskPropSet));
851 
852  DeviceBinariesCollection Collection;
853  Collection.addDeviceBinary(std::move(Binary),
854  FusedKernelInfo.BinaryInfo.BinaryStart,
855  FusedKernelInfo.BinaryInfo.BinarySize,
856  FusedKernelInfo.BinaryInfo.AddressBits);
857 
858  JITDeviceBinaries.push_back(std::move(Collection));
859  return JITDeviceBinaries.back().getPIDeviceStruct();
860 }
861 
862 std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
864  // This must match the decoding logic in program_manager.cpp.
865  constexpr uint64_t NBytesForSize = 8;
866  constexpr uint64_t NBitsInElement = 8;
867  uint64_t Size = static_cast<uint64_t>(Mask.size());
868  // Round the size to the next multiple of 8
869  uint64_t RoundedSize =
870  ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
871  std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
872  0u);
873  // First encode the size of the actual mask
874  for (size_t i = 0; i < NBytesForSize; ++i) {
875  uint8_t Byte =
876  static_cast<uint8_t>((RoundedSize >> i * NBitsInElement) & 0xFF);
877  Encoded[i] = Byte;
878  }
879  // Encode the actual mask bit-wise
880  for (size_t i = 0; i < Size; ++i) {
881  // DPC++ internally uses 'true' to indicate that an argument has been
882  // eliminated, while the JIT compiler uses 'true' to indicate an argument
883  // is used. Translate this here.
884  if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
885  uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
886  Byte |= static_cast<uint8_t>((1 << (i % NBitsInElement)));
887  }
888  }
889  return Encoded;
890 }
891 
892 } // namespace detail
893 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
894 } // namespace sycl
895 
896 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
sycl::_V1::detail::pi::PiDeviceBinaryType
::pi_device_binary_type PiDeviceBinaryType
Definition: pi.hpp:127
sycl::_V1::detail::kernel_param_kind_t::kind_std_layout
@ kind_std_layout
PI_DEVICE_BINARY_TYPE_SPIRV
static constexpr pi_device_binary_type PI_DEVICE_BINARY_TYPE_SPIRV
Definition: pi.h:767
sycl::_V1::make_error_code
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:92
sycl::_V1::ext::oneapi::experimental::obj
global_pointer_t obj
Definition: annotated_arg.hpp:76
PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE
static constexpr pi_device_binary_type PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE
Definition: pi.h:769
sycl::_V1::detail::kernel_param_kind_t
kernel_param_kind_t
Definition: kernel_desc.hpp:37
sycl::_V1::detail::remove_const_t
typename std::remove_const< T >::type remove_const_t
Definition: stl_type_traits.hpp:30
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::detail::jit_compiler::fuseKernels
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
sycl::_V1::detail::ProgramManager::getDeviceImage
RTDeviceBinaryImage & getDeviceImage(OSModuleHandle M, const std::string &KernelName, const context &Context, const device &Device, bool JITCompilationIsRequired=false)
Definition: program_manager.cpp:109
sycl::_V1::detail::pi::PiKernelCacheConfig
::pi_kernel_cache_config PiKernelCacheConfig
Definition: pi.hpp:148
sycl::_V1::detail::memcpy
void memcpy(void *Dst, const void *Src, size_t Size)
Definition: memcpy.hpp:16
sycl::_V1::detail::kernel_param_kind_t::kind_accessor
@ kind_accessor
__SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO
#define __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO
PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO defined in PropertySetIO.h.
Definition: pi.h:816
pi_device_binaries_struct
This struct is a record of all the device code that may be offloaded.
Definition: pi.h:939
sycl
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::detail::KernelBundleImplPtr
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
Definition: kernel_bundle.hpp:138
sycl::_V1::detail::CG::Kernel
@ Kernel
Definition: cg.hpp:57
queue_impl.hpp
pi.hpp
sycl::_V1::detail::ProgramManager::addImages
void addImages(pi_device_binaries DeviceImages)
Definition: program_manager.cpp:1177
sycl::_V1::detail::Requirement
AccessorImplHost Requirement
Definition: accessor_impl.hpp:150
sycl::_V1::detail::ProgramManager::getInstance
static ProgramManager & getInstance()
Definition: program_manager.cpp:65
jit_compiler::ArgUsageMask
std::vector< unsigned char > ArgUsageMask
Definition: jit_compiler.hpp:18
sycl::_V1::detail::ProgramManager::getSYCLKernelID
kernel_id getSYCLKernelID(const std::string &KernelName)
Definition: program_manager.cpp:1544
sycl::_V1::ext::oneapi::experimental::has_property
static constexpr bool has_property()
Definition: annotated_arg.hpp:162
kernel_bundle.hpp
sycl::_V1::detail::ProgramManager::createPIProgram
RT::PiProgram createPIProgram(const RTDeviceBinaryImage &Img, const context &Context, const device &Device)
Definition: program_manager.cpp:277
PI_PROPERTY_TYPE_BYTE_ARRAY
@ PI_PROPERTY_TYPE_BYTE_ARRAY
Definition: pi.h:733
sycl::_V1::detail::pi::PiProgram
::pi_program PiProgram
Definition: pi.hpp:130
std::cerr
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT
Definition: pi.h:635
sycl::_V1::detail::kernel_param_kind_t::kind_pointer
@ kind_pointer
fusion_properties.hpp
device_image_impl.hpp
sycl::_V1::detail::ProgramManager::KernelArgMask
std::vector< bool > KernelArgMask
Definition: program_manager.hpp:84
sycl::_V1::detail::ProgramManager::getEliminatedKernelArgMask
KernelArgMask getEliminatedKernelArgMask(OSModuleHandle M, pi::PiProgram NativePrg, const std::string &KernelName)
Returns the mask for eliminated kernel arguments for the requested kernel within the native program.
Definition: program_manager.cpp:1471
sycl::_V1::detail::QueueImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: event_impl.hpp:32
kernel_bundle_impl.hpp
sycl::_V1::detail::WorkSizeGuarantees::None
@ None
kernel_impl.hpp
usm_settings::Device
@ Device
Definition: usm_allocator_config.hpp:18
sycl::_V1::detail::OSUtil::DummyModuleHandle
static constexpr OSModuleHandle DummyModuleHandle
Dummy module handle to designate non-existing module for a device binary image loaded from file e....
Definition: os_util.hpp:68
jit_compiler.hpp
sycl::_V1::detail::SYCLConfig::get
static const char * get()
Definition: config.hpp:114
sycl_mem_obj_t.hpp
sycl::_V1::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: common.hpp:300