DPC++ Runtime
Runtime libraries for oneAPI DPC++
jit_compiler.cpp
1 //==--- jit_compiler.cpp - SYCL runtime JIT compiler for kernel fusion -----==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include <sycl/feature_test.hpp>
9 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
10 #include <KernelFusion.h>
12 #include <detail/jit_compiler.hpp>
14 #include <detail/kernel_impl.hpp>
15 #include <detail/queue_impl.hpp>
17 #include <sycl/detail/pi.hpp>
19 #include <sycl/kernel_bundle.hpp>
20 
21 namespace sycl {
22 inline namespace _V1 {
23 namespace detail {
24 
25 static ::jit_compiler::BinaryFormat
26 translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
27  switch (Type) {
28  case PI_DEVICE_BINARY_TYPE_SPIRV:
29  return ::jit_compiler::BinaryFormat::SPIRV;
30  case PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE:
31  return ::jit_compiler::BinaryFormat::LLVM;
32  default:
33  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
34  "Format unsupported for JIT compiler");
35  }
36 }
37 
38 ::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) {
39  auto Backend = Queue->getDeviceImplPtr()->getBackend();
40  switch (Backend) {
41  case backend::ext_oneapi_level_zero:
42  case backend::opencl:
43  return ::jit_compiler::BinaryFormat::SPIRV;
44  case backend::ext_oneapi_cuda:
45  return ::jit_compiler::BinaryFormat::PTX;
46  case backend::ext_oneapi_hip:
47  return ::jit_compiler::BinaryFormat::AMDGCN;
48  default:
49  throw sycl::exception(
50  sycl::make_error_code(sycl::errc::feature_not_supported),
51  "Backend unsupported by kernel fusion");
52  }
53 }
54 
55 ::jit_compiler::TargetInfo getTargetInfo(QueueImplPtr &Queue) {
56  ::jit_compiler::BinaryFormat Format = getTargetFormat(Queue);
57  return ::jit_compiler::TargetInfo::create(
58  Format, static_cast<::jit_compiler::DeviceArchitecture>(
59  Queue->getDeviceImplPtr()->getDeviceArch()));
60 }
61 
62 std::pair<const RTDeviceBinaryImage *, sycl::detail::pi::PiProgram>
63 retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
64  auto KernelName = KernelCG->getKernelName();
65 
66  bool isNvidia =
67  Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda;
68  bool isHIP =
69  Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip;
70  if (isNvidia || isHIP) {
71  auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
72  std::vector<kernel_id> KernelIds{KernelID};
73  auto DeviceImages =
74  ProgramManager::getInstance().getRawDeviceImages(KernelIds);
75  auto DeviceImage = std::find_if(
76  DeviceImages.begin(), DeviceImages.end(),
77  [isNvidia](RTDeviceBinaryImage *DI) {
78  const std::string &TargetSpec = isNvidia ? std::string("llvm_nvptx64")
79  : std::string("llvm_amdgcn");
80  return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
81  DI->getRawData().DeviceTargetSpec == TargetSpec;
82  });
83  if (DeviceImage == DeviceImages.end()) {
84  return {nullptr, nullptr};
85  }
86  auto ContextImpl = Queue->getContextImplPtr();
87  auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
88  auto DeviceImpl = Queue->getDeviceImplPtr();
89  auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
90  sycl::detail::pi::PiProgram Program =
91  ProgramManager::getInstance().createPIProgram(**DeviceImage,
92  Context, Device);
93  return {*DeviceImage, Program};
94  }
95 
96  const RTDeviceBinaryImage *DeviceImage = nullptr;
97  sycl::detail::pi::PiProgram Program = nullptr;
98  if (KernelCG->getKernelBundle() != nullptr) {
99  // Retrieve the device image from the kernel bundle.
100  auto KernelBundle = KernelCG->getKernelBundle();
101  kernel_id KernelID =
102  ProgramManager::getInstance().getSYCLKernelID(KernelName);
103 
104  auto SyclKernel = detail::getSyclObjImpl(
105  KernelBundle->get_kernel(KernelID, KernelBundle));
106 
107  DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
108  Program = SyclKernel->getDeviceImage()->get_program_ref();
109  } else if (KernelCG->MSyclKernel != nullptr) {
110  DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
111  Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
112  } else {
113  auto ContextImpl = Queue->getContextImplPtr();
114  auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
115  auto DeviceImpl = Queue->getDeviceImplPtr();
116  auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
117  DeviceImage = &ProgramManager::getInstance().getDeviceImage(
118  KernelName, Context, Device);
119  Program = ProgramManager::getInstance().createPIProgram(
120  *DeviceImage, Context, Device);
121  }
122  return {DeviceImage, Program};
123 }
124 
125 static ::jit_compiler::ParameterKind
126 translateArgType(kernel_param_kind_t Kind) {
127  using PK = ::jit_compiler::ParameterKind;
128  using kind = kernel_param_kind_t;
129  switch (Kind) {
130  case kind::kind_accessor:
131  return PK::Accessor;
132  case kind::kind_std_layout:
133  return PK::StdLayout;
134  case kind::kind_sampler:
135  return PK::Sampler;
136  case kind::kind_pointer:
137  return PK::Pointer;
138  case kind::kind_specialization_constants_buffer:
139  return PK::SpecConstBuffer;
140  case kind::kind_stream:
141  return PK::Stream;
142  case kind::kind_invalid:
143  return PK::Invalid;
144  }
145  return PK::Invalid;
146 }
147 
148 enum class Promotion { None, Private, Local };
149 
150 struct PromotionInformation {
151  Promotion PromotionTarget;
152  unsigned KernelIndex;
153  unsigned ArgIndex;
154  Requirement *Definition;
155  NDRDescT NDRange;
156  size_t LocalSize;
157  size_t ElemSize;
158  std::vector<bool> UsedParams;
159 };
160 
161 using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
162 
163 static inline void printPerformanceWarning(const std::string &Message) {
164  if (detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0) {
165  std::cerr << "WARNING: " << Message << "\n";
166  }
167 }
168 
169 template <typename Obj> Promotion getPromotionTarget(const Obj &obj) {
170  auto Result = Promotion::None;
171  if (obj.template has_property<
172  ext::codeplay::experimental::property::promote_private>()) {
173  Result = Promotion::Private;
174  }
175  if (obj.template has_property<
176  ext::codeplay::experimental::property::promote_local>()) {
177  if (Result != Promotion::None) {
178  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
179  "Two contradicting promotion properties on the "
180  "same buffer/accessor are not allowed.");
181  }
182  Result = Promotion::Local;
183  }
184  return Result;
185 }
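// Illustrative sketch (assuming the sycl_ext_codeplay_kernel_fusion property
// API; not part of this file): the promotion targets queried above come from
// experimental properties that user code attaches to a buffer or accessor,
// e.g. roughly
//
//   namespace fusion_prop = sycl::ext::codeplay::experimental::property;
//   sycl::buffer<int> Tmp{TmpRange, {fusion_prop::promote_private{}}};
//   // or, per accessor:
//   sycl::accessor Acc{Tmp, Cgh, sycl::property_list{fusion_prop::promote_local{}}};
//
// Requesting both promote_private and promote_local on the same object is
// rejected by the check above.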
186 
187 static Promotion getInternalizationInfo(Requirement *Req) {
188  auto AccPromotion = getPromotionTarget(Req->MPropertyList);
189 
190  auto *MemObj = static_cast<sycl::detail::SYCLMemObjT *>(Req->MSYCLMemObj);
191  if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
192  // We currently do not support promotion on non-buffer memory objects (e.g.,
193  // images).
194  return Promotion::None;
195  }
196  Promotion BuffPromotion = getPromotionTarget(*MemObj);
197  if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
198  AccPromotion != BuffPromotion) {
199  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
200  "Contradicting promotion properties on accessor and "
201  "underlying buffer are not allowed");
202  }
203  return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
204 }
205 
206 static std::optional<size_t> getLocalSize(NDRDescT NDRange,
207  std::optional<size_t> UserGlobalSize,
208  Requirement *Req, Promotion Target) {
209  assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
210  "Unexpected range rounding");
211  auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
212  if (Target == Promotion::Private) {
213  if (UserGlobalSize.has_value()) {
214  // Only the first dimension is affected by range rounding.
215  NDRange.GlobalSize[0] = *UserGlobalSize;
216  }
217  auto NumWorkItems = NDRange.GlobalSize.size();
218  // For private internalization, the local size is
219  // (Number of elements in buffer)/(number of work-items)
220  return NumElementsMem / NumWorkItems;
221  } else if (Target == Promotion::Local) {
222  if (NDRange.LocalSize.size() == 0) {
223  // No work-group size provided, cannot calculate the local size
224  // and need to bail out.
225  return {};
226  }
227  auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
228  // For local internalization, the local size is
229  // (Number of elements in buffer)/(number of work-groups)
230  return NumElementsMem / NumWorkGroups;
231  }
232  return 0;
233 }
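// Worked example for the formulas above: a buffer of 1024 elements dispatched
// over a global range of 256 work-items with a work-group size of 64 yields
//   private promotion: 1024 / 256 = 4 elements per work-item,
//   local promotion:   1024 / (256 / 64) = 256 elements per work-group.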
234 
235 static bool accessorEquals(Requirement *Req, Requirement *Other) {
236  return Req->MOffset == Other->MOffset &&
237  Req->MAccessRange == Other->MAccessRange &&
238  Req->MMemoryRange == Other->MMemoryRange &&
239  Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
240  Req->MElemSize == Other->MElemSize &&
241  Req->MOffsetInBytes == Other->MOffsetInBytes &&
242  Req->MIsSubBuffer == Other->MIsSubBuffer;
243 }
244 
245 static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
246  unsigned ArgFunctionIndex, NDRDescT NDRange,
247  std::optional<size_t> UserGlobalSize,
248  PromotionMap &Promotions) {
249  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
250 
251  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
252 
253  auto ThisPromotionTarget = getInternalizationInfo(Req);
254  auto ThisLocalSize =
255  getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);
256 
257  if (Promotions.count(Req->MSYCLMemObj)) {
258  // We previously encountered an accessor for the same buffer.
259  auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj);
260 
261  switch (ThisPromotionTarget) {
262  case Promotion::None: {
263  if (PreviousDefinition.PromotionTarget != Promotion::None) {
264  printPerformanceWarning(
265  "Deactivating previously specified promotion, because this "
266  "accessor does not specify promotion");
267  PreviousDefinition.PromotionTarget = Promotion::None;
268  }
269  return;
270  }
271  case Promotion::Local: {
272  if (PreviousDefinition.PromotionTarget == Promotion::None) {
273  printPerformanceWarning(
274  "Not performing specified local promotion, due to previous "
275  "mismatch or because previous accessor specified no promotion");
276  return;
277  }
278  if (!ThisLocalSize.has_value()) {
279  printPerformanceWarning("Work-group size for local promotion not "
280  "specified, not performing internalization");
281  PreviousDefinition.PromotionTarget = Promotion::None;
282  return;
283  }
284  if (PreviousDefinition.PromotionTarget == Promotion::Private) {
285  printPerformanceWarning(
286  "Overriding previous private promotion with local promotion");
287  // Recompute the local size for the previous definition with adapted
288  // promotion target.
289  auto NewPrevLocalSize =
290  getLocalSize(PreviousDefinition.NDRange, std::nullopt,
291  PreviousDefinition.Definition, Promotion::Local);
292 
293  if (!NewPrevLocalSize.has_value()) {
294  printPerformanceWarning(
295  "Not performing specified local promotion because previous "
296  "kernels did not specify a local size");
297  PreviousDefinition.PromotionTarget = Promotion::None;
298  return;
299  }
300 
301  PreviousDefinition.LocalSize = NewPrevLocalSize.value();
302  PreviousDefinition.PromotionTarget = Promotion::Local;
303  }
304  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
305  printPerformanceWarning("Not performing specified local promotion due "
306  "to work-group size mismatch");
307  PreviousDefinition.PromotionTarget = Promotion::None;
308  return;
309  }
310  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
311  printPerformanceWarning("Not performing specified promotion, due to "
312  "accessor parameter mismatch");
313  PreviousDefinition.PromotionTarget = Promotion::None;
314  return;
315  }
316  return;
317  }
318  case Promotion::Private: {
319  if (PreviousDefinition.PromotionTarget == Promotion::None) {
320  printPerformanceWarning(
321  "Not performing specified private promotion, due to previous "
322  "mismatch or because previous accessor specified no promotion");
323  return;
324  }
325 
326  if (PreviousDefinition.PromotionTarget == Promotion::Local) {
327  // Recompute the local size with adapted promotion target.
328  auto ThisLocalSize =
329  getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
330  if (!ThisLocalSize.has_value()) {
331  printPerformanceWarning("Work-group size for local promotion not "
332  "specified, not performing internalization");
333  PreviousDefinition.PromotionTarget = Promotion::None;
334  return;
335  }
336 
337  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
338  printPerformanceWarning(
339  "Not performing specified local promotion due "
340  "to work-group size mismatch");
341  PreviousDefinition.PromotionTarget = Promotion::None;
342  return;
343  }
344 
345  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
346  printPerformanceWarning("Not performing local promotion, due to "
347  "accessor parameter mismatch");
348  PreviousDefinition.PromotionTarget = Promotion::None;
349  return;
350  }
351 
352  printPerformanceWarning(
353  "Performing local internalization instead, because previous "
354  "accessor specified local promotion");
355  return;
356  }
357 
358  // Previous accessors also specified private promotion.
359  if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
360  printPerformanceWarning(
361  "Not performing specified private promotion due "
362  "to work-group size mismatch");
363  PreviousDefinition.PromotionTarget = Promotion::None;
364  return;
365  }
366  if (!accessorEquals(Req, PreviousDefinition.Definition)) {
367  printPerformanceWarning("Not performing specified promotion, due to "
368  "accessor parameter mismatch");
369  PreviousDefinition.PromotionTarget = Promotion::None;
370  return;
371  }
372  return;
373  }
374  }
375  } else {
376  if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
377  printPerformanceWarning("Work-group size for local promotion not "
378  "specified, not performing internalization");
379  ThisPromotionTarget = Promotion::None;
380  ThisLocalSize = 0;
381  }
382  assert(ThisLocalSize.has_value());
383  Promotions.emplace(
384  Req->MSYCLMemObj,
385  PromotionInformation{ThisPromotionTarget, KernelIndex, ArgFunctionIndex,
386  Req, NDRange, ThisLocalSize.value(),
387  Req->MElemSize, std::vector<bool>()});
388  }
389 }
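// Summary of the resolution rules above: the first accessor seen for a buffer
// records the requested promotion; later accessors for the same buffer can
// only keep or weaken it. "None" always wins, local promotion overrides an
// earlier private promotion, and promotion is dropped entirely on a
// work-group-size or accessor-parameter mismatch.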
390 
391 // Identify a parameter by the argument description, the kernel index and the
392 // parameter index in that kernel.
393 struct Param {
394  ArgDesc Arg;
395  unsigned KernelIndex;
396  unsigned ArgIndex;
397  bool Used;
398  Param(ArgDesc Argument, unsigned KernelIdx, unsigned ArgIdx, bool InUse)
399  : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
400 };
401 
402 using ParamList = std::vector<Param>;
403 
404 using ParamIterator = std::vector<Param>::iterator;
405 
406 std::vector<Param>::const_iterator
407 detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
408  for (auto I = Params.begin(); I < Params.end(); ++I) {
409  // Two arguments of different type can never be identical.
410  if (I->Arg.MType == Arg.MType) {
411  if (Arg.MType == kernel_param_kind_t::kind_pointer ||
412  Arg.MType == kernel_param_kind_t::kind_std_layout) {
413  // Compare size and, if the size is identical, the content byte-by-byte.
414  if ((Arg.MSize == I->Arg.MSize) &&
415  std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
416  return I;
417  }
418  } else if (Arg.MType == kernel_param_kind_t::kind_accessor) {
419  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
420  Requirement *Other = static_cast<Requirement *>(I->Arg.MPtr);
421  if (accessorEquals(Req, Other)) {
422  return I;
423  }
424  }
425  }
426  }
427  return Params.end();
428 }
429 
430 template <typename T, typename F = typename std::remove_const_t<
431  typename std::remove_reference_t<T>>>
432 F *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
433  ArgStorage.emplace_back(sizeof(T));
434  auto Storage = reinterpret_cast<F *>(ArgStorage.back().data());
435  *Storage = Arg;
436  return Storage;
437 }
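// Usage sketch: storePlainArg copies a trivially copyable value into the
// argument storage and returns a pointer into that stable storage, e.g.
//   sycl::range<3> AccessRange{1, 1, 64};
//   auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
// as done for the overridden accessor arguments in updatePromotedArgs below.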
438 
439 void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage, void *ArgPtr,
440  size_t ArgSize) {
441  ArgStorage.emplace_back(ArgSize);
442  void *Storage = ArgStorage.back().data();
443  std::memcpy(Storage, ArgPtr, ArgSize);
444  return Storage;
445 }
446 
447 static ParamIterator preProcessArguments(
448  std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
449  PromotionMap &PromotedAccs,
450  std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
451  std::vector<::jit_compiler::JITConstant> &JITConstants,
452  ParamList &NonIdenticalParams,
453  std::vector<::jit_compiler::ParameterIdentity> &ParamIdentities) {
454 
455  // Unused arguments are still in the list at this point (because we
456  // need them for accessor handling), but there's no pre-processing
457  // that needs to be done.
458  if (!Arg->Used) {
459  return ++Arg;
460  }
461 
462  if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
463  // Pointer arguments are only stored in the kernel functor object, which
464  // will go out-of-scope before we execute the fused kernel. Therefore, we
465  // need to copy the pointer (not the memory it's pointing to) to a permanent
466  // location and update the argument.
467  Arg->Arg.MPtr =
468  storePlainArg(ArgStorage, *static_cast<void **>(Arg->Arg.MPtr));
469  }
470  if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
471  // Standard layout arguments are only stored in the kernel functor object,
472  // which will go out-of-scope before we execute the fused kernel. Therefore,
473  // we need to copy the argument to a permanent location and update the
474  // argument.
475  if (Arg->Arg.MPtr) {
476  Arg->Arg.MPtr =
477  storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
478  // Propagate values of scalar parameters as constants to the JIT
479  // compiler.
480  JITConstants.emplace_back(
481  ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
482  Arg->Arg.MPtr, Arg->Arg.MSize);
483  }
484  // Standard layout arguments do not participate in identical argument
485  // detection, but we still add them to the list here. As the SYCL runtime can
486  // only check the raw bytes for identical content, but is unaware of the
487  // underlying datatype, some identities that would be detected here could
488  // not be materialized by the JIT compiler. Instead of removing some
489  // standard layout arguments due to identity and missing some in case the
490  // materialization is not possible, we rely on constant propagation to
491  // replace standard layout arguments by constants.
492  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
493  true);
494  return ++Arg;
495  }
496  // First check if there's already another parameter with identical
497  // value.
498  auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
499  if (Identical != NonIdenticalParams.end()) {
500  ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
501  ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
502  Identical->ArgIndex};
503  ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
504  ParamIdentities.push_back(Identity);
505  return ++Arg;
506  }
507 
508  if (Arg->Arg.MType == kernel_param_kind_t::kind_accessor) {
509  // Get local and private promotion information from accessors.
510  Requirement *Req = static_cast<Requirement *>(Arg->Arg.MPtr);
511  auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
512  auto PromotionTarget = Internalization.PromotionTarget;
513  if (PromotionTarget == Promotion::Private ||
514  PromotionTarget == Promotion::Local) {
515  // The accessor should be promoted.
516  if (Internalization.KernelIndex == Arg->KernelIndex &&
517  Internalization.ArgIndex == Arg->ArgIndex) {
518  // This is the first accessor for this buffer that should be
519  // internalized.
520  InternalizeParams.emplace_back(
521  ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
522  (PromotionTarget == Promotion::Private)
523  ? ::jit_compiler::Internalization::Private
524  : ::jit_compiler::Internalization::Local,
525  Internalization.LocalSize, Internalization.ElemSize);
526  // If an accessor will be promoted, i.e., if it has the promotion
527  // property attached to it, the next three arguments, that are
528  // associated with the accessor (access range, memory range, offset),
529  // must not participate in identical parameter detection or constant
530  // propagation, because their values will change if promotion happens.
531  // Therefore, we can just skip them here, but we need to remember which
532  // of them are used.
533  for (unsigned I = 0; I < 4; ++I) {
534  Internalization.UsedParams.push_back(Arg->Used);
535  ++Arg;
536  }
537  } else {
538  // We have previously encountered an accessor to the same buffer, which
539  // should be internalized. We can add parameter identities for the
540  // accessor argument and the next three arguments (range, memory range
541  // and offset, if they are used).
542  unsigned Increment = 0;
543  for (unsigned I = 0; I < 4; ++I) {
544  // If the argument is used in both cases, i.e., on the original
545  // accessor to be internalized, and this one, we can insert a
546  // parameter identity.
547  if (Arg->Used && Internalization.UsedParams[I]) {
548  ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
549  Arg->ArgIndex};
550  ::jit_compiler::Parameter IdenticalParam{
551  Internalization.KernelIndex,
552  Internalization.ArgIndex + Increment};
553  ::jit_compiler::ParameterIdentity Identity{ThisParam,
554  IdenticalParam};
555  ParamIdentities.push_back(Identity);
556  }
557  if (Internalization.UsedParams[I]) {
558  ++Increment;
559  }
560  ++Arg;
561  }
562  }
563  return Arg;
564  } else {
565  // The accessor will not be promoted, so it can participate in identical
566  // parameter detection.
567  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
568  true);
569  return ++Arg;
570  }
571  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
572  // No identical parameter exists, so add this to the list.
573  NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
574  true);
575  return ++Arg;
576  }
577  return ++Arg;
578 }
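// Each used argument is pre-processed into one or more of: an internalization
// request (InternalizeParams), a constant to propagate (JITConstants), or an
// identity linking it to an earlier, identical argument (ParamIdentities).
// Arguments that match none of these simply remain regular arguments of the
// fused kernel.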
579 
580 static void
581 updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
582  NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
583  std::vector<std::vector<char>> &FusedArgStorage) {
584  auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
585  assert(ArgUsageInfo.size() == FusedArgs.size());
586  for (size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
587  bool PromotedToPrivate =
588  (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
589  bool PromotedToLocal =
590  (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
591  if (PromotedToLocal || PromotedToPrivate) {
592  // For each internalized accessor, we need to override four arguments
593  // (see 'addArgsForGlobalAccessor' in handler.cpp for reference), i.e.,
594  // the pointer itself, plus twice the range and the offset.
595  auto &OldArgDesc = FusedArgs[ArgIndex];
596  assert(OldArgDesc.MType == kernel_param_kind_t::kind_accessor);
597  auto *Req = static_cast<Requirement *>(OldArgDesc.MPtr);
598 
599  // The stored args are all three-dimensional, but depending on the
600  // actual number of dimensions of the accessor, only a part of that
601  // argument is later on passed to the kernel.
602  const size_t SizeAccField =
603  sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
604  // Compute the local size and use it for the range parameters (only
605  // relevant for local promotion).
606  size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
607  Req, Promotion::Local)
608  : 0;
609  range<3> AccessRange{1, 1, LocalSize};
610  auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
611  // Use all-zero as the offset.
612  id<3> AccessOffset{0, 0, 0};
613  auto *OffsetArg = storePlainArg(FusedArgStorage, AccessOffset);
614 
615  // Override the arguments.
616  // 1. Override the pointer with a std-layout argument with 'nullptr' as
617  // value. handler.cpp does the same for local accessors.
618  int SizeInBytes = Req->MElemSize * LocalSize;
619  FusedArgs[ArgIndex] =
620  ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
621  static_cast<int>(ArgIndex)};
622  ++ArgIndex;
623  // 2. Access Range
624  FusedArgs[ArgIndex] =
625  ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
626  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
627  ++ArgIndex;
628  // 3. Memory Range
629  FusedArgs[ArgIndex] =
630  ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
631  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
632  ++ArgIndex;
633  // 4. Offset
634  FusedArgs[ArgIndex] =
635  ArgDesc{kernel_param_kind_t::kind_std_layout, OffsetArg,
636  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
637  ++ArgIndex;
638  } else {
639  ++ArgIndex;
640  }
641  }
642 }
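// Illustrative example of the override above: a 1-D accessor promoted to local
// memory with LocalSize == 64 and MElemSize == 4 becomes four std-layout
// arguments: a nullptr placeholder of 256 bytes (the local allocation), an
// access range and a memory range both set to {1, 1, 64} (of which only one
// size_t is passed for a 1-D accessor), and an all-zero offset.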
643 
644 std::unique_ptr<detail::CG>
645 jit_compiler::fuseKernels(QueueImplPtr Queue,
646  std::vector<ExecCGCommand *> &InputKernels,
647  const property_list &PropList) {
648  if (InputKernels.empty()) {
649  printPerformanceWarning("Fusion list is empty");
650  return nullptr;
651  }
652 
653  // Retrieve the device binary from each of the input
654  // kernels to hand them over to the JIT compiler.
655  std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
656  std::vector<std::string> InputKernelNames;
657  // Collect argument information from all input kernels.
658 
659  detail::CG::StorageInitHelper CGData;
660  std::vector<std::vector<char>> &ArgsStorage = CGData.MArgsStorage;
661  std::vector<detail::AccessorImplPtr> &AccStorage = CGData.MAccStorage;
662  std::vector<Requirement *> &Requirements = CGData.MRequirements;
663  std::vector<detail::EventImplPtr> &Events = CGData.MEvents;
664  std::vector<::jit_compiler::NDRange> Ranges;
665  sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig =
666  PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT;
667  unsigned KernelIndex = 0;
668  ParamList FusedParams;
669  PromotionMap PromotedAccs;
670  // TODO: Collect information about streams and figure out how
671  // to fuse them.
672  for (auto &RawCmd : InputKernels) {
673  auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
674  auto &CG = KernelCmd->getCG();
675  assert(KernelCmd->isFusable());
676  auto *KernelCG = static_cast<CGExecKernel *>(&CG);
677 
678  auto KernelName = KernelCG->MKernelName;
679  if (KernelName.empty()) {
680  printPerformanceWarning(
681  "Cannot fuse kernel with invalid kernel function name");
682  return nullptr;
683  }
684 
685  auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG);
686 
687  if (!DeviceImage || !Program) {
688  printPerformanceWarning("No suitable IR available for fusion");
689  return nullptr;
690  }
691  const KernelArgMask *EliminatedArgs = nullptr;
692  if (Program && (KernelCG->MSyclKernel == nullptr ||
693  !KernelCG->MSyclKernel->isCreatedFromSource())) {
694  EliminatedArgs =
695  ProgramManager::getInstance().getEliminatedKernelArgMask(
696  Program, KernelName);
697  }
698 
699  // Collect information about the arguments of this kernel.
700 
701  // Might need to sort the arguments in case they are not already sorted;
702  // see also the similar code in commands.cpp.
703  auto Args = KernelCG->MArgs;
704  std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
705  return A.MIndex < B.MIndex;
706  });
707 
708  // Determine whether the kernel has been subject to DPCPP's range rounding.
709  // If so, the first argument will be the original ("user") range.
710  std::optional<size_t> UserGlobalSize;
711  if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
712  KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
713  !Args.empty()) {
714  auto &A0 = Args[0];
715  [[maybe_unused]] auto Dims = KernelCG->MNDRDesc.Dims;
716  assert(A0.MPtr && A0.MSize == static_cast<int>(Dims * sizeof(size_t)) &&
717  A0.MType == kernel_param_kind_t::kind_std_layout &&
718  "Unexpected signature for rounded range kernel");
719 
720  size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
721  // Range-rounding only applies to the first dimension.
722  assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
723  assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
724  assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
725  UserGlobalSize = UGS[0];
726  }
727 
728  ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
729  size_t ArgIndex = 0;
730  // The kernel function in SPIR-V will only have the non-eliminated
731  // arguments, so keep track of this "actual" argument index.
732  unsigned ArgFunctionIndex = 0;
733  auto KindIt = ArgDescriptor.Kinds.begin();
734  auto UsageMaskIt = ArgDescriptor.UsageMask.begin();
735  for (auto &Arg : Args) {
736  *KindIt = translateArgType(Arg.MType);
737  ++KindIt;
738 
739  // DPC++ internally uses 'true' to indicate that an argument has been
740  // eliminated, while the JIT compiler uses 'true' to indicate an
741  // argument is used. Translate this here.
742  bool Eliminated = EliminatedArgs && !EliminatedArgs->empty() &&
743  (*EliminatedArgs)[ArgIndex++];
744  *UsageMaskIt = !Eliminated;
745  ++UsageMaskIt;
746 
747  // If the argument has not been eliminated, i.e., is still present on
748  // the kernel function in LLVM-IR/SPIR-V, collect information about the
749  // argument for performance optimizations in the JIT compiler.
750  if (!Eliminated) {
751  if (Arg.MType == kernel_param_kind_t::kind_accessor) {
752  resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
753  KernelCG->MNDRDesc, UserGlobalSize,
754  PromotedAccs);
755  }
756  FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
757  ++ArgFunctionIndex;
758  } else {
759  FusedParams.emplace_back(Arg, KernelIndex, 0, false);
760  }
761  }
762 
763  // TODO: Check for the correct kernel bundle state of the device image?
764  auto &RawDeviceImage = DeviceImage->getRawData();
765  auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
766  RawDeviceImage.BinaryStart);
767  // Set 0 as the number of address bits, because the JIT compiler can set
768  // this field based on information from the SPIR-V/LLVM module's data layout.
769  auto BinaryImageFormat =
770  translateBinaryImageFormat(DeviceImage->getFormat());
771  if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
772  printPerformanceWarning("No suitable IR available for fusion");
773  return nullptr;
774  }
775  ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
776  BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
777 
778  constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices {
779  return {Val.get(0), Val.get(1), Val.get(2)};
780  };
781 
782  auto &CurrentNDR = KernelCG->MNDRDesc;
783  const ::jit_compiler::NDRange JITCompilerNDR{
784  static_cast<int>(CurrentNDR.Dims),
785  SYCLTypeToIndices(CurrentNDR.GlobalSize),
786  SYCLTypeToIndices(CurrentNDR.LocalSize),
787  SYCLTypeToIndices(CurrentNDR.GlobalOffset)};
788 
789  Ranges.push_back(JITCompilerNDR);
790  InputKernelInfo.emplace_back(KernelName.c_str(), ArgDescriptor,
791  JITCompilerNDR, BinInfo);
792 
793  // Collect information for the fused kernel
794 
795  if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
796  // Some overloads of parallel_for_work_group only specify the number of
797  // work-groups, so this can be used to identify hierarchical parallel
798  // kernels, which are not supported by fusion for now.
799  printPerformanceWarning(
800  "Cannot fuse kernel with hierarchical parallelism");
801  return nullptr;
802  // Not all overloads of parallel_for_work_group only specify the number of
803  // work-groups, so the above mechanism might not detect all hierarchical
804  // parallelism.
805  // TODO: Find a more reliable way to detect hierarchical parallelism.
806  }
807 
808  // We need to copy the storages here. The input CGs might be freed
809  // before the fused kernel gets executed, so copying the storages
810  // ensures the arguments don't die on us before the fused kernel
811  // has run.
812  ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
813  KernelCG->getArgsStorage().end());
814  AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
815  KernelCG->getAccStorage().end());
816  // TODO: Does the MSharedPtrStorage contain any information about actual
817  // shared pointers beside the kernel bundle and handler impl? If yes, we
818  // might need to copy it here.
819  Requirements.insert(Requirements.end(), KernelCG->getRequirements().begin(),
820  KernelCG->getRequirements().end());
821  Events.insert(Events.end(), KernelCG->getEvents().begin(),
822  KernelCG->getEvents().end());
823 
824  // If all kernels have the same cache config then use it for the merged
825  // kernel, otherwise use default configuration.
826  if (KernelIndex == 0) {
827  KernelCacheConfig = KernelCG->MKernelCacheConfig;
828  } else if (KernelCG->MKernelCacheConfig != KernelCacheConfig) {
829  KernelCacheConfig = PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT;
830  }
831 
832  ++KernelIndex;
833  }
834 
835  // Pre-process the arguments, to detect identical parameters or arguments that
836  // can be constant-propagated by the JIT compiler.
837  std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
838  std::vector<::jit_compiler::JITConstant> JITConstants;
839  std::vector<::jit_compiler::ParameterIdentity> ParamIdentities;
840  ParamList NonIdenticalParameters;
841  for (auto PI = FusedParams.begin(); PI != FusedParams.end();) {
842  PI = preProcessArguments(ArgsStorage, PI, PromotedAccs, InternalizeParams,
843  JITConstants, NonIdenticalParameters,
844  ParamIdentities);
845  }
846 
847  // Retrieve barrier flags.
848  ::jit_compiler::BarrierFlags BarrierFlags =
849  (PropList
850  .has_property<ext::codeplay::experimental::property::no_barriers>())
851  ? ::jit_compiler::getNoBarrierFlag()
852  : ::jit_compiler::getLocalAndGlobalBarrierFlag();
853 
854  static size_t FusedKernelNameIndex = 0;
855  auto FusedKernelName = "fused_" + std::to_string(FusedKernelNameIndex++);
856  ::jit_compiler::KernelFusion::resetConfiguration();
857  bool DebugEnabled =
858  detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0;
859  ::jit_compiler::KernelFusion::set<::jit_compiler::option::JITEnableVerbose>(
860  DebugEnabled);
861  ::jit_compiler::KernelFusion::set<::jit_compiler::option::JITEnableCaching>(
862  detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
863 
864  ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
865  ::jit_compiler::BinaryFormat TargetFormat = TargetInfo.getFormat();
866  ::jit_compiler::KernelFusion::set<::jit_compiler::option::JITTargetInfo>(
867  std::move(TargetInfo));
868 
869  using ::jit_compiler::View;
870  auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
871  View{InputKernelInfo}, FusedKernelName.c_str(), View(ParamIdentities),
872  BarrierFlags, View(InternalizeParams), View(JITConstants));
873 
874  if (FusionResult.failed()) {
875  if (DebugEnabled) {
876  std::cerr
877  << "ERROR: JIT compilation for kernel fusion failed with message:\n"
878  << FusionResult.getErrorMessage() << "\n";
879  }
880  return nullptr;
881  }
882 
883  auto &FusedKernelInfo = FusionResult.getKernelInfo();
884  std::string FusedOrCachedKernelName{FusedKernelInfo.Name.c_str()};
885 
886  std::vector<ArgDesc> FusedArgs;
887  int FusedArgIndex = 0;
888  for (auto &Param : FusedParams) {
889  // Add to the argument list of the fused kernel, but with the correct
890  // new index in the fused kernel.
891  auto &Arg = Param.Arg;
892  FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
893  }
894 
895  // Update the kernel arguments for internalized accessors.
896  const auto NDRDesc = [](const auto &ND) -> NDRDescT {
897  constexpr auto ToSYCLType = [](const auto &Indices) -> sycl::range<3> {
898  return {Indices[0], Indices[1], Indices[2]};
899  };
900  NDRDescT NDRDesc;
901  NDRDesc.Dims = ND.getDimensions();
902  NDRDesc.GlobalSize = ToSYCLType(ND.getGlobalSize());
903  NDRDesc.LocalSize = ToSYCLType(ND.getLocalSize());
904  NDRDesc.GlobalOffset = ToSYCLType(ND.getOffset());
905  return NDRDesc;
906  }(FusedKernelInfo.NDR);
907  updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
908 
909  if (!FusionResult.cached()) {
910  auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
911  ProgramManager::getInstance().addImages(PIDeviceBinaries);
912  } else {
913  if (DebugEnabled) {
914  std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
915  }
916  }
917 
918  // Create a kernel bundle for the fused kernel.
919  // Kernel bundles are stored in the CG as one of the "extended" members.
920  auto FusedKernelId = ProgramManager::getInstance().getSYCLKernelID(
921  FusedOrCachedKernelName);
922 
923  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr;
924  if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) {
925  KernelBundleImplPtr = detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
926  Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
927  }
928 
929  std::unique_ptr<detail::CG> FusedCG;
930  FusedCG.reset(new detail::CGExecKernel(
931  NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
932  std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {},
933  CG::CGTYPE::Kernel, KernelCacheConfig, false /* KernelIsCooperative */));
934  return FusedCG;
935 }
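// Illustrative sketch of how this entry point is reached from user code
// (assuming the sycl_ext_codeplay_kernel_fusion extension API; not part of
// this file):
//
//   sycl::ext::codeplay::experimental::fusion_wrapper Fw{Queue};
//   Fw.start_fusion();
//   Queue.submit(CgfA); // command groups are staged instead of executed
//   Queue.submit(CgfB);
//   Fw.complete_fusion(
//       {sycl::ext::codeplay::experimental::property::no_barriers{}});
//
// complete_fusion eventually hands the staged ExecCGCommands to fuseKernels
// above.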
936 
937 pi_device_binaries jit_compiler::createPIDeviceBinary(
938  const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
939  ::jit_compiler::BinaryFormat Format) {
940 
941  const char *TargetSpec = nullptr;
942  pi_device_binary_type BinFormat = PI_DEVICE_BINARY_TYPE_NATIVE;
943  switch (Format) {
944  case ::jit_compiler::BinaryFormat::PTX: {
945  TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
946  BinFormat = PI_DEVICE_BINARY_TYPE_NONE;
947  break;
948  }
949  case ::jit_compiler::BinaryFormat::AMDGCN: {
950  TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN;
951  BinFormat = PI_DEVICE_BINARY_TYPE_NONE;
952  break;
953  }
954  case ::jit_compiler::BinaryFormat::SPIRV: {
955  TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
956  ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64
957  : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32;
958  BinFormat = PI_DEVICE_BINARY_TYPE_SPIRV;
959  break;
960  }
961  default:
962  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
963  "Invalid output format");
964  }
965 
966  std::string FusedKernelName{FusedKernelInfo.Name.c_str()};
967  DeviceBinaryContainer Binary;
968 
969  // Create an offload entry for the fused kernel.
970  // It seems to be OK to set zero for most of the information here, at least
971  // that is the case for compiled SPIR-V binaries.
972  OffloadEntryContainer Entry{FusedKernelName, nullptr, 0, 0, 0};
973  Binary.addOffloadEntry(std::move(Entry));
974 
975  // Create a property entry for the argument usage mask for the fused kernel.
976  auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
977  PropertyContainer ArgMaskProp{FusedKernelName, ArgMask.data(), ArgMask.size(),
978  pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
979 
980  // Create a property set for the argument usage masks of all kernels
981  // (currently only one).
982  PropertySetContainer ArgMaskPropSet{
983  __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO};
984 
985  ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
986 
987  Binary.addProperty(std::move(ArgMaskPropSet));
988 
989  if (Format == ::jit_compiler::BinaryFormat::PTX ||
990  Format == ::jit_compiler::BinaryFormat::AMDGCN) {
991  // Add a program metadata property with the reqd_work_group_size attribute.
992  // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
993  auto ReqdWGS = std::find_if(
994  FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(),
995  [](const ::jit_compiler::SYCLKernelAttribute &Attr) {
996  return Attr.Kind == ::jit_compiler::SYCLKernelAttribute::AttrKind::
997  ReqdWorkGroupSize;
998  });
999  if (ReqdWGS != FusedKernelInfo.Attributes.end()) {
1000  auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS);
1001  std::stringstream PropName;
1002  PropName << FusedKernelInfo.Name.c_str();
1003  PropName << __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE;
1004  PropertyContainer ReqdWorkGroupSizeProp{
1005  PropName.str(), Encoded.data(), Encoded.size(),
1006  pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
1007  PropertySetContainer ProgramMetadata{
1008  __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA};
1009  ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp));
1010  Binary.addProperty(std::move(ProgramMetadata));
1011  }
1012  }
1013  if (Format == ::jit_compiler::BinaryFormat::AMDGCN) {
1014  PropertyContainer NeedFinalization{
1015  __SYCL_PI_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1};
1016  PropertySetContainer ProgramMetadata{
1017  __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA};
1018  ProgramMetadata.addProperty(std::move(NeedFinalization));
1019  Binary.addProperty(std::move(ProgramMetadata));
1020  }
1021 
1022  DeviceBinariesCollection Collection;
1023  Collection.addDeviceBinary(
1024  std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
1025  FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
1026 
1027  JITDeviceBinaries.push_back(std::move(Collection));
1028  return JITDeviceBinaries.back().getPIDeviceStruct();
1029 }
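// The container built above mirrors what the offload wrapper emits for a fat
// binary at compile time: a single device binary whose offload-entry table
// names the fused kernel, plus property sets for the argument usage mask and,
// for PTX/AMDGCN, the program metadata. This lets the ProgramManager treat the
// JIT-compiled image like any other device image.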
1030 
1031 std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
1032  const ::jit_compiler::ArgUsageMask &Mask) const {
1033  // This must match the decoding logic in program_manager.cpp.
1034  constexpr uint64_t NBytesForSize = 8;
1035  constexpr uint64_t NBitsInElement = 8;
1036  uint64_t Size = static_cast<uint64_t>(Mask.size());
1037  // Round the size to the next multiple of 8
1038  uint64_t RoundedSize =
1039  ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
1040  std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
1041  0u);
1042  // First encode the size of the actual mask
1043  for (size_t i = 0; i < NBytesForSize; ++i) {
1044  uint8_t Byte =
1045  static_cast<uint8_t>((RoundedSize >> i * NBitsInElement) & 0xFF);
1046  Encoded[i] = Byte;
1047  }
1048  // Encode the actual mask bit-wise
1049  for (size_t i = 0; i < Size; ++i) {
1050  // DPC++ internally uses 'true' to indicate that an argument has been
1051  // eliminated, while the JIT compiler uses 'true' to indicate an argument
1052  // is used. Translate this here.
1053  if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
1054  uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
1055  Byte |= static_cast<uint8_t>((1 << (i % NBitsInElement)));
1056  }
1057  }
1058  return Encoded;
1059 }
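// Worked example of the encoding above: for a mask of 3 arguments where only
// argument 1 is unused, the rounded size is 8, so the result is 8 size bytes
// (little-endian value 8) followed by a single mask byte 0b00000010.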
1060 
1061 std::vector<uint8_t> jit_compiler::encodeReqdWorkGroupSize(
1062  const ::jit_compiler::SYCLKernelAttribute &Attr) const {
1063  assert(Attr.Kind ==
1064  ::jit_compiler::SYCLKernelAttribute::AttrKind::ReqdWorkGroupSize);
1065  size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t));
1066  std::vector<uint8_t> Encoded(NumBytes, 0u);
1067  uint8_t *Ptr = Encoded.data();
1068  // Skip 64-bit wide size argument with value 0 at the start of the data.
1069  // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
1070  Ptr += sizeof(uint64_t);
1071  for (const auto &Val : Attr.Values) {
1072  auto UVal = static_cast<uint32_t>(Val);
1073  std::memcpy(Ptr, &UVal, sizeof(uint32_t));
1074  Ptr += sizeof(uint32_t);
1075  }
1076  return Encoded;
1077 }
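// Illustrative example: reqd_work_group_size(32, 1, 1) is encoded as 8 zero
// bytes (the unused 64-bit size field) followed by the three 32-bit values
// 32, 1, 1, matching the layout the CUDA/HIP plugins expect for program
// metadata.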
1078 
1079 } // namespace detail
1080 } // namespace _V1
1081 } // namespace sycl
1082 
1083 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION