DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
handler.hpp
//==-------- handler.hpp --- SYCL command group handler --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <CL/sycl/accessor.hpp>
#include <CL/sycl/context.hpp>
#include <CL/sycl/detail/cg.hpp>
#include <CL/sycl/detail/cg_types.hpp>
#include <CL/sycl/detail/cl.h>
#include <CL/sycl/detail/export.hpp>
#include <CL/sycl/detail/os_util.hpp>
#include <CL/sycl/event.hpp>
#include <CL/sycl/id.hpp>
#include <CL/sycl/interop_handle.hpp>
#include <CL/sycl/item.hpp>
#include <CL/sycl/kernel.hpp>
#include <CL/sycl/kernel_bundle.hpp>
#include <CL/sycl/kernel_handler.hpp>
#include <CL/sycl/nd_item.hpp>
#include <CL/sycl/nd_range.hpp>
#include <CL/sycl/sampler.hpp>
#include <CL/sycl/stl.hpp>

#include <functional>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>

// SYCL_LANGUAGE_VERSION is the 4-digit year followed by the 2-digit revision.
#if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
#define __SYCL_NONCONST_FUNCTOR__
#endif
42 
template <typename DataT, int Dimensions, cl::sycl::access::mode AccessMode,
          cl::sycl::access::target AccessTarget,
          cl::sycl::access::placeholder IsPlaceholder>
class __fill;

template <typename T> class __usmfill;

template <typename T_Src, typename T_Dst, int Dims,
          cl::sycl::access::mode AccessMode,
          cl::sycl::access::target AccessTarget,
          cl::sycl::access::placeholder IsPlaceholder>
class __copyAcc2Ptr;

template <typename T_Src, typename T_Dst, int Dims,
          cl::sycl::access::mode AccessMode,
          cl::sycl::access::target AccessTarget,
          cl::sycl::access::placeholder IsPlaceholder>
class __copyPtr2Acc;

template <typename T_Src, int Dims_Src, cl::sycl::access::mode AccessMode_Src,
          cl::sycl::access::target AccessTarget_Src, typename T_Dst,
          int Dims_Dst, cl::sycl::access::mode AccessMode_Dst,
          cl::sycl::access::target AccessTarget_Dst,
          cl::sycl::access::placeholder IsPlaceholder_Src,
          cl::sycl::access::placeholder IsPlaceholder_Dst>
class __copyAcc2Acc;

// For unit testing purposes
class MockHandler;
72 
__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {

// Forward declarations

class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;
namespace detail {

class handler_impl;
class kernel_impl;
class queue_impl;
class stream_impl;
template <typename DataT, int Dimensions, access::mode AccessMode,
          access::target AccessTarget, access::placeholder IsPlaceholder>
class image_accessor;
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg) const);

// Non-const version of the above template to match functors whose 'operator()'
// is declared without the 'const' qualifier.
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg));

// template <typename RetType, typename Func>
// static void member_ptr_helper(RetType (Func::*)() const);

// template <typename RetType, typename Func>
// static void member_ptr_helper(RetType (Func::*)());

template <typename F, typename SuggestedArgType>
decltype(member_ptr_helper(&F::operator())) argument_helper(int);

template <typename F, typename SuggestedArgType>
SuggestedArgType argument_helper(...);

template <typename F, typename SuggestedArgType>
using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
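
// Illustrative sketch (not part of the original header): lambda_arg_type
// yields the parameter type of a functor's 'operator()' when one can be
// deduced, and falls back to SuggestedArgType otherwise (e.g. for generic
// lambdas). 'MyFunctor' below is a hypothetical example type.
//
//   struct MyFunctor {
//     void operator()(item<1> It) const { (void)It; }
//   };
//   static_assert(std::is_same<lambda_arg_type<MyFunctor, id<1>>,
//                              item<1>>::value,
//                 "deduced from MyFunctor::operator()");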

// Used when the parallel_for range is rounded up.
template <typename Name> class __pf_kernel_wrapper;

template <typename Type> struct get_kernel_wrapper_name_t {
  using name = __pf_kernel_wrapper<Type>;
};

__SYCL_EXPORT device getDeviceFromHandler(handler &);
121 
122 #if __SYCL_ID_QUERIES_FIT_IN_INT__
123 template <typename T> struct NotIntMsg;
124 
125 template <int Dims> struct NotIntMsg<range<Dims>> {
126  constexpr static const char *Msg =
127  "Provided range is out of integer limits. Pass "
128  "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
129 };
130 
131 template <int Dims> struct NotIntMsg<id<Dims>> {
132  constexpr static const char *Msg =
133  "Provided offset is out of integer limits. Pass "
134  "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
135 };
136 #endif
137 
138 #if __SYCL_ID_QUERIES_FIT_IN_INT__
139 template <typename T, typename ValT>
140 typename detail::enable_if_t<std::is_same<ValT, size_t>::value ||
141  std::is_same<ValT, unsigned long long>::value>
142 checkValueRangeImpl(ValT V) {
143  static constexpr size_t Limit =
144  static_cast<size_t>((std::numeric_limits<int>::max)());
145  if (V > Limit)
146  throw runtime_error(NotIntMsg<T>::Msg, PI_INVALID_VALUE);
147 }
148 #endif
149 
150 template <int Dims, typename T>
151 typename detail::enable_if_t<std::is_same<T, range<Dims>>::value ||
152  std::is_same<T, id<Dims>>::value>
153 checkValueRange(const T &V) {
154 #if __SYCL_ID_QUERIES_FIT_IN_INT__
155  for (size_t Dim = 0; Dim < Dims; ++Dim)
156  checkValueRangeImpl<T>(V[Dim]);
157 
158  {
159  unsigned long long Product = 1;
160  for (size_t Dim = 0; Dim < Dims; ++Dim) {
161  Product *= V[Dim];
162  // check value now to prevent product overflow in the end
163  checkValueRangeImpl<T>(Product);
164  }
165  }
166 #else
167  (void)V;
168 #endif
169 }
170 
171 template <int Dims>
172 void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
173 #if __SYCL_ID_QUERIES_FIT_IN_INT__
174  checkValueRange<Dims>(R);
175  checkValueRange<Dims>(O);
176 
177  for (size_t Dim = 0; Dim < Dims; ++Dim) {
178  unsigned long long Sum = R[Dim] + O[Dim];
179 
180  checkValueRangeImpl<range<Dims>>(Sum);
181  }
182 #else
183  (void)R;
184  (void)O;
185 #endif
186 }
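
// Illustrative sketch (not part of the original header): with
// __SYCL_ID_QUERIES_FIT_IN_INT__ enabled, a range whose linearized size
// exceeds INT_MAX is rejected even though every component fits individually:
//
//   range<2> R{100000, 100000}; // 10^10 total items > 2^31 - 1
//   checkValueRange<2>(R);      // throws runtime_error(PI_INVALID_VALUE)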

template <int Dims, typename T>
typename detail::enable_if_t<std::is_same<T, nd_range<Dims>>::value>
checkValueRange(const T &V) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(V.get_global_range());
  checkValueRange<Dims>(V.get_local_range());
  checkValueRange<Dims>(V.get_offset());

  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
#else
  (void)V;
#endif
}

template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernel {
public:
  RoundedRangeKernel(range<Dims> NumWorkItems, KernelType KernelFunc)
      : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}

  void operator()(TransformedArgType Arg) const {
    if (Arg[0] >= NumWorkItems[0])
      return;
    Arg.set_allowed_range(NumWorkItems);
    KernelFunc(Arg);
  }

private:
  range<Dims> NumWorkItems;
  KernelType KernelFunc;
};

template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernelWithKH {
public:
  RoundedRangeKernelWithKH(range<Dims> NumWorkItems, KernelType KernelFunc)
      : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}

  void operator()(TransformedArgType Arg, kernel_handler KH) const {
    if (Arg[0] >= NumWorkItems[0])
      return;
    Arg.set_allowed_range(NumWorkItems);
    KernelFunc(Arg, KH);
  }

private:
  range<Dims> NumWorkItems;
  KernelType KernelFunc;
};
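
// Illustrative behavior (not part of the original header): when a range of
// 1000 work-items is rounded up to 1024, the wrapper is launched on 1024
// items; the 24 items with index 1000..1023 return early, so the user's
// functor observes exactly the original range:
//
//   RoundedRangeKernel<item<1>, 1, decltype(F)> Wrapped{range<1>{1000}, F};
//   // Launched on range<1>{1024}: calls F for items 0..999 only.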

} // namespace detail

namespace ext {
namespace oneapi {
namespace detail {
template <typename T, class BinaryOperation, int Dims, bool IsUSM,
          access::placeholder IsPlaceholder>
class reduction_impl;

using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
                size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
                Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                   const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu);

template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
              Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims,
          typename... Reductions, size_t... Is>
void reduCGFunc(handler &CGH, KernelType KernelFunc,
                const nd_range<Dims> &Range,
                std::tuple<Reductions...> &ReduTuple,
                std::index_sequence<Is...>);

template <typename KernelName, typename KernelType, typename... Reductions,
          size_t... Is>
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...>);

template <typename KernelName, class Reduction>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);

template <typename KernelName, class Reduction>
std::enable_if_t<Reduction::is_usm>
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);

template <typename... Reduction, size_t... Is>
std::shared_ptr<event>
reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
                             bool IsHost, std::tuple<Reduction...> &ReduTuple,
                             std::index_sequence<Is...>);

template <typename Reduction, typename... RestT>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
                                   std::shared_ptr<detail::queue_impl> Queue,
                                   bool IsHost, Reduction &Redu, RestT... Rest);

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);

__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem);

template <typename... ReductionT, size_t... Is>
size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
                             std::index_sequence<Is...>);

template <typename TupleT, std::size_t... Is>
std::tuple<std::tuple_element_t<Is, TupleT>...>
tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);

template <typename FirstT, typename... RestT> struct AreAllButLastReductions;

} // namespace detail
} // namespace oneapi
} // namespace ext

/// Command group handler class.
///
/// Objects of the handler class collect information about a command group,
/// such as the kernel, memory requirements and arguments for the kernel.
class __SYCL_EXPORT handler {
private:
  /// Constructs a SYCL handler from a queue.
  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);

  /// Constructs a SYCL handler from the associated queue and the submission's
  /// primary and secondary queue.
  handler(std::shared_ptr<detail::queue_impl> Queue,
          std::shared_ptr<detail::queue_impl> PrimaryQueue,
          std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);

  /// Stores a copy of an argument of trivially copyable type T in the
  /// handler-owned argument storage.
  template <typename T, typename F = typename detail::remove_const_t<
                            typename detail::remove_reference_t<T>>>
  F *storePlainArg(T &&Arg) {
    MArgsStorage.emplace_back(sizeof(T));
    auto Storage = reinterpret_cast<F *>(MArgsStorage.back().data());
    *Storage = Arg;
    return Storage;
  }

  void setType(detail::CG::CGTYPE Type) {
    constexpr detail::CG::CG_VERSION Version = detail::CG::CG_VERSION::V1;
    MCGType = static_cast<detail::CG::CGTYPE>(
        getVersionedCGType(Type, static_cast<int>(Version)));
  }

  detail::CG::CGTYPE getType() {
    return static_cast<detail::CG::CGTYPE>(getUnversionedCGType(MCGType));
  }

  void throwIfActionIsCreated() {
    if (detail::CG::None != getType())
      throw sycl::runtime_error("Attempt to set multiple actions for the "
                                "command group. Command group must consist of "
                                "a single kernel or explicit memory operation.",
                                CL_INVALID_OPERATION);
  }
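
  // Illustrative sketch (not part of the original header): a command group
  // may contain at most one action, so scheduling a second kernel on the
  // same handler trips throwIfActionIsCreated(); 'Q' is a hypothetical queue:
  //
  //   Q.submit([&](handler &CGH) {
  //     CGH.single_task<class FirstAction>([] {});
  //     // CGH.single_task<class SecondAction>([] {}); // would throw
  //     // sycl::runtime_error(CL_INVALID_OPERATION)
  //   });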

  /// Extracts and prepares kernel arguments from the lambda using integration
  /// header information.
  void
  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                               const detail::kernel_param_desc_t *KernelArgs);

  void
  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                               const detail::kernel_param_desc_t *KernelArgs,
                               bool IsESIMD);

  /// Extracts and prepares kernel arguments that were set via set_arg(s).
  void extractArgsAndReqs();

  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                  const int Size, const size_t Index, size_t &IndexShift,
                  bool IsKernelCreatedFromSource);

  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                  const int Size, const size_t Index, size_t &IndexShift,
                  bool IsKernelCreatedFromSource, bool IsESIMD);

  /// \return a string containing the name of the SYCL kernel.
  std::string getKernelName();

  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
    // TODO It is unclear whether the kernel name and the lambda/functor name
    // must be equal when parallel_for is invoked with both a sycl::kernel and
    // a lambda/functor. For now, if they are equal we extract the arguments
    // for the kernel from the lambda/functor; otherwise set_arg(s) must be
    // used to resolve the order and values of the kernel arguments.
    assert(MKernel && "MKernel is not initialized");
    const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
    const std::string KernelName = getKernelName();
    return LambdaName == KernelName;
  }

  /// Saves the location of user's code passed in \p CodeLoc for future usage.
  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }

  /// Constructs a CG object of a specific type, passes it to the Scheduler
  /// and returns a sycl::event representing the command group. It is expected
  /// to be the last method executed before object destruction.
  event finalize();

  /// Saves streams associated with this handler.
  ///
  /// Streams are then forwarded to the command group and flushed in the
  /// scheduler.
  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
    MStreamStorage.push_back(Stream);
  }

  /// Saves reduction objects created for this handler so that they stay
  /// alive until the command group finishes its work.
  void addReduction(const std::shared_ptr<const void> &ReduObj) {
    MSharedPtrStorage.push_back(ReduObj);
  }

  ~handler() = default;

  bool is_host() { return MIsHost; }

  void associateWithHandler(detail::AccessorBaseHost *AccBase,
                            access::target AccTarget);

  // Recursively calls itself until the argument pack is fully processed.
  // The version for regular (standard-layout) arguments.
  template <typename T, typename... Ts>
  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... Args) {
    set_arg(ArgIndex, std::move(Arg));
    setArgsHelper(++ArgIndex, std::move(Args)...);
  }

  void setArgsHelper(int) {}

  // setArgHelper for local accessor argument.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::placeholder IsPlaceholder>
  void setArgHelper(int ArgIndex,
                    accessor<DataT, Dims, AccessMode, access::target::local,
                             IsPlaceholder> &&Arg) {
    detail::LocalAccessorBaseHost *LocalAccBase =
        (detail::LocalAccessorBaseHost *)&Arg;
    detail::LocalAccessorImplPtr LocalAccImpl =
        detail::getSyclObjImpl(*LocalAccBase);
    detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
    MLocalAccStorage.push_back(std::move(LocalAccImpl));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(access::target::local), ArgIndex);
  }

  // setArgHelper for non-local accessor argument.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  typename detail::enable_if_t<AccessTarget != access::target::local, void>
  setArgHelper(
      int ArgIndex,
      accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
    detail::Requirement *Req = AccImpl.get();
    // Add accessor to the list of requirements.
    MRequirements.push_back(Req);
    // Store copy of the accessor.
    MAccStorage.push_back(std::move(AccImpl));
    // Add accessor to the list of arguments.
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(AccessTarget), ArgIndex);
  }

  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));

    if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
                         sizeof(T), ArgIndex);
    } else {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
                         StoredArg, sizeof(T), ArgIndex);
    }
  }

  void setArgHelper(int ArgIndex, sampler &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
                       sizeof(sampler), ArgIndex);
  }

  void verifyKernelInvoc(const kernel &Kernel) {
    if (is_host()) {
      throw invalid_object_error(
          "This kernel invocation method cannot be used on the host",
          PI_INVALID_DEVICE);
    }
    if (Kernel.is_host()) {
      throw invalid_object_error("Invalid kernel type, OpenCL expected",
                                 PI_INVALID_KERNEL);
    }
  }

  /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
   * its argument. Since the ESIMD plugin directly invokes the kernel (it does
   * not use piKernelSetArg), the kernel argument type must be known to the
   * plugin. However, passing the kernel argument type to the plugin requires
   * changing the ABI in the HostKernel class. To overcome this problem, the
   * helpers below wrap the "original" kernel with a functor that always takes
   * an nd_item as argument. A functor is used instead of a lambda because
   * extractArgsAndReqsFromLambda needs access to the "original" kernel and
   * keeps references to its internal data, i.e. the kernel passed as argument
   * cannot be local in scope. The functor itself is again encapsulated in a
   * std::function since the functor's type is unknown to the plugin.
   */

  // For 'id, item w/wo offset, nd_item' kernel arguments
  template <class KernelType, class NormalizedKernelType, int Dims>
  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
    NormalizedKernelType NormalizedKernel(KernelFunc);
    auto NormalizedKernelFunc =
        std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
    auto HostKernelPtr =
        new detail::HostKernel<decltype(NormalizedKernelFunc),
                               sycl::nd_item<Dims>, Dims>(NormalizedKernelFunc);
    MHostKernel.reset(HostKernelPtr);
    return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
                ->MKernelFunc;
  }

  // For 'sycl::id<Dims>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::nd_item<Dims>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::nd_item<Dims>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        detail::runKernelWithArg(MKernelFunc, Arg);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::item<Dims, without_offset>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::item<Dims, false>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
            Arg.get_global_range(), Arg.get_global_id());
        detail::runKernelWithArg(MKernelFunc, Item);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::item<Dims, with_offset>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::item<Dims, true>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
            Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
        detail::runKernelWithArg(MKernelFunc, Item);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  /* The 'wrapper'-based approach using 'NormalizedKernelType' is not applied
   * to kernels with a 'void(void)' or 'void(sycl::group<Dims>)' signature.
   * 'void(void)' has no argument to normalize, and 'void(sycl::group<Dims>)'
   * is not supported in ESIMD.
   */
  // For 'void' and 'sycl::group<Dims>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, void>::value ||
                              std::is_same<ArgT, sycl::group<Dims>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    MHostKernel.reset(
        new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
    return (KernelType *)(MHostKernel->getPtr());
  }

  /// Verifies the kernel bundle to be used, if any is set. Throws if the
  /// bundle does not contain a suitable device image for the requested kernel.
  void verifyUsedKernelBundle(const std::string &KernelName);

  /// Stores the lambda in a template-free object.
  ///
  /// Also initializes the kernel name and prepares the lambda arguments.
  template <typename KernelName, typename KernelType, int Dims,
            typename LambdaArgType>
  void StoreLambda(KernelType KernelFunc) {
    using KI = detail::KernelInfo<KernelName>;

    constexpr bool IsCallableWithKernelHandler =
        detail::KernelLambdaHasKernelHandlerArgT<KernelType,
                                                 LambdaArgType>::value;

    if (IsCallableWithKernelHandler && MIsHost) {
      throw cl::sycl::feature_not_supported(
          "kernel_handler is not yet supported by host device.",
          PI_INVALID_OPERATION);
    }
    KernelType *KernelPtr =
        ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);

    // An empty name indicates that the compilation happens without an
    // integration header, so skip the steps that require it.
    if (KI::getName() != nullptr && KI::getName()[0] != '\0') {
      // TODO support ESIMD in no-integration-header case too.
      MArgs.clear();
      extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
                                   KI::getNumParams(), &KI::getParamDesc(0),
                                   KI::isESIMD());
      MKernelName = KI::getName();
      MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName());
    } else {
      // Without an integration header, the accessors associated with this
      // handler must be processed as the kernel arguments.
      MArgs = std::move(MAssociatedAccesors);
    }

    // If the kernel lambda is callable with a kernel_handler argument,
    // manifest the associated kernel handler.
    if (IsCallableWithKernelHandler) {
      getOrInsertHandlerKernelBundle(/*Insert=*/true);
    }
  }

  /// Checks whether it is possible to copy the source shape to the
  /// destination shape (the shapes are described by the accessor ranges) by
  /// copying regions of memory rather than element by element.
  template <int Dims_Src, int Dims_Dst>
  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
                                           const range<Dims_Dst> Dst) {
    if (Dims_Src > Dims_Dst)
      return false;
    for (size_t I = 0; I < Dims_Src; ++I)
      if (Src[I] > Dst[I])
        return false;
    return true;
  }
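
  // Illustrative sketch (not part of the original header): a {2, 3} region
  // fits into a {3, 4} destination, while {4, 2} does not because its first
  // dimension exceeds the destination's:
  //
  //   IsCopyingRectRegionAvailable(range<2>{2, 3}, range<2>{3, 4}); // true
  //   IsCopyingRectRegionAvailable(range<2>{4, 2}, range<2>{3, 4}); // false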

  // TODO: Delete these functions when ABI-breaking changes are allowed.
  // They are currently unused, but they are static members of the exported
  // class 'handler', made it into the SYCL library some time ago, and must
  // stay there for now.
  static id<1> getDelinearizedIndex(const range<1> Range, const size_t Index) {
    return detail::getDelinearizedId(Range, Index);
  }
  static id<2> getDelinearizedIndex(const range<2> Range, const size_t Index) {
    return detail::getDelinearizedId(Range, Index);
  }
  static id<3> getDelinearizedIndex(const range<3> Range, const size_t Index) {
    return detail::getDelinearizedId(Range, Index);
  }

  /// Handles some special cases of the copy operation from one accessor
  /// to another accessor. \return true if the copy was handled here.
  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  detail::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost &&
        IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
      return false;

    range<1> LinearizedRange(Src.size());
    parallel_for<class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc,
                                     TDst, DimDst, ModeDst, TargetDst,
                                     IsPHSrc, IsPHDst>>
        (LinearizedRange, [=](id<1> Id) {
          size_t Index = Id[0];
          id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
          id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
          Dst[DstId] = Src[SrcId];
        });
    return true;
  }

  /// Handles the copy between zero-dimensional accessors on the host.
  /// \return true if the copy was done here, false otherwise.
  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  detail::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost)
      return false;

    single_task<class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc,
                                    TDst, DimDst, ModeDst, TargetDst,
                                    IsPHSrc, IsPHDst>> ([=]() {
      *(Dst.get_pointer()) = *(Src.get_pointer());
    });
    return true;
  }

#ifndef __SYCL_DEVICE_ONLY__
  /// Copies the content of the memory object accessed by \p Src into the
  /// memory pointed to by \p Dst (host version, Dim > 0).
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<(Dim > 0)>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    range<Dim> Range = Src.get_range();
    parallel_for<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        (Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          using TSrcNonConst = typename detail::remove_const_t<TSrc>;
          (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
        });
  }

  /// Copies the content of the zero-dimensional memory object accessed by
  /// \p Src into the memory pointed to by \p Dst (host version, Dim == 0).
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<Dim == 0>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        ([=]() {
          using TSrcNonConst = typename detail::remove_const_t<TSrc>;
          *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
        });
  }

  /// Copies the memory pointed to by \p Src into the memory object accessed
  /// by \p Dst (host version, Dim > 0).
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<(Dim > 0)>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    range<Dim> Range = Dst.get_range();
    parallel_for<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        (Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
        });
  }

  /// Copies the memory pointed to by \p Src into the zero-dimensional memory
  /// object accessed by \p Dst (host version, Dim == 0).
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<Dim == 0>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        ([=]() {
          *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
        });
  }
#endif // __SYCL_DEVICE_ONLY__

  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
    return AccessTarget == access::target::device ||
           AccessTarget == access::target::constant_buffer;
  }

  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
    return AccessTarget == access::target::image ||
           AccessTarget == access::target::image_array;
  }

  constexpr static bool
  isValidTargetForExplicitOp(access::target AccessTarget) {
    return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
  }

  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::read ||
           AccessMode == access::mode::read_write;
  }

  constexpr static bool
  isValidModeForDestinationAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::write ||
           AccessMode == access::mode::read_write ||
           AccessMode == access::mode::discard_write ||
           AccessMode == access::mode::discard_read_write;
  }

  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
    using type = typename std::conditional<
        std::is_convertible<nd_item<Dims>, LambdaArgType>::value, nd_item<Dims>,
        typename std::conditional<
            std::is_convertible<item<Dims>, LambdaArgType>::value, item<Dims>,
            LambdaArgType>::type>::type;
  };
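
  // Illustrative sketch (not part of the original header): the transformation
  // prefers nd_item, then item, and otherwise keeps the user-declared type:
  //
  //   static_assert(std::is_same<TransformUserItemType<1, nd_item<1>>::type,
  //                              nd_item<1>>::value, "");
  //   static_assert(std::is_same<TransformUserItemType<1, item<1>>::type,
  //                              item<1>>::value, "");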

  /// Defines and invokes a SYCL kernel function for the specified range.
  ///
  /// The SYCL kernel function is defined as a lambda function or a named
  /// function object type and is given an item for indexing in the indexing
  /// space defined by \p NumWorkItems.
  template <typename KernelName, typename KernelType, int Dims>
  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
                                KernelType KernelFunc) {
    throwIfActionIsCreated();
    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;

    // If a 1D kernel argument is an integral type, convert it to sycl::item<1>.
    // If the user type is convertible from sycl::item/sycl::nd_item, use
    // sycl::item/sycl::nd_item to transport the item information.
    using TransformedArgType = typename std::conditional<
        std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
        typename TransformUserItemType<Dims, LambdaArgType>::type>::type;

    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;

    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());

    // Range rounding can be disabled by the user.
    // Range rounding is not done on the host device.
    // Range rounding is supported only for newer SYCL standards.
#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) &&                  \
    !defined(DPCPP_HOST_DEVICE_OPENMP) &&                                      \
    !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
    // The range should be a multiple of this for reasonable performance.
    size_t MinFactorX = 16;
    // The range should be a multiple of this for improved performance.
    size_t GoodFactorX = 32;
    // The range should be at least this large to make rounding worthwhile.
    size_t MinRangeX = 1024;

    // Check if rounding parameters have been set through the environment:
    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
    this->GetRangeRoundingSettings(MinFactorX, GoodFactorX, MinRangeX);

    // Disable the rounding-up optimizations under these conditions:
    // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
    // 2. The kernel is provided via an interoperability method.
    // 3. The range is already a multiple of the rounding factor.
    //
    // Cases 2 and 3 could be supported with extra effort.
    // As an optimization for the common case it is an
    // implementation choice to not support those scenarios.
    // Note that "this_item" is a free function, i.e. not tied to any
    // specific id or item. When concurrent parallel_fors are executing
    // on a device it is difficult to tell which parallel_for the call is
    // being made from. One could replicate portions of the
    // call-graph to make this_item calls kernel-specific but this is
    // not considered worthwhile.

    // Get the kernel name to check condition 2.
    std::string KName = typeid(NameT *).name();
    using KI = detail::KernelInfo<KernelName>;
    bool DisableRounding =
        this->DisableRangeRounding() ||
        (KI::getName() == nullptr || KI::getName()[0] == '\0');

    // Perform range rounding if rounding-up is enabled,
    // there are sufficient work-items to need rounding,
    // and the user-specified range is not a multiple of a "good" value.
    if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
        (NumWorkItems[0] % MinFactorX != 0)) {
      // It is sufficient to round up just the first dimension.
      // Multiplying the rounded-up value of the first dimension
      // by the values of the remaining dimensions (if any)
      // will yield a rounded-up value for the total range.
      size_t NewValX =
          ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
      if (this->RangeRoundingTrace())
        std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
                  << " to " << NewValX << std::endl;

      using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
      auto Wrapper =
          getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
              KernelFunc, NumWorkItems);

      using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
                                       decltype(Wrapper), NameWT>;

      range<Dims> AdjustedRange = NumWorkItems;
      AdjustedRange.set_range_dim0(NewValX);
      kernel_parallel_for_wrapper<KName, TransformedArgType>(Wrapper);
#ifndef __SYCL_DEVICE_ONLY__
      detail::checkValueRange<Dims>(AdjustedRange);
      MNDRDesc.set(std::move(AdjustedRange));
      StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
          std::move(Wrapper));
      setType(detail::CG::Kernel);
#endif
    } else
#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
       // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
       // SYCL_LANGUAGE_VERSION >= 202001
    {
      (void)NumWorkItems;
      kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
      detail::checkValueRange<Dims>(NumWorkItems);
      MNDRDesc.set(std::move(NumWorkItems));
      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
          std::move(KernelFunc));
      setType(detail::CG::Kernel);
#endif
    }
  }
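
  // Worked example (illustrative, not part of the original header): with the
  // defaults MinFactorX = 16, GoodFactorX = 32 and MinRangeX = 1024, a 1-D
  // range of 1000 items is below MinRangeX and stays untouched, while 1030
  // items are rounded up to ((1030 + 31) / 32) * 32 = 1056; the
  // RoundedRangeKernel wrapper then masks off the 26 extra work-items at
  // run time.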

  /// Defines and invokes a SYCL kernel for the specified range.
  ///
  /// \param NumWorkItems is a range defining the indexing space.
  /// \param Kernel is a SYCL kernel object.
  template <int Dims>
  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
    throwIfActionIsCreated();
    verifyKernelInvoc(Kernel);
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    detail::checkValueRange<Dims>(NumWorkItems);
    MNDRDesc.set(std::move(NumWorkItems));
    setType(detail::CG::Kernel);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }

#ifdef SYCL_LANGUAGE_VERSION
#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
#else
#define __SYCL_KERNEL_ATTR__
#endif
  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task(KernelType KernelFunc) {
#else
  kernel_single_task(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc();
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_single_task(const KernelType &KernelFunc, kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the name of these functions - "kernel_parallel_for" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for(KernelType KernelFunc) {
#else
  kernel_parallel_for(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of these functions - "kernel_parallel_for" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_parallel_for(const KernelType &KernelFunc, kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_parallel_for_work_group(const KernelType &KernelFunc,
                                 kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // Wrappers for the kernel_*** functions above, with and without support for
  // an additional kernel_handler argument.

  // NOTE: to support a kernel_handler argument in kernel lambdas, only the
  // kernel_***_wrapper functions must be called in this code.

  // Wrappers for kernel_single_task(...)

  template <typename KernelName, typename KernelType>
  std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task_wrapper(KernelType KernelFunc) {
#else
  kernel_single_task_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_handler KH;
    kernel_single_task<KernelName>(KernelFunc, KH);
  }

  template <typename KernelName, typename KernelType>
  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task_wrapper(KernelType KernelFunc) {
#else
  kernel_single_task_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_single_task<KernelName>(KernelFunc);
  }

  // Wrappers for kernel_parallel_for(...)

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_handler KH;
    kernel_parallel_for<KernelName, ElementType>(KernelFunc, KH);
  }

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      !detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_parallel_for<KernelName, ElementType>(KernelFunc);
  }

  // Wrappers for kernel_parallel_for_work_group(...)

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_handler KH;
    kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc, KH);
  }

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      !detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc);
  }

  std::shared_ptr<detail::handler_impl> getHandlerImpl() const;

  void setStateExplicitKernelBundle();
  void setStateSpecConstSet();
  bool isStateExplicitKernelBundle() const;

  std::shared_ptr<detail::kernel_bundle_impl>
  getOrInsertHandlerKernelBundle(bool Insert) const;

  void setHandlerKernelBundle(
      const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);

  template <typename FuncT>
  detail::enable_if_t<
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void()>::value ||
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void(interop_handle)>::value>
  host_task_impl(FuncT &&Func) {
    throwIfActionIsCreated();

    MNDRDesc.set(range<1>(1));
    MArgs = std::move(MAssociatedAccesors);

    MHostTask.reset(new detail::HostTask(std::move(Func)));

    setType(detail::CG::CodeplayHostTask);
  }

public:
  handler(const handler &) = delete;
  handler(handler &&) = delete;
  handler &operator=(const handler &) = delete;
  handler &operator=(handler &&) = delete;

#if __cplusplus > 201402L
  template <auto &SpecName>
  void set_specialization_constant(
      typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {

    setStateSpecConstSet();

    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/true);

    detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
        KernelBundleImplPtr)
        .set_specialization_constant<SpecName>(Value);
  }

  template <auto &SpecName>
  typename std::remove_reference_t<decltype(SpecName)>::value_type
  get_specialization_constant() const {

    if (isStateExplicitKernelBundle())
      throw sycl::exception(make_error_code(errc::invalid),
                            "Specialization constants cannot be read after "
                            "explicitly setting the used kernel bundle");

    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/true);

    return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
               KernelBundleImplPtr)
        .get_specialization_constant<SpecName>();
  }

#endif
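
  // Usage sketch (illustrative, not part of the original header); 'MyConst'
  // is a hypothetical specialization constant and 'Q' a hypothetical queue:
  //
  //   constexpr specialization_id<int> MyConst{42};
  //   Q.submit([&](handler &CGH) {
  //     CGH.set_specialization_constant<MyConst>(7);
  //     CGH.single_task<class UseConst>([=](kernel_handler KH) {
  //       int V = KH.get_specialization_constant<MyConst>();
  //     });
  //   });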

  void
  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);

  /// Requires access to the memory object associated with the placeholder
  /// accessor. The command group has a requirement to gain access to the
  /// given memory object before executing.
  template <typename DataT, int Dims, access::mode AccMode,
            access::target AccTarget>
  void
  require(accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t>
              Acc) {
#ifndef __SYCL_DEVICE_ONLY__
    associateWithHandler(&Acc, AccTarget);
#else
    (void)Acc;
#endif
  }
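
  // Usage sketch (illustrative, not part of the original header): binding a
  // placeholder accessor to this command group; 'Buf' is a hypothetical
  // buffer<int, 1> and 'Q' a hypothetical queue:
  //
  //   accessor<int, 1, access::mode::read_write, access::target::device,
  //            access::placeholder::true_t> PAcc(Buf);
  //   Q.submit([&](handler &CGH) {
  //     CGH.require(PAcc);
  //     CGH.single_task<class UsePAcc>([=] { PAcc[0] += 1; });
  //   });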

  /// Registers an event dependency on this command group.
  void depends_on(event Event);

  /// Registers event dependencies on this command group.
  void depends_on(const std::vector<event> &Events);
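
  // Usage sketch (illustrative, not part of the original header): making this
  // command group wait for a previously recorded event; 'Q' is hypothetical:
  //
  //   event E = Q.submit([&](handler &CGH) { /* producer */ });
  //   Q.submit([&](handler &CGH) {
  //     CGH.depends_on(E); // consumer starts only after E completes
  //     // ... enqueue the consumer kernel ...
  //   });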

  template <typename T>
  using remove_cv_ref_t =
      typename detail::remove_cv_t<detail::remove_reference_t<T>>;

  template <typename U, typename T>
  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;

  template <typename T> struct ShouldEnableSetArg {
    static constexpr bool value =
        std::is_trivially_copyable<detail::remove_reference_t<T>>::value
#if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
            && std::is_standard_layout<detail::remove_reference_t<T>>::value
#endif
        || is_same_type<sampler, T>::value // Sampler
        || (!is_same_type<cl_mem, T>::value &&
            std::is_pointer<remove_cv_ref_t<T>>::value) // USM
        || is_same_type<cl_mem, T>::value;              // Interop
  };

  /// Sets an argument for OpenCL interoperability kernels.
  ///
  /// Registers \p Arg passed as the argument with index \p ArgIndex.
  template <typename T>
  typename detail::enable_if_t<ShouldEnableSetArg<T>::value, void>
  set_arg(int ArgIndex, T &&Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  void
  set_arg(int ArgIndex,
          accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  /// Sets arguments for OpenCL interoperability kernels, starting from
  /// index 0.
  template <typename... Ts> void set_args(Ts &&... Args) {
    setArgsHelper(0, std::move(Args)...);
  }
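
  // Usage sketch (illustrative, not part of the original header): forwarding
  // arguments to an interoperability kernel in positional order;
  // 'PrebuiltKernel' is a hypothetical sycl::kernel object:
  //
  //   Q.submit([&](handler &CGH) {
  //     CGH.set_args(Acc, 42, 3.14f); // becomes arguments 0, 1 and 2
  //     CGH.parallel_for(range<1>{N}, PrebuiltKernel);
  //   });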

  /// Defines and invokes a SYCL kernel function as a function object type.
  ///
  /// If it is a named function object and the function object type is
  /// globally visible, there is no need for the developer to provide
  /// a kernel name for it.
  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void single_task(KernelType KernelFunc) {
#else
  void single_task(const KernelType &KernelFunc) {
#endif
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    kernel_single_task_wrapper<NameT>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    // No need to check whether the range exceeds INT_MAX, as it is a
    // compile-time known constant.
    MNDRDesc.set(range<1>{1});

    StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(KernelFunc);
    setType(detail::CG::Kernel);
#endif
  }
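
  // Usage sketch (illustrative, not part of the original header); 'FlagPtr'
  // is a hypothetical USM pointer:
  //
  //   Q.submit([&](handler &CGH) {
  //     CGH.single_task<class InitFlag>([=] { FlagPtr[0] = 1; });
  //   });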

  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) {
#else
  void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) {
#endif
    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
  }

  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) {
#else
  void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) {
#endif
    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
  }

  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) {
#else
  void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) {
#endif
    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
  }
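
  // Usage sketch (illustrative, not part of the original header): all three
  // overloads above forward to parallel_for_lambda_impl; 'Data' is a
  // hypothetical USM pointer:
  //
  //   Q.submit([&](handler &CGH) {
  //     CGH.parallel_for<class Scale2D>(range<2>{Rows, Cols},
  //                                     [=](item<2> It) {
  //       Data[It.get_linear_id()] *= 2;
  //     });
  //   });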

  /// Defines and invokes a SYCL kernel on the host device.
  template <typename FuncT>
  __SYCL2020_DEPRECATED(
      "run_on_host_intel() is deprecated, use host_task() instead")
  void run_on_host_intel(FuncT Func) {
    throwIfActionIsCreated();
    // No need to check whether the range exceeds INT_MAX, as it is a
    // compile-time known constant.
    MNDRDesc.set(range<1>{1});

    MArgs = std::move(MAssociatedAccesors);
    MHostKernel.reset(new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
    setType(detail::CG::RunOnHostIntel);
  }

  /// Enqueues a command to the SYCL runtime to invoke \p Func once.
  template <typename FuncT>
  detail::enable_if_t<
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void()>::value ||
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void(interop_handle)>::value>
  host_task(FuncT &&Func) {
    host_task_impl(Func);
  }
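
  // Usage sketch (illustrative, not part of the original header): the functor
  // may take either no arguments or an interop_handle; all names below are
  // hypothetical:
  //
  //   Q.submit([&](handler &CGH) {
  //     CGH.host_task([=] { ProcessOnHost(HostPtr); }); // runs on the host
  //   });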

// replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
// or const KernelType &KernelFunc
#ifdef __SYCL_NONCONST_FUNCTOR__
#define _KERNELFUNCPARAM(a) KernelType a
#else
#define _KERNELFUNCPARAM(a) const KernelType &a
#endif

  /// Defines and invokes a SYCL kernel function for the specified range and
  /// offset.
  ///
  /// \param NumWorkItems is a range defining the indexing space.
  /// \param WorkItemOffset is an offset applied to each work-item index.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                    _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
    (void)NumWorkItems;
    (void)WorkItemOffset;
    kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
    MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif
  }

  /// Defines and invokes a SYCL kernel function for the specified nd_range.
  ///
  /// \param ExecutionRange is the global and local execution range.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  void parallel_for(nd_range<Dims> ExecutionRange,
                    _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
    // If the user type is convertible from sycl::item/sycl::nd_item, use
    // sycl::item/sycl::nd_item to transport the item information.
    using TransformedArgType =
        typename TransformUserItemType<Dims, LambdaArgType>::type;
    (void)ExecutionRange;
    kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(ExecutionRange);
    MNDRDesc.set(std::move(ExecutionRange));
    StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
        std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif
  }
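
  // Usage sketch (illustrative, not part of the original header): an explicit
  // global/local decomposition; 'In' and 'Out' are hypothetical USM pointers:
  //
  //   Q.submit([&](handler &CGH) {
  //     CGH.parallel_for<class Blocked>(
  //         nd_range<1>{range<1>{1024}, range<1>{64}}, // 16 groups of 64
  //         [=](nd_item<1> It) {
  //           Out[It.get_global_id(0)] = In[It.get_global_id(0)];
  //         });
  //   });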

  /// Defines and invokes a SYCL kernel function for the specified range,
  /// performing the reduction described by \p Redu.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  void parallel_for(range<Dims> Range, Reduction Redu,
                    _KERNELFUNCPARAM(KernelFunc)) {
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;

    // Before running the kernels, check that the device has enough local
    // memory to hold the local arrays required for the tree-reduction
    // algorithm.
    constexpr bool IsTreeReduction =
        !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
    size_t OneElemSize =
        IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
    uint32_t NumConcurrentWorkGroups =
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
        __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
#else
        ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
#endif
    // TODO: currently the maximal work group size is determined for the given
    // queue/device, while it is safer to use queries to the kernel
    // pre-compiled for the device.
    size_t MaxWGSize =
        ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
    ext::oneapi::detail::reduCGFunc<KernelName>(
        *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
    if (Reduction::is_usm ||
        (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
        (!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
      this->finalize();
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                                    Redu);
      MLastEvent = CopyHandler.finalize();
    }
  }
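
  // Usage sketch (illustrative, not part of the original header): summing an
  // array into the USM location 'SumPtr' (hypothetical; the exact spelling of
  // the reduction factory may differ in this version of the extension). The
  // reducer reference arrives as the second kernel argument:
  //
  //   Q.submit([&](handler &CGH) {
  //     auto Redu = ext::oneapi::reduction(SumPtr, ext::oneapi::plus<int>());
  //     CGH.parallel_for<class SumAll>(range<1>{N}, Redu,
  //                                    [=](item<1> It, auto &Sum) {
  //       Sum += In[It.get_linear_id()];
  //     });
  //   });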

  /// Implements parallel_for() accepting an nd_range \p Range and one
  /// reduction object \p Redu whose type has the has_fast_atomics property.
  //
  // If the reduction variable must be initialized with the identity value
  // before the kernel run, then an additional working accessor is created,
  // initialized with the identity value and used in the kernel. That working
  // accessor is then copied to the user's accessor or USM pointer after
  // the kernel run.
  // For USM pointers without the initialize_to_identity property the same
  // scheme with a working accessor is used, as re-using the user's USM
  // pointer in the kernel would require creating another variant of the
  // user's kernel, which does not seem efficient.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  detail::enable_if_t<Reduction::has_fast_atomics>
  parallel_for(nd_range<Dims> Range, Reduction Redu,
               _KERNELFUNCPARAM(KernelFunc)) {
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);

    if (Reduction::is_usm || Redu.initializeToIdentity()) {
      this->finalize();
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                                    Redu);
      MLastEvent = CopyHandler.finalize();
    }
  }
1653 
1660  //
1661  // If the reduction variable must be initialized with the identity value
1662  // before the kernel run, then an additional working accessor is created,
1663  // initialized with the identity value and used in the kernel. That working
1664  // accessor is then copied to user's accessor or USM pointer after
1665  // the kernel run.
1666  // For USM pointers without initialize_to_identity properties the same scheme
1667  // with working accessor is used as re-using user's USM pointer in the kernel
1668  // would require creation of another variant of user's kernel, which does not
1669  // seem efficient.
1670  template <typename KernelName = detail::auto_name, typename KernelType,
1671  int Dims, typename Reduction>
1673  parallel_for(nd_range<Dims> Range, Reduction Redu,
1675 
1676  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1678 
1679  if (D.has(aspect::atomic64)) {
1680 
1681  ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1682  Range, Redu);
1683 
1684  if (Reduction::is_usm || Redu.initializeToIdentity()) {
1685  this->finalize();
1686  handler CopyHandler(QueueCopy, MIsHost);
1687  CopyHandler.saveCodeLoc(MCodeLoc);
1688  ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1689  CopyHandler, Redu);
1690  MLastEvent = CopyHandler.finalize();
1691  }
1692  } else {
1693  parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1694  }
1695  }

  /// Defines and invokes a SYCL kernel function for the specified nd_range
  /// with a reduction that has neither fast atomics nor 64-bit atomic adds;
  /// it is handled by the tree-reduction implementation below.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  detail::enable_if_t<!Reduction::has_fast_atomics &&
                      !Reduction::has_atomic_add_float64>
  parallel_for(nd_range<Dims> Range, Reduction Redu,
               _KERNELFUNCPARAM(KernelFunc)) {

    parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
  }

  template <typename KernelName, typename KernelType, int Dims,
            typename Reduction>
  detail::enable_if_t<!Reduction::has_fast_atomics>
  parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
                    KernelType KernelFunc) {
    // This parallel_for() is lowered to the following sequence:
    // 1) Call a kernel that a) calls the user's lambda function and
    //    b) performs one iteration of reduction, storing the partial
    //    reductions/sums to either a newly created global buffer or to the
    //    user's reduction accessor. So, if the original 'Range' has in total
    //    N1 elements and the work-group size is W, then after the first
    //    iteration there will be N2 partial sums where N2 = N1 / W.
    //    If (N2 == 1) then the partial sum is written to the user's accessor.
    //    Otherwise, a new global buffer is created and the partial sums are
    //    written to it.
    // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
    //    necessary to reduce all partial sums into one final sum.
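
    // Worked example (illustrative): for a global size N1 = 4096 and a
    // work-group size W = 64, step 1 produces N2 = 4096 / 64 = 64 partial
    // sums; a single aux-kernel pass over 64 work-items then reduces them
    // to the one final sum.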
1738 
1739  // Before running the kernels, check that device has enough local memory
1740  // to hold local arrays that may be required for the reduction algorithm.
1741  // TODO: If the work-group-size is limited by the local memory, then
1742  // a special version of the main kernel may be created. The one that would
1743  // not use local accessors, which means it would not do the reduction in
1744  // the main kernel, but simply generate Range.get_global_range.size() number
1745  // of partial sums, leaving the reduction work to the additional/aux
1746  // kernels.
1747  constexpr bool HFR = Reduction::has_fast_reduce;
1748  size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1749  // TODO: currently the maximal work group size is determined for the given
1750  // queue/device, while it may be safer to use queries to the kernel compiled
1751  // for the device.
1752  size_t MaxWGSize =
1753  ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1754  if (Range.get_local_range().size() > MaxWGSize)
1755  throw sycl::runtime_error("The implementation handling parallel_for with"
1756  " reduction requires work group size not bigger"
1757  " than " +
1758  std::to_string(MaxWGSize),
1760 
1761  // 1. Call the kernel that includes user's lambda function.
1762  ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1763  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1764  this->finalize();
1765 
1766  // 2. Run the additional kernel as many times as needed to reduce
1767  // all partial sums into one scalar.
1768 
1769  // TODO: Create a special slow/sequential version of the kernel that would
1770  // handle the reduction instead of throwing the error below.
1771  if (MaxWGSize <= 1)
1772  throw sycl::runtime_error("The implementation handling parallel_for with "
1773  "reduction requires the maximal work group "
1774  "size to be greater than 1 to converge. "
1775  "The maximal work group size depends on the "
1776  "device and the size of the objects passed to "
1777  "the reduction.",
1778  PI_INVALID_WORK_GROUP_SIZE);
1779  size_t NWorkItems = Range.get_group_range().size();
1780  while (NWorkItems > 1) {
1781  handler AuxHandler(QueueCopy, MIsHost);
1782  AuxHandler.saveCodeLoc(MCodeLoc);
1783 
1784  NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1785  AuxHandler, NWorkItems, MaxWGSize, Redu);
1786  MLastEvent = AuxHandler.finalize();
1787  } // end while (NWorkItems > 1)
1788 
1789  if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
1790  handler CopyHandler(QueueCopy, MIsHost);
1791  CopyHandler.saveCodeLoc(MCodeLoc);
1792  ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1793  Redu);
1794  MLastEvent = CopyHandler.finalize();
1795  }
1796  }
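  // Hedged usage sketch for the reduction overloads above (illustrative only,
  // not part of this header; the queue 'Q', pointer 'Sum' and kernel name
  // 'SumKernel' are assumptions, and 'ext::oneapi::reduction' together with
  // 'ext::oneapi::plus' are assumed to be the matching public entry points in
  // this version):
  //
  //   #include <CL/sycl.hpp>
  //   int main() {
  //     sycl::queue Q;
  //     float *Sum = sycl::malloc_shared<float>(1, Q);
  //     *Sum = 0.0f;
  //     Q.submit([&](sycl::handler &CGH) {
  //       CGH.parallel_for<class SumKernel>(
  //           sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}},
  //           sycl::ext::oneapi::reduction(Sum, sycl::ext::oneapi::plus<float>()),
  //           [=](sycl::nd_item<1> It, auto &Red) { Red += 1.0f; });
  //     }).wait();
  //     // *Sum is expected to be 1024.0f here.
  //     sycl::free(Sum, Q);
  //   }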
1797 
1798  // This version of parallel_for may handle one or more reductions packed
1799  // into the \p Rest argument. Note, though, that the last element of the
1800  // \p Rest pack is the kernel function.
1801  // TODO: this variant is currently enabled for 2+ reductions only as the
1802  // versions handling 1 reduction variable are more efficient right now.
1803  //
1804  // Algorithm:
1805  // 1) discard_write accessor (DWAcc), InitializeToIdentity = true:
1806  // a) Create uninitialized buffer and read_write accessor (RWAcc).
1807  // b) discard-write partial sums to RWAcc.
1808  // c) Repeat the steps (a) and (b) to get one final sum.
1809  // d) Copy RWAcc to DWAcc.
1810  // 2) read_write accessor (RWAcc), InitializeToIdentity = false:
1811  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1812  // re-use user's RWAcc (if #work-groups is 1).
1813  // b) discard-write to RWAcc (#WG > 1), or update-write (#WG == 1).
1814  // c) Repeat the steps (a) and (b) to get one final sum.
1815  // 3) read_write accessor (RWAcc), InitializeToIdentity = true:
1816  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1817  // re-use user's RWAcc (if #work-groups is 1).
1818  // b) discard-write to RWAcc.
1819  // c) Repeat the steps (a) and (b) to get one final sum.
1820  // 4) USM pointer, InitializeToIdentity = false:
1821  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1822  // re-use user's USM pointer (if #work-groups is 1).
1823  // b) discard-write to RWAcc (#WG > 1) or
1824  // update-write to USM pointer (#WG == 1).
1825  // c) Repeat the steps (a) and (b) to get one final sum.
1826  // 5) USM pointer, InitializeToIdentity = true:
1827  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1828  // re-use user's USM pointer (if #work-groups is 1).
1829  // b) discard-write to RWAcc (#WG > 1) or
1830  // discard-write to USM pointer (#WG == 1).
1831  // c) Repeat the steps (a) and (b) to get one final sum.
1832  template <typename KernelName = detail::auto_name, int Dims,
1833  typename... RestT>
1834  std::enable_if_t<
1835  (sizeof...(RestT) >= 3 &&
1836  ext::oneapi::detail::AreAllButLastReductions<RestT...>::value)>
1837  parallel_for(nd_range<Dims> Range, RestT... Rest) {
1838  std::tuple<RestT...> ArgsTuple(Rest...);
1839  constexpr size_t NumArgs = sizeof...(RestT);
1840  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
1841  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
1842  auto ReduTuple =
1843  ext::oneapi::detail::tuple_select_elements(ArgsTuple, ReduIndices);
1844 
1845  size_t LocalMemPerWorkItem =
1846  ext::oneapi::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
1847  // TODO: currently the maximal work-group size is determined for the given
1848  // queue/device, while it may be safer to query the kernel compiled for
1849  // the device.
1850  size_t MaxWGSize =
1851  ext::oneapi::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
1852  if (Range.get_local_range().size() > MaxWGSize)
1853  throw sycl::runtime_error("The implementation handling parallel_for with"
1854  " reduction requires work group size not bigger"
1855  " than " +
1856  std::to_string(MaxWGSize),
1857  PI_INVALID_WORK_GROUP_SIZE);
1858 
1859  ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1860  ReduTuple, ReduIndices);
1861  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1862  this->finalize();
1863 
1864  size_t NWorkItems = Range.get_group_range().size();
1865  while (NWorkItems > 1) {
1866  handler AuxHandler(QueueCopy, MIsHost);
1867  AuxHandler.saveCodeLoc(MCodeLoc);
1868 
1869  NWorkItems =
1870  ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
1871  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1872  MLastEvent = AuxHandler.finalize();
1873  } // end while (NWorkItems > 1)
1874 
1875  auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
1876  QueueCopy, MIsHost, ReduTuple, ReduIndices);
1877  if (CopyEvent)
1878  MLastEvent = *CopyEvent;
1879  }
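  // Hedged usage sketch for the packed-reductions overload above (illustrative
  // only; 'Q', 'Sum' and 'Max' are assumed USM allocations, and the
  // ext::oneapi reduction/functional names are assumed for this version).
  // Note that the kernel function comes last, after the reduction objects:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class SumAndMax>(
  //         sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}},
  //         sycl::ext::oneapi::reduction(Sum, sycl::ext::oneapi::plus<float>()),
  //         sycl::ext::oneapi::reduction(Max, sycl::ext::oneapi::maximum<float>()),
  //         [=](sycl::nd_item<1> It, auto &S, auto &M) {
  //           float V = static_cast<float>(It.get_global_linear_id());
  //           S += V;
  //           M.combine(V);
  //         });
  //   });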
1880 
1891  template <typename KernelName = detail::auto_name, typename KernelType,
1892  int Dims>
1893  void parallel_for_work_group(range<Dims> NumWorkGroups,
1894  _KERNELFUNCPARAM(KernelFunc)) {
1895  throwIfActionIsCreated();
1896  using NameT =
1897  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
1898  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1899  using LambdaArgType =
1900  detail::lambda_arg_type<KernelType, group<Dims>>;
1901  (void)NumWorkGroups;
1902  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1903 #ifndef __SYCL_DEVICE_ONLY__
1904  detail::checkValueRange<Dims>(NumWorkGroups);
1905  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1906  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1907  setType(detail::CG::Kernel);
1908 #endif // __SYCL_DEVICE_ONLY__
1909  }
1910 
1923  template <typename KernelName = detail::auto_name, typename KernelType,
1924  int Dims>
1925  void parallel_for_work_group(range<Dims> NumWorkGroups,
1926  range<Dims> WorkGroupSize,
1927  _KERNELFUNCPARAM(KernelFunc)) {
1928  throwIfActionIsCreated();
1929  using NameT =
1930  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
1931  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1932  using LambdaArgType =
1933  detail::lambda_arg_type<KernelType, group<Dims>>;
1934  (void)NumWorkGroups;
1935  (void)WorkGroupSize;
1936  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1937 #ifndef __SYCL_DEVICE_ONLY__
1938  nd_range<Dims> ExecRange =
1939  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1940  detail::checkValueRange<Dims>(ExecRange);
1941  MNDRDesc.set(std::move(ExecRange));
1942  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1943  setType(detail::CG::Kernel);
1944 #endif // __SYCL_DEVICE_ONLY__
1945  }
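  // Hedged usage sketch of the hierarchical API above (illustrative only;
  // 'Q' is an assumed queue): launch 8 work-groups of 16 work-items each.
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group<class HierKernel>(
  //         sycl::range<1>{8}, sycl::range<1>{16}, [=](sycl::group<1> G) {
  //           // Runs once per work-group.
  //           G.parallel_for_work_item([&](sycl::h_item<1> It) {
  //             // Runs once per work-item of the group.
  //           });
  //         });
  //   });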
1946 
1953  void single_task(kernel Kernel) {
1954  throwIfActionIsCreated();
1955  verifyKernelInvoc(Kernel);
1956  // Ignore any set kernel bundles and use the one associated with the kernel
1957  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
1958  // No need to check if the range is out of INT_MAX limits as it's a
1959  // compile-time-known constant.
1960  MNDRDesc.set(range<1>{1});
1961  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1962  setType(detail::CG::Kernel);
1963  extractArgsAndReqs();
1964  MKernelName = getKernelName();
1965  }
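  // Hedged usage sketch (illustrative only, not from this header): running a
  // precompiled interop kernel. 'ClKernel' (a cl_kernel whose arguments are
  // set via set_args() below), 'Ctx' (its matching sycl::context) and 'Q' are
  // assumptions:
  //
  //   sycl::kernel K{ClKernel, Ctx};
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_args(/* kernel arguments, if any */);
  //     CGH.single_task(K);
  //   });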
1966 
1967  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
1968  parallel_for_impl(NumWorkItems, Kernel);
1969  }
1970 
1971  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
1972  parallel_for_impl(NumWorkItems, Kernel);
1973  }
1974 
1975  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
1976  parallel_for_impl(NumWorkItems, Kernel);
1977  }
1978 
1987  template <int Dims>
1988  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
1989  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
1990  kernel Kernel) {
1991  throwIfActionIsCreated();
1992  verifyKernelInvoc(Kernel);
1993  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1994  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1995  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1996  setType(detail::CG::Kernel);
1997  extractArgsAndReqs();
1998  MKernelName = getKernelName();
1999  }
2000 
2009  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
2010  throwIfActionIsCreated();
2011  verifyKernelInvoc(Kernel);
2012  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2013  detail::checkValueRange<Dims>(NDRange);
2014  MNDRDesc.set(std::move(NDRange));
2015  setType(detail::CG::Kernel);
2016  extractArgsAndReqs();
2017  MKernelName = getKernelName();
2018  }
2019 
2026  template <typename KernelName = detail::auto_name, typename KernelType>
2027  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
2028  throwIfActionIsCreated();
2029  // Ignore any set kernel bundles and use the one associated with the kernel
2030  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2031  using NameT =
2032  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
2033  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2034  (void)Kernel;
2035  kernel_single_task<NameT>(KernelFunc);
2036 #ifndef __SYCL_DEVICE_ONLY__
2037  // No need to check if the range is out of INT_MAX limits as it's a
2038  // compile-time-known constant.
2039  MNDRDesc.set(range<1>{1});
2040  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2041  setType(detail::CG::Kernel);
2042  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2043  extractArgsAndReqs();
2044  MKernelName = getKernelName();
2045  } else
2046  StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(std::move(KernelFunc));
2047 #else
2048  detail::CheckDeviceCopyable<KernelType>();
2049 #endif
2050  }
2051 
2055  template <typename FuncT>
2056  __SYCL_DEPRECATED("interop_task() is deprecated, use host_task() instead")
2057  void interop_task(FuncT Func) {
2058 
2059  MInteropTask.reset(new detail::InteropTask(std::move(Func)));
2060  setType(detail::CG::CodeplayInteropTask);
2061  }
2062 
2070  template <typename KernelName = detail::auto_name, typename KernelType,
2071  int Dims>
2072  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2073  _KERNELFUNCPARAM(KernelFunc)) {
2074  throwIfActionIsCreated();
2075  // Ignore any set kernel bundles and use the one associated with the kernel
2076  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2077  using NameT =
2078  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
2079  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2080  using LambdaArgType = detail::lambda_arg_type<KernelType, item<Dims>>;
2081  (void)Kernel;
2082  (void)NumWorkItems;
2083  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2084 #ifndef __SYCL_DEVICE_ONLY__
2085  detail::checkValueRange<Dims>(NumWorkItems);
2086  MNDRDesc.set(std::move(NumWorkItems));
2087  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2088  setType(detail::CG::Kernel);
2089  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2090  extractArgsAndReqs();
2091  MKernelName = getKernelName();
2092  } else
2093  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2094  std::move(KernelFunc));
2095 #endif
2096  }
2097 
2107  template <typename KernelName = detail::auto_name, typename KernelType,
2108  int Dims>
2109  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2110  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2111  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2112  throwIfActionIsCreated();
2113  // Ignore any set kernel bundles and use the one associated with the kernel
2114  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2115  using NameT =
2116  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
2117  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2118  using LambdaArgType = detail::lambda_arg_type<KernelType, item<Dims>>;
2119  (void)Kernel;
2120  (void)NumWorkItems;
2121  (void)WorkItemOffset;
2122  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2123 #ifndef __SYCL_DEVICE_ONLY__
2124  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2125  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2126  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2127  setType(detail::CG::Kernel);
2128  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2129  extractArgsAndReqs();
2130  MKernelName = getKernelName();
2131  } else
2132  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2133  std::move(KernelFunc));
2134 #endif
2135  }
2136 
2146  template <typename KernelName = detail::auto_name, typename KernelType,
2147  int Dims>
2148  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2149  _KERNELFUNCPARAM(KernelFunc)) {
2150  throwIfActionIsCreated();
2151  // Ignore any set kernel bundles and use the one associated with the kernel
2152  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2153  using NameT =
2154  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
2155  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2156  using LambdaArgType =
2157  detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2158  (void)Kernel;
2159  (void)NDRange;
2160  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2161 #ifndef __SYCL_DEVICE_ONLY__
2162  detail::checkValueRange<Dims>(NDRange);
2163  MNDRDesc.set(std::move(NDRange));
2164  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2165  setType(detail::CG::Kernel);
2166  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2167  extractArgsAndReqs();
2168  MKernelName = getKernelName();
2169  } else
2170  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2171  std::move(KernelFunc));
2172 #endif
2173  }
2174 
2188  template <typename KernelName = detail::auto_name, typename KernelType,
2189  int Dims>
2190  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2191  _KERNELFUNCPARAM(KernelFunc)) {
2192  throwIfActionIsCreated();
2193  // Ignore any set kernel bundles and use the one associated with the kernel
2194  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2195  using NameT =
2196  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
2197  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2198  using LambdaArgType =
2199  detail::lambda_arg_type<KernelType, group<Dims>>;
2200  (void)Kernel;
2201  (void)NumWorkGroups;
2202  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2203 #ifndef __SYCL_DEVICE_ONLY__
2204  detail::checkValueRange<Dims>(NumWorkGroups);
2205  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2206  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2207  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2208  setType(detail::CG::Kernel);
2209 #endif // __SYCL_DEVICE_ONLY__
2210  }
2211 
2227  template <typename KernelName = detail::auto_name, typename KernelType,
2228  int Dims>
2229  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2230  range<Dims> WorkGroupSize,
2231  _KERNELFUNCPARAM(KernelFunc)) {
2232  throwIfActionIsCreated();
2233  // Ignore any set kernel bundles and use the one associated with the kernel
2234  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2235  using NameT =
2236  typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
2237  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2238  using LambdaArgType =
2239  detail::lambda_arg_type<KernelType, group<Dims>>;
2240  (void)Kernel;
2241  (void)NumWorkGroups;
2242  (void)WorkGroupSize;
2243  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2244 #ifndef __SYCL_DEVICE_ONLY__
2245  nd_range<Dims> ExecRange =
2246  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2247  detail::checkValueRange<Dims>(ExecRange);
2248  MNDRDesc.set(std::move(ExecRange));
2249  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2250  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2251  setType(detail::CG::Kernel);
2252 #endif // __SYCL_DEVICE_ONLY__
2253  }
2254 
2255  // Clean up the _KERNELFUNCPARAM macro.
2256 #undef _KERNELFUNCPARAM
2257 
2258  // Explicit copy operations API
2259 
2267  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2268  access::target AccessTarget,
2269  access::placeholder IsPlaceholder = access::placeholder::false_t>
2270  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2271  std::shared_ptr<T_Dst> Dst) {
2272  throwIfActionIsCreated();
2273  static_assert(isValidTargetForExplicitOp(AccessTarget),
2274  "Invalid accessor target for the copy method.");
2275  static_assert(isValidModeForSourceAccessor(AccessMode),
2276  "Invalid accessor mode for the copy method.");
2277  // Make sure the data the shared_ptr points to is not released until we
2278  // finish working with it.
2279  MSharedPtrStorage.push_back(Dst);
2280  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2281  copy(Src, RawDstPtr);
2282  }
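  // Hedged usage sketch of the explicit copy above (illustrative only; the
  // queue 'Q' and the names below are assumptions): copy a buffer's contents
  // to host memory owned by a shared_ptr.
  //
  //   constexpr size_t N = 64;
  //   sycl::buffer<int, 1> Buf{sycl::range<1>{N}};
  //   std::shared_ptr<int> Host{new int[N], std::default_delete<int[]>()};
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read>(CGH);
  //     CGH.copy(Acc, Host);
  //   });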
2283 
2291  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2292  access::target AccessTarget,
2293  access::placeholder IsPlaceholder = access::placeholder::false_t>
2294  void
2295  copy(std::shared_ptr<T_Src> Src,
2296  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2297  throwIfActionIsCreated();
2298  static_assert(isValidTargetForExplicitOp(AccessTarget),
2299  "Invalid accessor target for the copy method.");
2300  static_assert(isValidModeForDestinationAccessor(AccessMode),
2301  "Invalid accessor mode for the copy method.");
2302  // Make sure the data the shared_ptr points to is not released until we
2303  // finish working with it.
2304  MSharedPtrStorage.push_back(Src);
2305  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2306  copy(RawSrcPtr, Dst);
2307  }
2308 
2316  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2317  access::target AccessTarget,
2318  access::placeholder IsPlaceholder = access::placeholder::false_t>
2319  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2320  T_Dst *Dst) {
2321  throwIfActionIsCreated();
2322  static_assert(isValidTargetForExplicitOp(AccessTarget),
2323  "Invalid accessor target for the copy method.");
2324  static_assert(isValidModeForSourceAccessor(AccessMode),
2325  "Invalid accessor mode for the copy method.");
2326 #ifndef __SYCL_DEVICE_ONLY__
2327  if (MIsHost) {
2328  // TODO: Temporary implementation for host. Should be handled by memory
2329  // manager.
2330  copyAccToPtrHost(Src, Dst);
2331  return;
2332  }
2333 #endif
2334  setType(detail::CG::CopyAccToPtr);
2335 
2337  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2338 
2339  MRequirements.push_back(AccImpl.get());
2340  MSrcPtr = static_cast<void *>(AccImpl.get());
2341  MDstPtr = static_cast<void *>(Dst);
2342  // Store a copy of the accessor in local storage to make sure it stays
2343  // alive until we finish.
2344  MAccStorage.push_back(std::move(AccImpl));
2345  }
2346 
2354  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2355  access::target AccessTarget,
2356  access::placeholder IsPlaceholder = access::placeholder::false_t>
2357  void
2358  copy(const T_Src *Src,
2359  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2360  throwIfActionIsCreated();
2361  static_assert(isValidTargetForExplicitOp(AccessTarget),
2362  "Invalid accessor target for the copy method.");
2363  static_assert(isValidModeForDestinationAccessor(AccessMode),
2364  "Invalid accessor mode for the copy method.");
2365 #ifndef __SYCL_DEVICE_ONLY__
2366  if (MIsHost) {
2367  // TODO: Temporary implementation for host. Should be handled by memory
2368  // manager.
2369  copyPtrToAccHost(Src, Dst);
2370  return;
2371  }
2372 #endif
2373  setType(detail::CG::CopyPtrToAcc);
2374 
2376  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2377 
2378  MRequirements.push_back(AccImpl.get());
2379  MSrcPtr = const_cast<T_Src *>(Src);
2380  MDstPtr = static_cast<void *>(AccImpl.get());
2381  // Store a copy of the accessor in local storage to make sure it stays
2382  // alive until we finish.
2383  MAccStorage.push_back(std::move(AccImpl));
2384  }
2385 
2393  template <
2394  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2395  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2396  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2397  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2398  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2399  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2400  IsPlaceholder_Src>
2401  Src,
2402  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2403  IsPlaceholder_Dst>
2404  Dst) {
2405  throwIfActionIsCreated();
2406  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2407  "Invalid source accessor target for the copy method.");
2408  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2409  "Invalid destination accessor target for the copy method.");
2410  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2411  "Invalid source accessor mode for the copy method.");
2412  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2413  "Invalid destination accessor mode for the copy method.");
2414  assert(Dst.get_size() >= Src.get_size() &&
2415  "The destination accessor does not fit the copied memory.");
2416  if (copyAccToAccHelper(Src, Dst))
2417  return;
2418  setType(detail::CG::CopyAccToAcc);
2419 
2420  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2421  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2422 
2423  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2424  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2425 
2426  MRequirements.push_back(AccImplSrc.get());
2427  MRequirements.push_back(AccImplDst.get());
2428  MSrcPtr = AccImplSrc.get();
2429  MDstPtr = AccImplDst.get();
2430  // Store copies of the accessors in local storage to make sure they stay
2431  // alive until we finish.
2432  MAccStorage.push_back(std::move(AccImplSrc));
2433  MAccStorage.push_back(std::move(AccImplDst));
2434  }
2435 
2440  template <typename T, int Dims, access::mode AccessMode,
2441  access::target AccessTarget,
2442  access::placeholder IsPlaceholder = access::placeholder::false_t>
2443  void
2444  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2445  throwIfActionIsCreated();
2446  static_assert(isValidTargetForExplicitOp(AccessTarget),
2447  "Invalid accessor target for the update_host method.");
2448  setType(detail::CG::UpdateHost);
2449 
2451  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2452 
2453  MDstPtr = static_cast<void *>(AccImpl.get());
2454  MRequirements.push_back(AccImpl.get());
2455  MAccStorage.push_back(std::move(AccImpl));
2456  }
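  // Hedged usage sketch (illustrative only; 'Q' and 'Buf' are assumptions):
  // ask the runtime to update the host copy of 'Buf' without an explicit map.
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
  //     CGH.update_host(Acc);
  //   });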
2457 
2466  template <typename T, int Dims, access::mode AccessMode,
2467  access::target AccessTarget,
2468  access::placeholder IsPlaceholder = access::placeholder::false_t,
2469  typename PropertyListT = property_list>
2470  void
2471  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2472  Dst,
2473  const T &Pattern) {
2474  throwIfActionIsCreated();
2475  // TODO add check: T must be an integral scalar value or a SYCL vector type
2476  static_assert(isValidTargetForExplicitOp(AccessTarget),
2477  "Invalid accessor target for the fill method.");
2478  if (!MIsHost && (((Dims == 1) && isConstOrGlobal(AccessTarget)) ||
2479  isImageOrImageArray(AccessTarget))) {
2480  setType(detail::CG::Fill);
2481 
2482  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2483  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2484 
2485  MDstPtr = static_cast<void *>(AccImpl.get());
2486  MRequirements.push_back(AccImpl.get());
2487  MAccStorage.push_back(std::move(AccImpl));
2488 
2489  MPattern.resize(sizeof(T));
2490  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
2491  *PatternPtr = Pattern;
2492  } else {
2493 
2494  // TODO: Temporary implementation for host. Should be handled by the
2495  // memory manager.
2496  range<Dims> Range = Dst.get_range();
2497  parallel_for<class __fill<T, Dims, AccessMode, AccessTarget,
2498  IsPlaceholder>>(Range, [=](id<Dims> Index) {
2499  Dst[Index] = Pattern;
2500  });
2501  }
2502  }
2503 
2510  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2511  throwIfActionIsCreated();
2512  static_assert(std::is_trivially_copyable<T>::value,
2513  "Pattern must be trivially copyable");
2514  parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2515  T *CastedPtr = static_cast<T *>(Ptr);
2516  CastedPtr[Index] = Pattern;
2517  });
2518  }
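  // Hedged usage sketch for both fill() overloads (illustrative only; 'Q' and
  // the names below are assumptions):
  //
  //   int *Ptr = sycl::malloc_device<int>(128, Q);
  //   Q.submit([&](sycl::handler &CGH) { CGH.fill(Ptr, 42, 128); });
  //
  //   sycl::buffer<int, 1> Buf{sycl::range<1>{128}};
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::discard_write>(CGH);
  //     CGH.fill(Acc, 42);
  //   });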
2519 
2523  void ext_oneapi_barrier() {
2524  throwIfActionIsCreated();
2525  setType(detail::CG::Barrier);
2526  }
2527 
2531  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2532  void barrier() { ext_oneapi_barrier(); }
2533 
2540  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2541 
2548  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2549  void barrier(const std::vector<event> &WaitList);
2550 
2560  void memcpy(void *Dest, const void *Src, size_t Count);
2561 
2571  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2572  this->memcpy(Dest, Src, Count * sizeof(T));
2573  }
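  // Hedged usage sketch (illustrative only; 'Q', 'Src', 'Dst' are assumed USM
  // allocations of at least N floats). Mind the argument order: copy() takes
  // (Src, Dst, Count) while memcpy() takes (Dest, Src, Count):
  //
  //   Q.submit([&](sycl::handler &CGH) { CGH.copy(Src, Dst, N); });
  //   // ...which is equivalent to:
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.memcpy(Dst, Src, N * sizeof(float));
  //   });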
2574 
2583  void memset(void *Dest, int Value, size_t Count);
2584 
2591  void prefetch(const void *Ptr, size_t Count);
2592 
2599  void mem_advise(const void *Ptr, size_t Length, int Advice);
2600 
2601 private:
2602  std::shared_ptr<detail::queue_impl> MQueue;
2607  std::vector<std::vector<char>> MArgsStorage;
2608  std::vector<detail::AccessorImplPtr> MAccStorage;
2609  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
2610  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
2611  mutable std::vector<std::shared_ptr<const void>> MSharedPtrStorage;
2613  std::vector<detail::ArgDesc> MArgs;
2617  std::vector<detail::ArgDesc> MAssociatedAccesors;
2619  std::vector<detail::Requirement *> MRequirements;
2621  detail::NDRDescT MNDRDesc;
2622  std::string MKernelName;
2624  std::shared_ptr<detail::kernel_impl> MKernel;
2628  detail::CG::CGTYPE MCGType = detail::CG::None;
2630  void *MSrcPtr = nullptr;
2632  void *MDstPtr = nullptr;
2634  size_t MLength = 0;
2636  std::vector<char> MPattern;
2638  std::unique_ptr<detail::HostKernelBase> MHostKernel;
2640  std::unique_ptr<detail::HostTask> MHostTask;
2641  detail::OSModuleHandle MOSModuleHandle = detail::OSUtil::ExeModuleHandle;
2642  // Storage for a lambda or function when using InteropTasks
2643  std::unique_ptr<detail::InteropTask> MInteropTask;
2645  std::vector<detail::EventImplPtr> MEvents;
2648  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
2649 
2650  bool MIsHost = false;
2651 
2652  detail::code_location MCodeLoc = {};
2653  bool MIsFinalized = false;
2654  event MLastEvent;
2655 
2656  // Make queue_impl class friend to be able to call finalize method.
2657  friend class detail::queue_impl;
2658  // Make accessor class friend to keep the list of associated accessors.
2659  template <typename DataT, int Dims, access::mode AccMode,
2660  access::target AccTarget, access::placeholder isPlaceholder,
2661  typename PropertyListT>
2662  friend class accessor;
2663  friend device detail::getDeviceFromHandler(handler &);
2664 
2665  template <typename DataT, int Dimensions, access::mode AccessMode,
2666  access::target AccessTarget, access::placeholder IsPlaceholder>
2667  friend class detail::image_accessor;
2668  // Make stream class friend to be able to keep the list of associated streams
2669  friend class stream;
2670  friend class detail::stream_impl;
2671  // Make reduction_impl friend to store buffers and arrays created for it
2672  // in handler from reduction_impl methods.
2673  template <typename T, class BinaryOperation, int Dims, bool IsUSM,
2674  access::placeholder IsPlaceholder>
2675  friend class ext::oneapi::detail::reduction_impl;
2676 
2677  // This method needs to call the method finalize().
2678  template <typename Reduction, typename... RestT>
2679  std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
2680  reduSaveFinalResultToUserMemHelper(
2681  std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2682  bool IsHost, Reduction &, RestT...);
2683 
2684  friend void detail::associateWithHandler(handler &,
2685  detail::AccessorBaseHost *,
2686  access::target);
2687 
2688  friend class ::MockHandler;
2689  friend class detail::queue_impl;
2690 
2691  bool DisableRangeRounding();
2692 
2693  bool RangeRoundingTrace();
2694 
2695  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
2696  size_t &MinRange);
2697 
2698  template <typename WrapperT, typename TransformedArgType, int Dims,
2699  typename KernelType,
2700  detail::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
2701  KernelType, TransformedArgType>::value> * = nullptr>
2702  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2703  range<Dims> NumWorkItems) {
2704  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
2705  KernelType>(NumWorkItems,
2706  KernelFunc);
2707  }
2708 
2709  template <typename WrapperT, typename TransformedArgType, int Dims,
2710  typename KernelType,
2711  detail::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
2712  KernelType, TransformedArgType>::value> * = nullptr>
2713  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2714  range<Dims> NumWorkItems) {
2715  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>(
2716  NumWorkItems, KernelFunc);
2717  }
2718 };
2719 } // namespace sycl
2720 } // __SYCL_INLINE_NAMESPACE(cl)