DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.hpp
//==-------- handler.hpp --- SYCL command group handler --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/access/access.hpp>
#include <sycl/accessor.hpp>
#include <sycl/context.hpp>
#include <sycl/detail/cg.hpp>
#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/cl.h>
#include <sycl/detail/export.hpp>
#include <sycl/detail/handler_proxy.hpp>
#include <sycl/detail/os_util.hpp>
#include <sycl/event.hpp>
#include <sycl/id.hpp>
#include <sycl/interop_handle.hpp>
#include <sycl/item.hpp>
#include <sycl/kernel.hpp>
#include <sycl/kernel_bundle.hpp>
#include <sycl/kernel_handler.hpp>
#include <sycl/nd_item.hpp>
#include <sycl/nd_range.hpp>
#include <sycl/property_list.hpp>
#include <sycl/sampler.hpp>
#include <sycl/stl.hpp>

#include <functional>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>

// SYCL_LANGUAGE_VERSION is a 4-digit year followed by a 2-digit revision.
#if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
#define __SYCL_NONCONST_FUNCTOR__
#endif

template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
          sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
class __fill;

template <typename T> class __usmfill;

template <typename T_Src, typename T_Dst, int Dims,
          sycl::access::mode AccessMode, sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
class __copyAcc2Ptr;

template <typename T_Src, typename T_Dst, int Dims,
          sycl::access::mode AccessMode, sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
class __copyPtr2Acc;

template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
          sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
          sycl::access::mode AccessMode_Dst,
          sycl::access::target AccessTarget_Dst,
          sycl::access::placeholder IsPlaceholder_Src,
          sycl::access::placeholder IsPlaceholder_Dst>
class __copyAcc2Acc;

// For unit testing purposes
class MockHandler;

namespace sycl {

// Forward declarations
class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;

namespace detail {

class handler_impl;
class kernel_impl;
class queue_impl;
class stream_impl;
template <typename DataT, int Dimensions, access::mode AccessMode,
          access::target AccessTarget, access::placeholder IsPlaceholder>
class image_accessor;

template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg) const);

// Non-const version of the above template to match functors whose 'operator()'
// is declared without the 'const' qualifier.
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg));

// template <typename RetType, typename Func>
// static void member_ptr_helper(RetType (Func::*)() const);

// template <typename RetType, typename Func>
// static void member_ptr_helper(RetType (Func::*)());

template <typename F, typename SuggestedArgType>
decltype(member_ptr_helper(&F::operator())) argument_helper(int);

template <typename F, typename SuggestedArgType>
SuggestedArgType argument_helper(...);

// Deduces the argument type of F::operator() if it can be taken; otherwise
// falls back to SuggestedArgType.
template <typename F, typename SuggestedArgType>
using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
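
// Illustrative sketch (not part of this header): what lambda_arg_type deduces
// for typical kernel functors. 'TakesItem' is a hypothetical example type.
//
//   struct TakesItem {
//     void operator()(item<1>) const {}
//   };
//   // lambda_arg_type<TakesItem, id<1>> is item<1>: operator() is a plain
//   // member function, so member_ptr_helper deduces its parameter type.
//   //
//   // For a functor whose operator() is templated (e.g. a generic lambda),
//   // taking &F::operator() fails in SFINAE and the variadic fallback
//   // argument_helper(...) yields the suggested type instead:
//   // lambda_arg_type<GenericFunctor, id<1>> is id<1>.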
111 
112 // Used when parallel_for range is rounded-up.
113 template <typename Name> class __pf_kernel_wrapper;
114 
115 template <typename Type> struct get_kernel_wrapper_name_t {
117 };
118 
119 __SYCL_EXPORT device getDeviceFromHandler(handler &);
120 
121 #if __SYCL_ID_QUERIES_FIT_IN_INT__
122 template <typename T> struct NotIntMsg;
123 
124 template <int Dims> struct NotIntMsg<range<Dims>> {
125  constexpr static const char *Msg =
126  "Provided range is out of integer limits. Pass "
127  "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
128 };
129 
130 template <int Dims> struct NotIntMsg<id<Dims>> {
131  constexpr static const char *Msg =
132  "Provided offset is out of integer limits. Pass "
133  "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
134 };
135 #endif
136 
137 #if __SYCL_ID_QUERIES_FIT_IN_INT__
138 template <typename T, typename ValT>
139 typename detail::enable_if_t<std::is_same<ValT, size_t>::value ||
140  std::is_same<ValT, unsigned long long>::value>
141 checkValueRangeImpl(ValT V) {
142  static constexpr size_t Limit =
143  static_cast<size_t>((std::numeric_limits<int>::max)());
144  if (V > Limit)
145  throw runtime_error(NotIntMsg<T>::Msg, PI_ERROR_INVALID_VALUE);
146 }
147 #endif
148 
149 template <int Dims, typename T>
150 typename detail::enable_if_t<std::is_same<T, range<Dims>>::value ||
151  std::is_same<T, id<Dims>>::value>
152 checkValueRange(const T &V) {
153 #if __SYCL_ID_QUERIES_FIT_IN_INT__
154  for (size_t Dim = 0; Dim < Dims; ++Dim)
155  checkValueRangeImpl<T>(V[Dim]);
156 
157  {
158  unsigned long long Product = 1;
159  for (size_t Dim = 0; Dim < Dims; ++Dim) {
160  Product *= V[Dim];
161  // check value now to prevent product overflow in the end
162  checkValueRangeImpl<T>(Product);
163  }
164  }
165 #else
166  (void)V;
167 #endif
168 }
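
// Illustrative arithmetic (assuming a 32-bit int): each dimension of a
// range<2>{65536, 65536} fits in INT_MAX individually, but the linear size
// 65536 * 65536 = 2^32 exceeds INT_MAX = 2^31 - 1, so the per-step product
// check above throws on the second loop iteration.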

template <int Dims>
void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(R);
  checkValueRange<Dims>(O);

  for (size_t Dim = 0; Dim < Dims; ++Dim) {
    unsigned long long Sum = R[Dim] + O[Dim];

    checkValueRangeImpl<range<Dims>>(Sum);
  }
#else
  (void)R;
  (void)O;
#endif
}

template <int Dims, typename T>
typename detail::enable_if_t<std::is_same<T, nd_range<Dims>>::value>
checkValueRange(const T &V) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(V.get_global_range());
  checkValueRange<Dims>(V.get_local_range());
  checkValueRange<Dims>(V.get_offset());

  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
#else
  (void)V;
#endif
}

template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernel {
public:
  RoundedRangeKernel(range<Dims> NumWorkItems, KernelType KernelFunc)
      : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}

  void operator()(TransformedArgType Arg) const {
    if (Arg[0] >= NumWorkItems[0])
      return;
    Arg.set_allowed_range(NumWorkItems);
    KernelFunc(Arg);
  }

private:
  range<Dims> NumWorkItems;
  KernelType KernelFunc;
};
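
// Illustrative sketch (not part of this header): if a user requests a 1-D
// range of 1500 work-items and the runtime rounds the launch up to 1504 (see
// parallel_for_lambda_impl below), the wrapper behaves like:
//
//   // Launched on 1504 items; NumWorkItems[0] == 1500.
//   //   items 0..1499    -> KernelFunc(Arg) runs as usual
//   //   items 1500..1503 -> early return, no user code executes
//
// set_allowed_range ensures range queries made through Arg report the
// original 1500, not the rounded-up 1504.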

template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernelWithKH {
public:
  RoundedRangeKernelWithKH(range<Dims> NumWorkItems, KernelType KernelFunc)
      : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}

  void operator()(TransformedArgType Arg, kernel_handler KH) const {
    if (Arg[0] >= NumWorkItems[0])
      return;
    Arg.set_allowed_range(NumWorkItems);
    KernelFunc(Arg, KH);
  }

private:
  range<Dims> NumWorkItems;
  KernelType KernelFunc;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename RedOutVar>
class reduction_impl_algo;

using sycl::detail::queue_impl;

// Kernels with a single reduction

template <typename KernelName, typename KernelType, int Dims, class Reduction>
bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                        const range<Dims> &Range, size_t MaxWGSize,
                        uint32_t NumConcurrentWorkGroups, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                        const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFunc(handler &CGH, KernelType KernelFunc,
                const nd_range<Dims> &Range, Reduction &Redu);

// Kernels with multiple reductions

// sycl::nd_range version
template <typename KernelName, typename KernelType, int Dims,
          typename... Reductions, size_t... Is>
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
                     const nd_range<Dims> &Range,
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...>);

template <typename KernelName, typename KernelType, class Reduction>
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
                     Reduction &Redu);

template <typename KernelName, typename KernelType, typename... Reductions,
          size_t... Is>
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...>);

template <typename KernelName, class Reduction>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);

template <typename KernelName, class Reduction>
std::enable_if_t<Reduction::is_usm>
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);

template <typename... Reduction, size_t... Is>
std::shared_ptr<event>
reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
                             bool IsHost, std::tuple<Reduction...> &ReduTuple,
                             std::index_sequence<Is...>);

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);

__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem);

__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
                                            size_t LocalMemBytesPerWorkItem);

template <typename... ReductionT, size_t... Is>
size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
                             std::index_sequence<Is...>);

template <typename TupleT, std::size_t... Is>
std::tuple<std::tuple_element_t<Is, TupleT>...>
tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);

template <typename FirstT, typename... RestT> struct AreAllButLastReductions;

template <class FunctorTy>
event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost,
                     FunctorTy Func);
} // namespace detail

/// Command group handler class.
class __SYCL_EXPORT handler {
private:
  /// Constructs a SYCL handler for the given queue.
  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);

  /// Constructs a SYCL handler for a fallback-capable submission: commands
  /// are submitted to the primary queue and may be re-run on the secondary
  /// queue on failure.
  handler(std::shared_ptr<detail::queue_impl> Queue,
          std::shared_ptr<detail::queue_impl> PrimaryQueue,
          std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);

  /// Stores a copy of Arg in MArgsStorage and returns a pointer to it.
  template <typename T, typename F = typename detail::remove_const_t<
                            typename detail::remove_reference_t<T>>>
  F *storePlainArg(T &&Arg) {
    MArgsStorage.emplace_back(sizeof(T));
    auto Storage = reinterpret_cast<F *>(MArgsStorage.back().data());
    *Storage = Arg;
    return Storage;
  }

  void setType(detail::CG::CGTYPE Type) { MCGType = Type; }

  detail::CG::CGTYPE getType() { return MCGType; }

  void throwIfActionIsCreated() {
    if (detail::CG::None != getType())
      throw sycl::runtime_error("Attempt to set multiple actions for the "
                                "command group. Command group must consist of "
                                "a single kernel or explicit memory operation.",
                                PI_ERROR_INVALID_OPERATION);
  }

  void
  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                               const detail::kernel_param_desc_t *KernelArgs,
                               bool IsESIMD);

  void extractArgsAndReqs();

  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                  const int Size, const size_t Index, size_t &IndexShift,
                  bool IsKernelCreatedFromSource, bool IsESIMD);

  std::string getKernelName();

  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
    // TODO It is unclear whether the names of a kernel and of a lambda/functor
    // must match when parallel_for is called with both a sycl::kernel and a
    // lambda/functor. For now, if the names are equal we extract the arguments
    // from the lambda/functor for the kernel; otherwise set_arg(s) must be
    // used to resolve the order and values of the kernel arguments.
    assert(MKernel && "MKernel is not initialized");
    const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
    const std::string KernelName = getKernelName();
    return LambdaName == KernelName;
  }

  /// Saves the location of user's code passed in \p CodeLoc for future usage.
  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }

  /// Constructs the command group from the accumulated handler state,
  /// enqueues it, and returns an event representing the command group.
  event finalize();

  /// Saves a stream associated with this handler.
  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
    MStreamStorage.push_back(Stream);
  }

  /// Creates an auxiliary handler on the same queue, runs \p Func on it and
  /// finalizes that auxiliary command group.
  template <class FunctorTy>
  event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
                       FunctorTy Func) {
    handler AuxHandler(Queue, MIsHost);
    AuxHandler.saveCodeLoc(MCodeLoc);
    Func(AuxHandler);
    return AuxHandler.finalize();
  }

  template <class FunctorTy>
  friend event detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
                                      bool IsHost, FunctorTy Func);

  /// Saves a reduction-related object so it lives as long as the command
  /// group does.
  void addReduction(const std::shared_ptr<const void> &ReduObj);

  ~handler() = default;

  // TODO: Private and unused. Remove when ABI break is allowed.
  bool is_host() { return MIsHost; }

#ifdef __SYCL_DEVICE_ONLY__
  // In device compilation accessor isn't inherited from AccessorBaseHost, so
  // can't detect by it. Since we don't expect it to be ever called in device
  // execution, just use blind void *.
  void associateWithHandler(void *AccBase, access::target AccTarget);
#else
  void associateWithHandler(detail::AccessorBaseHost *AccBase,
                            access::target AccTarget);
#endif

  // Recursively calls itself until the argument pack is fully processed.
  // This is the version for a regular (standard-layout) argument.
  template <typename T, typename... Ts>
  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
    set_arg(ArgIndex, std::move(Arg));
    setArgsHelper(++ArgIndex, std::move(Args)...);
  }

  void setArgsHelper(int) {}

  // setArgHelper for a local accessor argument.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::placeholder IsPlaceholder>
  void setArgHelper(int ArgIndex,
                    accessor<DataT, Dims, AccessMode, access::target::local,
                             IsPlaceholder> &&Arg) {
    detail::LocalAccessorBaseHost *LocalAccBase =
        (detail::LocalAccessorBaseHost *)&Arg;
    detail::LocalAccessorImplPtr LocalAccImpl =
        detail::getSyclObjImpl(*LocalAccBase);
    detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
    MLocalAccStorage.push_back(std::move(LocalAccImpl));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(access::target::local), ArgIndex);
  }

  // setArgHelper for a non-local accessor argument.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  typename detail::enable_if_t<AccessTarget != access::target::local, void>
  setArgHelper(
      int ArgIndex,
      accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
    detail::AccessorImplHost *Req = AccImpl.get();
    // Add accessor to the list of requirements.
    MRequirements.push_back(Req);
    // Store copy of the accessor.
    MAccStorage.push_back(std::move(AccImpl));
    // Add accessor to the list of arguments.
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(AccessTarget), ArgIndex);
  }

  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));

    if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
                         sizeof(T), ArgIndex);
    } else {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
                         StoredArg, sizeof(T), ArgIndex);
    }
  }

  void setArgHelper(int ArgIndex, sampler &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
                       sizeof(sampler), ArgIndex);
  }

  // TODO: Unused. Remove when ABI break is allowed.
  void verifyKernelInvoc(const kernel &Kernel) {
    std::ignore = Kernel;
    return;
  }

  /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
   * its argument. Since the ESIMD plugin directly invokes the kernel (doesn't
   * use piKernelSetArg), the kernel argument type must be known to the plugin.
   * However, passing the kernel argument type to the plugin requires changing
   * the ABI of the HostKernel class. To overcome this problem, the helpers
   * below wrap the "original" kernel with a functor that always takes an
   * nd_item as argument. A functor is used instead of a lambda because
   * extractArgsAndReqsFromLambda needs access to the "original" kernel and
   * keeps references to its internal data, i.e. the kernel passed as argument
   * cannot be local in scope. The functor itself is again encapsulated in a
   * std::function since the functor's type is unknown to the plugin.
   */

  // For 'id, item w/wo offset, nd_item' kernel arguments
  template <class KernelType, class NormalizedKernelType, int Dims>
  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
    NormalizedKernelType NormalizedKernel(KernelFunc);
    auto NormalizedKernelFunc =
        std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
    auto HostKernelPtr =
        new detail::HostKernel<decltype(NormalizedKernelFunc),
                               sycl::nd_item<Dims>, Dims>(NormalizedKernelFunc);
    MHostKernel.reset(HostKernelPtr);
    return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
                ->MKernelFunc;
  }

  // For 'sycl::id<Dims>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::nd_item<Dims>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::nd_item<Dims>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        detail::runKernelWithArg(MKernelFunc, Arg);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::item<Dims, without_offset>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::item<Dims, false>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
            Arg.get_global_range(), Arg.get_global_id());
        detail::runKernelWithArg(MKernelFunc, Item);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::item<Dims, with_offset>' kernel argument
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::item<Dims, true>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
            Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
        detail::runKernelWithArg(MKernelFunc, Item);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'void' kernel argument (single_task)
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if_t<std::is_same<ArgT, void>::value, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        (void)Arg;
        detail::runKernelWithoutArg(MKernelFunc);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::group<Dims>' kernel argument
  // The 'wrapper'-based approach using the 'NormalizedKernelType' struct is
  // not used for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is
  // not supported in ESIMD.
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if<std::is_same<ArgT, sycl::group<Dims>>::value,
                          KernelType *>::type
  ResetHostKernel(const KernelType &KernelFunc) {
    MHostKernel.reset(
        new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
    return (KernelType *)(MHostKernel->getPtr());
  }

  /// Verifies the kernel bundle to be used, if any is set. Throws if the used
  /// kernel bundle does not contain a suitable device image with the
  /// requested kernel.
  void verifyUsedKernelBundle(const std::string &KernelName);

  /// Stores the lambda in a template-free object and initializes the kernel
  /// name and arguments when they are available from the integration header.
  template <typename KernelName, typename KernelType, int Dims,
            typename LambdaArgType>
  void StoreLambda(KernelType KernelFunc) {
    constexpr bool IsCallableWithKernelHandler =
        detail::KernelLambdaHasKernelHandlerArgT<KernelType,
                                                 LambdaArgType>::value;

    if (IsCallableWithKernelHandler && MIsHost) {
      throw sycl::feature_not_supported(
          "kernel_handler is not yet supported by host device.",
          PI_ERROR_INVALID_OPERATION);
    }

    KernelType *KernelPtr =
        ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);

    using KI = sycl::detail::KernelInfo<KernelName>;
    constexpr bool KernelHasName =
        KI::getName() != nullptr && KI::getName()[0] != '\0';

    // Some host compilers may have different captures from Clang. Currently
    // there is no stable way of handling this when extracting the captures,
    // so a static assert is made to fail for incompatible kernel lambdas.
    static_assert(
        !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
        "Unexpected kernel lambda size. This can be caused by an "
        "external host compiler producing a lambda with an "
        "unexpected layout. This is a limitation of the compiler. "
        "In many cases the difference is related to capturing constexpr "
        "variables. In such cases removing the constexpr specifier aligns the "
        "captures between the host compiler and the device compiler."
        "\n"
        "In case of MSVC, passing "
        "-fsycl-host-compiler-options='/std:c++latest' "
        "might also help.");

    // An empty name indicates that the compilation happens without an
    // integration header, so don't perform things that require it.
    if (KernelHasName) {
      // TODO support ESIMD in no-integration-header case too.
      MArgs.clear();
      extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
                                   KI::getNumParams(), &KI::getParamDesc(0),
                                   KI::isESIMD());
      MKernelName = KI::getName();
      MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName());
    } else {
      // Without the integration header it is necessary to process the
      // accessors from the list (which are associated with this handler) as
      // arguments.
      MArgs = std::move(MAssociatedAccesors);
    }

    // If the kernel lambda is callable with a kernel_handler argument,
    // manifest the associated kernel handler.
    if (IsCallableWithKernelHandler) {
      getOrInsertHandlerKernelBundle(/*Insert=*/true);
    }
  }

  /// Checks whether the source rectangular shape fits into the destination
  /// one, i.e. whether the copy can be done region-by-region rather than
  /// element-by-element.
  template <int Dims_Src, int Dims_Dst>
  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
                                           const range<Dims_Dst> Dst) {
    if (Dims_Src > Dims_Dst)
      return false;
    for (size_t I = 0; I < Dims_Src; ++I)
      if (Src[I] > Dst[I])
        return false;
    return true;
  }
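
  // Illustrative check (not part of the API): a 2-D source of {2, 8} fits
  // into a destination of {4, 8} (every source extent <= the corresponding
  // destination extent), so a rectangular region copy is available; a source
  // of {2, 16} into {4, 8} is not, and such a copy falls back to the
  // element-by-element kernel in copyAccToAccHelper below.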

  /// Handles some special cases of the copy operation between accessors.
  /// Returns true if the copy is handled here.
  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  detail::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost &&
        IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
      return false;

    range<1> LinearizedRange(Src.size());
    parallel_for<
        class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
                            ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
        LinearizedRange, [=](id<1> Id) {
          size_t Index = Id[0];
          id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
          id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
          Dst[DstId] = Src[SrcId];
        });
    return true;
  }

  /// Handles the copy operation between accessors when at least one of them
  /// is 0-dimensional. Returns true if the copy is handled here.
  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  detail::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost)
      return false;

    single_task<
        class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
                            ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
        [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
    return true;
  }

#ifndef __SYCL_DEVICE_ONLY__
  /// Copies the content of the accessor to the memory pointed to by Dst on
  /// the host device.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<(Dim > 0)>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    range<Dim> Range = Src.get_range();
    parallel_for<
        class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          using TSrcNonConst = typename detail::remove_const_t<TSrc>;
          (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
        });
  }

  /// Copies the content of a 0-dimensional accessor to the memory pointed to
  /// by Dst on the host device.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<Dim == 0>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        [=]() {
          using TSrcNonConst = typename detail::remove_const_t<TSrc>;
          *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
        });
  }

  /// Copies the memory pointed to by Src into the memory accessed by Dst on
  /// the host device.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<(Dim > 0)>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    range<Dim> Range = Dst.get_range();
    parallel_for<
        class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
        });
  }

  /// Copies the memory pointed to by Src into the memory accessed by a
  /// 0-dimensional accessor Dst on the host device.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<Dim == 0>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        [=]() {
          *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
        });
  }
#endif // __SYCL_DEVICE_ONLY__

  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
    return AccessTarget == access::target::device ||
           AccessTarget == access::target::constant_buffer;
  }

  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
    return AccessTarget == access::target::image ||
           AccessTarget == access::target::image_array;
  }

  constexpr static bool
  isValidTargetForExplicitOp(access::target AccessTarget) {
    return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
  }

  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::read ||
           AccessMode == access::mode::read_write;
  }

  constexpr static bool
  isValidModeForDestinationAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::write ||
           AccessMode == access::mode::read_write ||
           AccessMode == access::mode::discard_write ||
           AccessMode == access::mode::discard_read_write;
  }

  // Picks the type used to transport item information to the user's kernel:
  // prefer nd_item, then item, and otherwise keep the user-provided type.
  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
    using type = typename std::conditional<
        std::is_convertible<nd_item<Dims>, LambdaArgType>::value, nd_item<Dims>,
        typename std::conditional<
            std::is_convertible<item<Dims>, LambdaArgType>::value, item<Dims>,
            LambdaArgType>::type>::type;
  };
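
  // Illustrative resolution (not part of the API): for a kernel argument type
  // convertible from nd_item<1>, TransformUserItemType<1, T>::type is
  // nd_item<1>; for one convertible only from item<1> it is item<1>; an
  // unrelated user type is passed through unchanged.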

  /// Defines and invokes a SYCL kernel function for the specified range.
  ///
  /// The SYCL kernel function is defined as a lambda function or a named
  /// function object type and is given an id or item for indexing in the
  /// index space defined by \p NumWorkItems.
  template <typename KernelName, typename KernelType, int Dims>
  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
                                KernelType KernelFunc) {
    throwIfActionIsCreated();
    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;

    // If a 1D kernel argument is an integral type, convert it to sycl::item<1>.
    // If the user type is convertible from sycl::item/sycl::nd_item, use
    // sycl::item/sycl::nd_item to transport the item information.
    using TransformedArgType = typename std::conditional<
        std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
        typename TransformUserItemType<Dims, LambdaArgType>::type>::type;

    using NameT =
        typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;

    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());

    // Range rounding can be disabled by the user.
    // Range rounding is not done on the host device.
    // Range rounding is supported only for newer SYCL standards.
#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) &&                  \
    !defined(DPCPP_HOST_DEVICE_OPENMP) &&                                      \
    !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
    // Range should be a multiple of this for reasonable performance.
    size_t MinFactorX = 16;
    // Range should be a multiple of this for improved performance.
    size_t GoodFactorX = 32;
    // Range should be at least this to make rounding worthwhile.
    size_t MinRangeX = 1024;

    // Check if rounding parameters have been set through the environment:
    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
    this->GetRangeRoundingSettings(MinFactorX, GoodFactorX, MinRangeX);

    // Disable the rounding-up optimizations under these conditions:
    // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
    // 2. The kernel is provided via an interoperability method.
    // 3. The range is already a multiple of the rounding factor.
    //
    // Cases 2 and 3 could be supported with extra effort.
    // As an optimization for the common case it is an
    // implementation choice to not support those scenarios.
    // Note that "this_item" is a free function, i.e. not tied to any
    // specific id or item. When concurrent parallel_fors are executing
    // on a device it is difficult to tell which parallel_for the call is
    // being made from. One could replicate portions of the
    // call-graph to make this_item calls kernel-specific but this is
    // not considered worthwhile.
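
    // Worked example (illustrative): with the defaults above, a 1-D range of
    // 1500 work-items qualifies (1500 >= MinRangeX and 1500 % 16 != 0) and is
    // rounded up to NewValX = ((1500 + 32 - 1) / 32) * 32 = 1504; a range of
    // 1024 is already a multiple of 16 and is left untouched, as is any range
    // below 1024.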

    // Get the kernel name to check condition 2.
    std::string KName = typeid(NameT *).name();
    using KI = detail::KernelInfo<NameT>;
    bool DisableRounding =
        this->DisableRangeRounding() ||
        (KI::getName() == nullptr || KI::getName()[0] == '\0');

    // Perform range rounding if rounding-up is enabled,
    // there are sufficient work-items to need rounding,
    // and the user-specified range is not a multiple of a "good" value.
    if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
        (NumWorkItems[0] % MinFactorX != 0)) {
      // It is sufficient to round up just the first dimension.
      // Multiplying the rounded-up value of the first dimension
      // by the values of the remaining dimensions (if any)
      // will yield a rounded-up value for the total range.
      size_t NewValX =
          ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
      if (this->RangeRoundingTrace())
        std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
                  << " to " << NewValX << std::endl;

      using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
      auto Wrapper =
          getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
              KernelFunc, NumWorkItems);

      using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
                                       decltype(Wrapper), NameWT>;

      range<Dims> AdjustedRange = NumWorkItems;
      AdjustedRange.set_range_dim0(NewValX);
      kernel_parallel_for_wrapper<KName, TransformedArgType>(Wrapper);
#ifndef __SYCL_DEVICE_ONLY__
      detail::checkValueRange<Dims>(AdjustedRange);
      MNDRDesc.set(std::move(AdjustedRange));
      StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
          std::move(Wrapper));
      setType(detail::CG::Kernel);
#endif
    } else
#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
       // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
       // SYCL_LANGUAGE_VERSION >= 202001
    {
      (void)NumWorkItems;
      kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
      detail::checkValueRange<Dims>(NumWorkItems);
      MNDRDesc.set(std::move(NumWorkItems));
      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
          std::move(KernelFunc));
      setType(detail::CG::Kernel);
#endif
    }
  }

  /// Defines and invokes a SYCL kernel for the specified range.
  ///
  /// The kernel is provided as a pre-built kernel object rather than as a
  /// lambda; its arguments are extracted from the kernel object itself.
  template <int Dims>
  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
    throwIfActionIsCreated();
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    detail::checkValueRange<Dims>(NumWorkItems);
    MNDRDesc.set(std::move(NumWorkItems));
    setType(detail::CG::Kernel);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }

#ifdef SYCL_LANGUAGE_VERSION
#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
#else
#define __SYCL_KERNEL_ATTR__
#endif
  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine kernel invocation kind.
  template <typename KernelName, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task(KernelType KernelFunc) {
#else
  kernel_single_task(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc();
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine kernel invocation kind.
  template <typename KernelName, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_single_task(const KernelType &KernelFunc, kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for" - is used by the
  // Front End to determine kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for(KernelType KernelFunc) {
#else
  kernel_parallel_for(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for" - is used by the
  // Front End to determine kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_parallel_for(const KernelType &KernelFunc, kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_parallel_for_work_group(const KernelType &KernelFunc,
                                 kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // Wrappers for the kernel_*** functions above, with and without support for
  // an additional kernel_handler argument.

  // NOTE: to support a kernel_handler argument in kernel lambdas, only the
  // kernel_***_wrapper functions must be called in this code.

  // Wrappers for kernel_single_task(...)

  template <typename KernelName, typename KernelType>
  std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task_wrapper(KernelType KernelFunc) {
#else
  kernel_single_task_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_handler KH;
    kernel_single_task<KernelName>(KernelFunc, KH);
  }

  template <typename KernelName, typename KernelType>
  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task_wrapper(KernelType KernelFunc) {
#else
  kernel_single_task_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_single_task<KernelName>(KernelFunc);
  }

  // Wrappers for kernel_parallel_for(...)

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_handler KH;
    kernel_parallel_for<KernelName, ElementType>(KernelFunc, KH);
  }

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      !detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_parallel_for<KernelName, ElementType>(KernelFunc);
  }

  // Wrappers for kernel_parallel_for_work_group(...)

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_handler KH;
    kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc, KH);
  }

  template <typename KernelName, typename ElementType, typename KernelType>
  std::enable_if_t<
      !detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc);
  }

  void setStateExplicitKernelBundle();
  void setStateSpecConstSet();
  bool isStateExplicitKernelBundle() const;

  std::shared_ptr<detail::kernel_bundle_impl>
  getOrInsertHandlerKernelBundle(bool Insert) const;

  void setHandlerKernelBundle(kernel Kernel);

  void setHandlerKernelBundle(
      const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);

  template <typename FuncT>
  detail::enable_if_t<
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void()>::value ||
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void(interop_handle)>::value>
  host_task_impl(FuncT &&Func) {
    throwIfActionIsCreated();

    MNDRDesc.set(range<1>(1));
    MArgs = std::move(MAssociatedAccesors);

    MHostTask.reset(new detail::HostTask(std::move(Func)));

    setType(detail::CG::CodeplayHostTask);
  }

public:
  handler(const handler &) = delete;
  handler(handler &&) = delete;
  handler &operator=(const handler &) = delete;
  handler &operator=(handler &&) = delete;

#if __cplusplus >= 201703L
  template <auto &SpecName>
  void set_specialization_constant(
      typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {

    setStateSpecConstSet();

    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/true);

    detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
        KernelBundleImplPtr)
        .set_specialization_constant<SpecName>(Value);
  }

  template <auto &SpecName>
  typename std::remove_reference_t<decltype(SpecName)>::value_type
  get_specialization_constant() const {

    if (isStateExplicitKernelBundle())
      throw sycl::exception(make_error_code(errc::invalid),
                            "Specialization constants cannot be read after "
                            "explicitly setting the used kernel bundle");

    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/true);

    return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
               KernelBundleImplPtr)
        .get_specialization_constant<SpecName>();
  }

#endif
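
  // Usage sketch (illustrative only; 'MySpec' and 'Q' are hypothetical names,
  // and the specialization_id must live at namespace scope):
  //
  //   constexpr sycl::specialization_id<int> MySpec{42};
  //   ...
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_specialization_constant<MySpec>(7);
  //     CGH.single_task([=](sycl::kernel_handler KH) {
  //       int V = KH.get_specialization_constant<MySpec>(); // 7 at run time
  //     });
  //   });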

  void
  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);

  /// Requires access to the memory object associated with the placeholder
  /// accessor. The command group has a requirement to gain access to the
  /// given memory object before executing.
  template <typename DataT, int Dims, access::mode AccMode,
            access::target AccTarget>
  void
  require(accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t>
              Acc) {
    associateWithHandler(&Acc, AccTarget);
  }
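
  // Usage sketch (illustrative only): a placeholder accessor is bound to a
  // buffer up front and registered with the handler via require():
  //
  //   sycl::buffer<int, 1> Buf{sycl::range<1>{16}};
  //   sycl::accessor<int, 1, sycl::access::mode::read_write,
  //                  sycl::access::target::device,
  //                  sycl::access::placeholder::true_t>
  //       PAcc{Buf};
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.require(PAcc); // creates the requirement on Buf
  //     CGH.single_task([=]() { PAcc[0] += 1; });
  //   });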

  /// Registers an event dependency of this command group.
  void depends_on(event Event);

  /// Registers event dependencies of this command group.
  void depends_on(const std::vector<event> &Events);

  template <typename T>
  using remove_cv_ref_t =
      detail::remove_cv_t<detail::remove_reference_t<T>>;

  template <typename U, typename T>
  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;

  template <typename T> struct ShouldEnableSetArg {
    static constexpr bool value =
        std::is_trivially_copyable<detail::remove_reference_t<T>>::value
#if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
            && std::is_standard_layout<detail::remove_reference_t<T>>::value
#endif
        || is_same_type<sampler, T>::value // Sampler
        || (!is_same_type<cl_mem, T>::value &&
            std::is_pointer<remove_cv_ref_t<T>>::value) // USM
        || is_same_type<cl_mem, T>::value; // Interop
  };

  /// Sets an argument for OpenCL interoperability kernels.
  ///
  /// Registers Arg passed as argument # ArgIndex.
  template <typename T>
  typename std::enable_if<ShouldEnableSetArg<T>::value, void>::type
  set_arg(int ArgIndex, T &&Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  void
  set_arg(int ArgIndex,
          accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  template <typename DataT, int Dims>
  void set_arg(int ArgIndex, local_accessor<DataT, Dims> Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  /// Sets arguments for OpenCL interoperability kernels.
  ///
  /// Registers the pack of arguments (Args) with indexes starting from 0.
  template <typename... Ts> void set_args(Ts &&...Args) {
    setArgsHelper(0, std::move(Args)...);
  }
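
  // Usage sketch (illustrative only): with a kernel obtained through backend
  // interop, arguments are set positionally before parallel_for. Here
  // 'InteropKernel' stands for a sycl::kernel created elsewhere; it is
  // assumed, not defined:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
  //     CGH.set_args(Acc, 42); // argument 0: accessor, argument 1: int
  //     CGH.parallel_for(sycl::range<1>{16}, InteropKernel);
  //   });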
1423 
1431  template <typename KernelName = detail::auto_name, typename KernelType>
1432 #ifdef __SYCL_NONCONST_FUNCTOR__
1433  void single_task(KernelType KernelFunc) {
1434 #else
1435  void single_task(const KernelType &KernelFunc) {
1436 #endif
1437  throwIfActionIsCreated();
1438  using NameT =
1440  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1441  kernel_single_task_wrapper<NameT>(KernelFunc);
1442 #ifndef __SYCL_DEVICE_ONLY__
1443  // No need to check if range is out of INT_MAX limits as it's compile-time
1444  // known constant.
1445  MNDRDesc.set(range<1>{1});
1446 
1447  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1448  setType(detail::CG::Kernel);
1449 #endif
1450  }
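
  // Usage sketch (illustrative only): enqueue a kernel that runs exactly once.
  //
  //   sycl::queue Q;
  //   int *P = sycl::malloc_shared<int>(1, Q);
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.single_task([=]() { *P = 42; });
  //   }).wait();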
1451 
1452  template <typename KernelName = detail::auto_name, typename KernelType>
1453 #ifdef __SYCL_NONCONST_FUNCTOR__
1454  void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) {
1455 #else
1456  void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) {
1457 #endif
1458  parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
1459  }
1460 
1461  template <typename KernelName = detail::auto_name, typename KernelType>
1462 #ifdef __SYCL_NONCONST_FUNCTOR__
1463  void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) {
1464 #else
1465  void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) {
1466 #endif
1467  parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
1468  }
1469 
1470  template <typename KernelName = detail::auto_name, typename KernelType>
1471 #ifdef __SYCL_NONCONST_FUNCTOR__
1472  void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) {
1473 #else
1474  void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) {
1475 #endif
1476  parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
1477  }
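
  // Usage sketch (illustrative only): a 2-D parallel_for over a buffer.
  //
  //   sycl::buffer<float, 2> Img{sycl::range<2>{4, 8}};
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Img.get_access<sycl::access::mode::write>(CGH);
  //     CGH.parallel_for(sycl::range<2>{4, 8},
  //                      [=](sycl::item<2> It) { Acc[It] = 0.f; });
  //   });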

  /// Defines and invokes a SYCL kernel on the host device.
  ///
  /// \param Func is a SYCL kernel function defined by a lambda or a named
  /// function object type.
  template <typename FuncT>
  __SYCL2020_DEPRECATED(
      "run_on_host_intel() is deprecated, use host_task() instead")
  void run_on_host_intel(FuncT Func) {
    throwIfActionIsCreated();
    // No need to check if the range is out of INT_MAX limits as it's a
    // compile-time known constant.
    MNDRDesc.set(range<1>{1});

    MArgs = std::move(MAssociatedAccesors);
    MHostKernel.reset(new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
    setType(detail::CG::RunOnHostIntel);
  }

  /// Enqueues a command to the SYCL runtime to invoke \p Func once; \p Func
  /// may optionally take an interop_handle argument.
  template <typename FuncT>
  detail::enable_if_t<
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void()>::value ||
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void(interop_handle)>::value>
  host_task(FuncT &&Func) {
    host_task_impl(Func);
  }
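
  // Usage sketch (illustrative only): a host_task runs on the host once its
  // dependencies are satisfied; the void(interop_handle) form additionally
  // receives a handle for backend interop.
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.host_task([=]() { std::cout << "runs on the host\n"; });
  //   });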

// Replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
// or const KernelType &KernelFunc, depending on __SYCL_NONCONST_FUNCTOR__.
#ifdef __SYCL_NONCONST_FUNCTOR__
#define _KERNELFUNCPARAM(a) KernelType a
#else
#define _KERNELFUNCPARAM(a) const KernelType &a
#endif

  /// Defines and invokes a SYCL kernel function for the specified range and
  /// offset.
  ///
  /// The SYCL kernel function is defined as a lambda function or a named
  /// function object type and is given an item for indexing in the index
  /// space defined by \p NumWorkItems and \p WorkItemOffset.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                    _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
    (void)NumWorkItems;
    (void)WorkItemOffset;
    kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
    MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif
  }

  /// Defines and invokes a SYCL kernel function for the specified nd_range.
  ///
  /// The SYCL kernel function is defined as a lambda function or a named
  /// function object type and is given an nd_item for indexing in the index
  /// space defined by \p ExecutionRange.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  void parallel_for(nd_range<Dims> ExecutionRange,
                    _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
    // If the user type is convertible from sycl::item/sycl::nd_item, use
    // sycl::item/sycl::nd_item to transport the item information.
    using TransformedArgType =
        typename TransformUserItemType<Dims, LambdaArgType>::type;
    (void)ExecutionRange;
    kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(ExecutionRange);
    MNDRDesc.set(std::move(ExecutionRange));
    StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
        std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif
  }

// "if constexpr" simplifies the implementation and increases readability in
// comparison with a SFINAE-based approach.
#if __cplusplus >= 201703L
  /// Defines and invokes a SYCL kernel function for the specified range with
  /// a single reduction; \p Redu describes the reduction variable and the
  /// reduction operation.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  void parallel_for(range<Dims> Range, Reduction Redu,
                    _KERNELFUNCPARAM(KernelFunc)) {
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;

    // Before running the kernels, check that the device has enough local
    // memory to hold the local arrays required for the tree-reduction
    // algorithm.
    constexpr bool IsTreeReduction =
        !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
    size_t OneElemSize =
        IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
    uint32_t NumConcurrentWorkGroups =
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
        __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
#else
        detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
#endif
    // TODO: currently the preferred work group size is determined for the
    // given queue/device, while it is safer to use queries to the kernel
    // pre-compiled for the device.
    size_t PrefWGSize = detail::reduGetPreferredWGSize(MQueue, OneElemSize);
    if (detail::reduCGFuncForRange<KernelName>(*this, KernelFunc, Range,
                                               PrefWGSize,
                                               NumConcurrentWorkGroups, Redu)) {
      this->finalize();
      MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
      });
    }
  }
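
  // Usage sketch (illustrative only): summing N values into a USM scalar via
  // the reduction overload above.
  //
  //   int *Sum = sycl::malloc_shared<int>(1, Q);
  //   *Sum = 0;
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Redu = sycl::reduction(Sum, sycl::plus<int>());
  //     CGH.parallel_for(sycl::range<1>{1024}, Redu,
  //                      [=](sycl::item<1> It, auto &SumArg) {
  //                        SumArg += static_cast<int>(It.get_id(0));
  //                      });
  //   }).wait();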

  /// Defines and invokes a SYCL kernel function for the specified nd_range
  /// with a single reduction.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  void parallel_for(nd_range<Dims> Range, Reduction Redu,
                    _KERNELFUNCPARAM(KernelFunc)) {
    if constexpr (!Reduction::has_fast_atomics &&
                  !Reduction::has_float64_atomics) {
      // The most basic implementation.
      parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
      return;
    } else { // Can't "early" return for "if constexpr".
      std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
      if constexpr (Reduction::has_float64_atomics) {
        device D = detail::getDeviceFromHandler(*this);

        if (D.has(aspect::atomic64)) {

          detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
                                                 Redu);
        } else {
          // Resort to the basic implementation as well.
          parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
          return;
        }
      } else {
        // Use fast sycl::atomic operations to update the reduction variable
        // at the end of each work-group's work.
        detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
      }
      // If the reduction variable must be initialized with the identity value
      // before the kernel run, then an additional working accessor is
      // created, initialized with the identity value and used in the kernel.
      // That working accessor is then copied to the user's accessor or USM
      // pointer after the kernel run.
      // For USM pointers without the initialize_to_identity property the same
      // scheme with a working accessor is used, as re-using the user's USM
      // pointer in the kernel would require creating another variant of the
      // user's kernel, which does not seem efficient.
      if (Reduction::is_usm || Redu.initializeToIdentity()) {
        this->finalize();
        MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
          detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
        });
      }
    }
  }

  template <typename KernelName, typename KernelType, int Dims,
            typename Reduction>
  void parallel_for_impl(nd_range<Dims> Range, Reduction Redu,
                         KernelType KernelFunc) {
    // This parallel_for() is lowered to the following sequence:
    // 1) Call a kernel that a) calls the user's lambda function and
    //    b) performs one iteration of reduction, storing the partial
    //    reductions/sums to either a newly created global buffer or to the
    //    user's reduction accessor. So, if the original 'Range' has a total
    //    of N1 elements and the work-group size is W, then after the first
    //    iteration there will be N2 partial sums, where N2 = N1 / W.
    //    If (N2 == 1) then the partial sum is written to the user's accessor.
    //    Otherwise, a new global buffer is created and the partial sums are
    //    written to it.
    // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
    //    necessary to reduce all partial sums into one final sum.
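
    // Worked example (illustrative): for a Range with a global size of
    // N1 = 2^20 items and a work-group size of W = 256, step 1 produces
    // N2 = 2^20 / 256 = 4096 partial sums; the aux kernel then reduces
    // 4096 -> 16 -> 1 in two further iterations of the while-loop below.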
1697 
1698  // Before running the kernels, check that device has enough local memory
1699  // to hold local arrays that may be required for the reduction algorithm.
1700  // TODO: If the work-group-size is limited by the local memory, then
1701  // a special version of the main kernel may be created. The one that would
1702  // not use local accessors, which means it would not do the reduction in
1703  // the main kernel, but simply generate Range.get_global_range.size() number
1704  // of partial sums, leaving the reduction work to the additional/aux
1705  // kernels.
1706  constexpr bool HFR = Reduction::has_fast_reduce;
1707  size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1708  // TODO: currently the maximal work group size is determined for the given
1709  // queue/device, while it may be safer to use queries to the kernel compiled
1710  // for the device.
1711  size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, OneElemSize);
1712  if (Range.get_local_range().size() > MaxWGSize)
1713  throw sycl::runtime_error("The implementation handling parallel_for with"
1714  " reduction requires work group size not bigger"
1715  " than " +
1716  std::to_string(MaxWGSize),
1717  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1718 
1719  // 1. Call the kernel that includes user's lambda function.
1720  detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1721  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1722  this->finalize();
1723 
1724  // 2. Run the additional kernel as many times as needed to reduce
1725  // all partial sums into one scalar.
1726 
1727  // TODO: Create a special slow/sequential version of the kernel that would
1728  // handle the reduction instead of reporting an assert below.
1729  if (MaxWGSize <= 1)
1730  throw sycl::runtime_error("The implementation handling parallel_for with "
1731  "reduction requires the maximal work group "
1732  "size to be greater than 1 to converge. "
1733  "The maximal work group size depends on the "
1734  "device and the size of the objects passed to "
1735  "the reduction.",
1736  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1737  size_t NWorkItems = Range.get_group_range().size();
1738  while (NWorkItems > 1) {
1739  MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1740  NWorkItems = detail::reduAuxCGFunc<KernelName, KernelType>(
1741  AuxHandler, NWorkItems, MaxWGSize, Redu);
1742  });
1743  } // end while (NWorkItems > 1)
1744 
1745  if (Reduction::is_usm) {
1746  MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1747  detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1748  });
1749  }
1750  }
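 
  // Example (illustrative sketch, not part of this header): the kind of
  // single-reduction launch that the parallel_for_impl() above lowers. The
  // queue `Q` and the USM pointer `Sum` (e.g. from sycl::malloc_shared) are
  // assumed names for this example:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class SumKernel>(
  //         sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}},
  //         sycl::reduction(Sum, sycl::plus<int>()),
  //         [=](sycl::nd_item<1> It, auto &Red) { Red += 1; });
  //   });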
1751 
1752  // This version of parallel_for may handle one or more reductions packed
1753  // into the \p Rest argument pack. The last element of \p Rest is the
1754  // kernel function; everything else is reduction(s).
1755  // TODO: this variant is currently enabled for 2+ reductions only, as the
1756  // versions handling 1 reduction variable are more efficient right now.
1757  //
1758  // This is basically a tree reduction where we reuse the user's reduction
1759  // variable instead of creating temporary storage for the last iteration
1760  // (#WG == 1).
1761  template <typename KernelName = detail::auto_name, int Dims,
1762  typename... RestT>
1763  std::enable_if_t<(sizeof...(RestT) >= 3 &&
1764  detail::AreAllButLastReductions<RestT...>::value)>
1765  parallel_for(nd_range<Dims> Range, RestT... Rest) {
1766  std::tuple<RestT...> ArgsTuple(Rest...);
1767  constexpr size_t NumArgs = sizeof...(RestT);
1768  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
1769  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
1770  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
1771 
1772  size_t LocalMemPerWorkItem =
1773  detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
1774  // TODO: currently the maximal work-group size is determined for the given
1775  // queue/device, while it is safer to query the kernel compiled for
1776  // the device.
1777  size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
1778  if (Range.get_local_range().size() > MaxWGSize)
1779  throw sycl::runtime_error("The implementation handling parallel_for with"
1780  " reduction requires a work-group size not"
1781  " bigger than " +
1782  std::to_string(MaxWGSize),
1783  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1784 
1785  detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range, ReduTuple,
1786  ReduIndices);
1787  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1788  this->finalize();
1789 
1790  size_t NWorkItems = Range.get_group_range().size();
1791  while (NWorkItems > 1) {
1792  MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1793  NWorkItems = detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
1794  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1795  });
1796  } // end while (NWorkItems > 1)
1797  }
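 
  // Example (illustrative sketch): a launch with two reductions, which is
  // handled by the variant above. The queue `Q` and the USM result pointers
  // `SumPtr` and `MaxPtr` are assumed names:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class TwoReductions>(
  //         sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}},
  //         sycl::reduction(SumPtr, sycl::plus<int>()),
  //         sycl::reduction(MaxPtr, sycl::maximum<int>()),
  //         [=](sycl::nd_item<1> It, auto &Sum, auto &Max) {
  //           int V = static_cast<int>(It.get_global_linear_id());
  //           Sum += V;
  //           Max.combine(V);
  //         });
  //   });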
1798 #endif // __cplusplus >= 201703L
1799 
1810  template <typename KernelName = detail::auto_name, typename KernelType,
1811  int Dims>
1812  void parallel_for_work_group(range<Dims> NumWorkGroups,
1813  _KERNELFUNCPARAM(KernelFunc)) {
1814  throwIfActionIsCreated();
1815  using NameT =
1816  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1817  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1818  using LambdaArgType =
1819  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1820  (void)NumWorkGroups;
1821  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1822 #ifndef __SYCL_DEVICE_ONLY__
1823  detail::checkValueRange<Dims>(NumWorkGroups);
1824  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1825  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1826  setType(detail::CG::Kernel);
1827 #endif // __SYCL_DEVICE_ONLY__
1828  }
1829 
1842  template <typename KernelName = detail::auto_name, typename KernelType,
1843  int Dims>
1844  void parallel_for_work_group(range<Dims> NumWorkGroups,
1845  range<Dims> WorkGroupSize,
1846  _KERNELFUNCPARAM(KernelFunc)) {
1847  throwIfActionIsCreated();
1848  using NameT =
1849  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1850  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1851  using LambdaArgType =
1852  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1853  (void)NumWorkGroups;
1854  (void)WorkGroupSize;
1855  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1856 #ifndef __SYCL_DEVICE_ONLY__
1857  nd_range<Dims> ExecRange =
1858  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1859  detail::checkValueRange<Dims>(ExecRange);
1860  MNDRDesc.set(std::move(ExecRange));
1861  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1862  setType(detail::CG::Kernel);
1863 #endif // __SYCL_DEVICE_ONLY__
1864  }
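 
  // Example (illustrative sketch): a hierarchical invocation through the
  // second overload above, assuming a handler `CGH` obtained inside a queue
  // submission and the kernel name `HierKernel`:
  //
  //   CGH.parallel_for_work_group<class HierKernel>(
  //       sycl::range<1>{8},  // number of work-groups
  //       sycl::range<1>{16}, // work-group size
  //       [=](sycl::group<1> G) {
  //         G.parallel_for_work_item([&](sycl::h_item<1> It) {
  //           // per-work-item code here
  //         });
  //       });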
1865 
1872  void single_task(kernel Kernel) {
1873  throwIfActionIsCreated();
1874  // Ignore any set kernel bundles and use the one associated with the kernel
1875  setHandlerKernelBundle(Kernel);
1876  // No need to check if the range is out of INT_MAX limits as it's a
1877  // compile-time constant.
1878  MNDRDesc.set(range<1>{1});
1879  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1880  setType(detail::CG::Kernel);
1881  extractArgsAndReqs();
1882  MKernelName = getKernelName();
1883  }
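 
  // Example (illustrative sketch): launching a pre-compiled kernel object,
  // e.g. one obtained from a kernel bundle. The queue `Q` and the kernel
  // name `MyKernel` are assumed names for this example:
  //
  //   auto Bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
  //       Q.get_context());
  //   sycl::kernel K = Bundle.get_kernel(sycl::get_kernel_id<MyKernel>());
  //   Q.submit([&](sycl::handler &CGH) { CGH.single_task(K); });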
1884 
1885  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
1886  parallel_for_impl(NumWorkItems, Kernel);
1887  }
1888 
1889  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
1890  parallel_for_impl(NumWorkItems, Kernel);
1891  }
1892 
1893  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
1894  parallel_for_impl(NumWorkItems, Kernel);
1895  }
1896 
1905  template <int Dims>
1906  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
1907  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
1908  kernel Kernel) {
1909  throwIfActionIsCreated();
1910  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1911  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1912  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1913  setType(detail::CG::Kernel);
1914  extractArgsAndReqs();
1915  MKernelName = getKernelName();
1916  }
1917 
1926  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
1927  throwIfActionIsCreated();
1928  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1929  detail::checkValueRange<Dims>(NDRange);
1930  MNDRDesc.set(std::move(NDRange));
1931  setType(detail::CG::Kernel);
1932  extractArgsAndReqs();
1933  MKernelName = getKernelName();
1934  }
1935 
1942  template <typename KernelName = detail::auto_name, typename KernelType>
1943  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
1944  throwIfActionIsCreated();
1945  // Ignore any set kernel bundles and use the one associated with the kernel
1946  setHandlerKernelBundle(Kernel);
1947  using NameT =
1948  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1949  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1950  (void)Kernel;
1951  kernel_single_task<NameT>(KernelFunc);
1952 #ifndef __SYCL_DEVICE_ONLY__
1953  // No need to check if the range is out of INT_MAX limits as it's a
1954  // compile-time constant.
1955  MNDRDesc.set(range<1>{1});
1956  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1957  setType(detail::CG::Kernel);
1958  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1959  extractArgsAndReqs();
1960  MKernelName = getKernelName();
1961  } else
1962  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
1963 #else
1964  detail::CheckDeviceCopyable<KernelType>();
1965 #endif
1966  }
1967 
1971  template <typename FuncT>
1972  __SYCL_DEPRECATED("interop_task() is deprecated, use host_task() instead")
1973  void interop_task(FuncT Func) {
1974 
1975  MInteropTask.reset(new detail::InteropTask(std::move(Func)));
1976  setType(detail::CG::CodeplayInteropTask);
1977  }
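 
  // Example (illustrative sketch): host_task(), the recommended replacement
  // for the deprecated interop_task(). A queue `Q` and a buffer `Buf` are
  // assumed names; the optional interop_handle parameter gives access to
  // native backend objects:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read>(CGH);
  //     CGH.host_task([=](sycl::interop_handle IH) {
  //       // query native memory/queue objects through IH here
  //     });
  //   });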
1978 
1986  template <typename KernelName = detail::auto_name, typename KernelType,
1987  int Dims>
1988  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
1989  _KERNELFUNCPARAM(KernelFunc)) {
1990  throwIfActionIsCreated();
1991  // Ignore any set kernel bundles and use the one associated with the kernel
1992  setHandlerKernelBundle(Kernel);
1993  using NameT =
1994  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1995  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1996  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1997  (void)Kernel;
1998  (void)NumWorkItems;
1999  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2000 #ifndef __SYCL_DEVICE_ONLY__
2001  detail::checkValueRange<Dims>(NumWorkItems);
2002  MNDRDesc.set(std::move(NumWorkItems));
2003  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2004  setType(detail::CG::Kernel);
2005  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2006  extractArgsAndReqs();
2007  MKernelName = getKernelName();
2008  } else
2009  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2010  std::move(KernelFunc));
2011 #endif
2012  }
2013 
2023  template <typename KernelName = detail::auto_name, typename KernelType,
2024  int Dims>
2025  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2026  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2027  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2028  throwIfActionIsCreated();
2029  // Ignore any set kernel bundles and use the one associated with the kernel
2030  setHandlerKernelBundle(Kernel);
2031  using NameT =
2032  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2033  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2034  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2035  (void)Kernel;
2036  (void)NumWorkItems;
2037  (void)WorkItemOffset;
2038  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2039 #ifndef __SYCL_DEVICE_ONLY__
2040  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2041  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2042  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2043  setType(detail::CG::Kernel);
2044  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2045  extractArgsAndReqs();
2046  MKernelName = getKernelName();
2047  } else
2048  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2049  std::move(KernelFunc));
2050 #endif
2051  }
2052 
2062  template <typename KernelName = detail::auto_name, typename KernelType,
2063  int Dims>
2064  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2065  _KERNELFUNCPARAM(KernelFunc)) {
2066  throwIfActionIsCreated();
2067  // Ignore any set kernel bundles and use the one associated with the kernel
2068  setHandlerKernelBundle(Kernel);
2069  using NameT =
2070  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2071  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2072  using LambdaArgType =
2073  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2074  (void)Kernel;
2075  (void)NDRange;
2076  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2077 #ifndef __SYCL_DEVICE_ONLY__
2078  detail::checkValueRange<Dims>(NDRange);
2079  MNDRDesc.set(std::move(NDRange));
2080  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2081  setType(detail::CG::Kernel);
2082  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2083  extractArgsAndReqs();
2084  MKernelName = getKernelName();
2085  } else
2086  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2087  std::move(KernelFunc));
2088 #endif
2089  }
2090 
2104  template <typename KernelName = detail::auto_name, typename KernelType,
2105  int Dims>
2106  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2107  _KERNELFUNCPARAM(KernelFunc)) {
2108  throwIfActionIsCreated();
2109  // Ignore any set kernel bundles and use the one associated with the kernel
2110  setHandlerKernelBundle(Kernel);
2111  using NameT =
2112  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2113  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2114  using LambdaArgType =
2115  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2116  (void)Kernel;
2117  (void)NumWorkGroups;
2118  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2119 #ifndef __SYCL_DEVICE_ONLY__
2120  detail::checkValueRange<Dims>(NumWorkGroups);
2121  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2122  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2123  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2124  setType(detail::CG::Kernel);
2125 #endif // __SYCL_DEVICE_ONLY__
2126  }
2127 
2143  template <typename KernelName = detail::auto_name, typename KernelType,
2144  int Dims>
2145  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2146  range<Dims> WorkGroupSize,
2147  _KERNELFUNCPARAM(KernelFunc)) {
2148  throwIfActionIsCreated();
2149  // Ignore any set kernel bundles and use the one associated with the kernel
2150  setHandlerKernelBundle(Kernel);
2151  using NameT =
2152  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2153  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2154  using LambdaArgType =
2155  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2156  (void)Kernel;
2157  (void)NumWorkGroups;
2158  (void)WorkGroupSize;
2159  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2160 #ifndef __SYCL_DEVICE_ONLY__
2161  nd_range<Dims> ExecRange =
2162  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2163  detail::checkValueRange<Dims>(ExecRange);
2164  MNDRDesc.set(std::move(ExecRange));
2165  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2166  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2167  setType(detail::CG::Kernel);
2168 #endif // __SYCL_DEVICE_ONLY__
2169  }
2170 
2171  // Clean up KERNELFUNC macro.
2172 #undef _KERNELFUNCPARAM
2173 
2174  // Explicit copy operations API
2175 
2183  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2184  access::target AccessTarget,
2185  access::placeholder IsPlaceholder = access::placeholder::false_t>
2186  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2187  std::shared_ptr<T_Dst> Dst) {
2188  throwIfActionIsCreated();
2189  static_assert(isValidTargetForExplicitOp(AccessTarget),
2190  "Invalid accessor target for the copy method.");
2191  static_assert(isValidModeForSourceAccessor(AccessMode),
2192  "Invalid accessor mode for the copy method.");
2193  // Make sure the data the shared_ptr points to is not released until we
2194  // finish working with it.
2195  MSharedPtrStorage.push_back(Dst);
2196  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2197  copy(Src, RawDstPtr);
2198  }
2199 
2207  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2208  access::target AccessTarget,
2209  access::placeholder IsPlaceholder = access::placeholder::false_t>
2210  void
2211  copy(std::shared_ptr<T_Src> Src,
2212  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2213  throwIfActionIsCreated();
2214  static_assert(isValidTargetForExplicitOp(AccessTarget),
2215  "Invalid accessor target for the copy method.");
2216  static_assert(isValidModeForDestinationAccessor(AccessMode),
2217  "Invalid accessor mode for the copy method.");
2218  // Make sure the data the shared_ptr points to is not released until we
2219  // finish working with it.
2220  MSharedPtrStorage.push_back(Src);
2221  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2222  copy(RawSrcPtr, Dst);
2223  }
2224 
2232  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2233  access::target AccessTarget,
2234  access::placeholder IsPlaceholder = access::placeholder::false_t>
2235  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2236  T_Dst *Dst) {
2237  throwIfActionIsCreated();
2238  static_assert(isValidTargetForExplicitOp(AccessTarget),
2239  "Invalid accessor target for the copy method.");
2240  static_assert(isValidModeForSourceAccessor(AccessMode),
2241  "Invalid accessor mode for the copy method.");
2242 #ifndef __SYCL_DEVICE_ONLY__
2243  if (MIsHost) {
2244  // TODO: Temporary implementation for host. Should be handled by memory
2245  // manager.
2246  copyAccToPtrHost(Src, Dst);
2247  return;
2248  }
2249 #endif
2250  setType(detail::CG::CopyAccToPtr);
2251 
2252  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2253  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2254 
2255  MRequirements.push_back(AccImpl.get());
2256  MSrcPtr = static_cast<void *>(AccImpl.get());
2257  MDstPtr = static_cast<void *>(Dst);
2258  // Store a copy of the accessor in local storage to make sure it stays
2259  // alive until we finish.
2260  MAccStorage.push_back(std::move(AccImpl));
2261  }
2262 
2270  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2271  access::target AccessTarget,
2272  access::placeholder IsPlaceholder = access::placeholder::false_t>
2273  void
2274  copy(const T_Src *Src,
2275  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2276  throwIfActionIsCreated();
2277  static_assert(isValidTargetForExplicitOp(AccessTarget),
2278  "Invalid accessor target for the copy method.");
2279  static_assert(isValidModeForDestinationAccessor(AccessMode),
2280  "Invalid accessor mode for the copy method.");
2281 #ifndef __SYCL_DEVICE_ONLY__
2282  if (MIsHost) {
2283  // TODO: Temporary implementation for host. Should be handled by memory
2284  // manager.
2285  copyPtrToAccHost(Src, Dst);
2286  return;
2287  }
2288 #endif
2289  setType(detail::CG::CopyPtrToAcc);
2290 
2291  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2292  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2293 
2294  MRequirements.push_back(AccImpl.get());
2295  MSrcPtr = const_cast<T_Src *>(Src);
2296  MDstPtr = static_cast<void *>(AccImpl.get());
2297  // Store a copy of the accessor in local storage to make sure it stays
2298  // alive until we finish.
2299  MAccStorage.push_back(std::move(AccImpl));
2300  }
2301 
2309  template <
2310  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2311  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2312  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2313  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2314  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2315  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2316  IsPlaceholder_Src>
2317  Src,
2318  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2319  IsPlaceholder_Dst>
2320  Dst) {
2321  throwIfActionIsCreated();
2322  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2323  "Invalid source accessor target for the copy method.");
2324  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2325  "Invalid destination accessor target for the copy method.");
2326  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2327  "Invalid source accessor mode for the copy method.");
2328  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2329  "Invalid destination accessor mode for the copy method.");
2330  if (Dst.get_size() < Src.get_size())
2331  throw sycl::invalid_object_error(
2332  "The destination accessor size is too small to copy the memory into.",
2333  PI_ERROR_INVALID_OPERATION);
2334 
2335  if (copyAccToAccHelper(Src, Dst))
2336  return;
2337  setType(detail::CG::CopyAccToAcc);
2338 
2339  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2340  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2341 
2342  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2343  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2344 
2345  MRequirements.push_back(AccImplSrc.get());
2346  MRequirements.push_back(AccImplDst.get());
2347  MSrcPtr = AccImplSrc.get();
2348  MDstPtr = AccImplDst.get();
2349  // Store copies of the accessors in local storage to make sure they stay
2350  // alive until we finish.
2351  MAccStorage.push_back(std::move(AccImplSrc));
2352  MAccStorage.push_back(std::move(AccImplDst));
2353  }
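 
  // Example (illustrative sketch): the three explicit copy directions
  // defined above, assuming a queue `Q`, buffers `BufA`/`BufB`, and a host
  // vector `Host` of sufficient size:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Src = BufA.get_access<sycl::access::mode::read>(CGH);
  //     CGH.copy(Src, Host.data()); // accessor -> host pointer
  //   });
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Dst = BufB.get_access<sycl::access::mode::write>(CGH);
  //     CGH.copy(Host.data(), Dst); // host pointer -> accessor
  //   });
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Src = BufA.get_access<sycl::access::mode::read>(CGH);
  //     auto Dst = BufB.get_access<sycl::access::mode::write>(CGH);
  //     CGH.copy(Src, Dst); // accessor -> accessor
  //   });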
2354 
2359  template <typename T, int Dims, access::mode AccessMode,
2360  access::target AccessTarget,
2361  access::placeholder IsPlaceholder = access::placeholder::false_t>
2362  void
2363  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2364  throwIfActionIsCreated();
2365  static_assert(isValidTargetForExplicitOp(AccessTarget),
2366  "Invalid accessor target for the update_host method.");
2367  setType(detail::CG::UpdateHost);
2368 
2369  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2370  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2371 
2372  MDstPtr = static_cast<void *>(AccImpl.get());
2373  MRequirements.push_back(AccImpl.get());
2374  MAccStorage.push_back(std::move(AccImpl));
2375  }
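 
  // Example (illustrative sketch): requesting that a buffer's contents be
  // updated on the host after the command group completes. `Q` and `Buf`
  // are assumed names:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read>(CGH);
  //     CGH.update_host(Acc);
  //   }).wait();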
2376 
2385  template <typename T, int Dims, access::mode AccessMode,
2386  access::target AccessTarget,
2387  access::placeholder IsPlaceholder = access::placeholder::false_t,
2388  typename PropertyListT = property_list>
2389  void
2390  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2391  Dst,
2392  const T &Pattern) {
2393  throwIfActionIsCreated();
2394  // TODO: add check: T must be an integral scalar value or a SYCL vector type.
2395  static_assert(isValidTargetForExplicitOp(AccessTarget),
2396  "Invalid accessor target for the fill method.");
2397  if (!MIsHost && (((Dims == 1) && isConstOrGlobal(AccessTarget)) ||
2398  isImageOrImageArray(AccessTarget))) {
2399  setType(detail::CG::Fill);
2400 
2401  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2402  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2403 
2404  MDstPtr = static_cast<void *>(AccImpl.get());
2405  MRequirements.push_back(AccImpl.get());
2406  MAccStorage.push_back(std::move(AccImpl));
2407 
2408  MPattern.resize(sizeof(T));
2409  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
2410  *PatternPtr = Pattern;
2411  } else {
2412 
2413  // TODO: Temporary implementation for host. Should be handled by memory
2414  // manager.
2415  range<Dims> Range = Dst.get_range();
2416  parallel_for<
2417  class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2418  Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
2419  }
2420  }
2421 
2428  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2429  throwIfActionIsCreated();
2430  static_assert(std::is_trivially_copyable<T>::value,
2431  "Pattern must be trivially copyable");
2432  parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2433  T *CastedPtr = static_cast<T *>(Ptr);
2434  CastedPtr[Index] = Pattern;
2435  });
2436  }
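 
  // Example (illustrative sketch): both fill() flavors above, assuming a
  // queue `Q`, a buffer `Buf`, and an element count `N`:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Dst = Buf.get_access<sycl::access::mode::write>(CGH);
  //     CGH.fill(Dst, 0); // fill a buffer through an accessor
  //   });
  //   int *Ptr = sycl::malloc_device<int>(N, Q);
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.fill(Ptr, 42, N); // fill USM memory
  //   });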
2437 
2441  void ext_oneapi_barrier() {
2442  throwIfActionIsCreated();
2443  setType(detail::CG::Barrier);
2444  }
2445 
2449  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2450  void barrier() { ext_oneapi_barrier(); }
2451 
2458  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2459 
2466  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2467  void barrier(const std::vector<event> &WaitList);
2468 
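 
  // Example (illustrative sketch): an event barrier that waits only for a
  // previously obtained event `E` (assumed), using the wait-list overload
  // declared above:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.ext_oneapi_barrier({E});
  //   });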
2478  void memcpy(void *Dest, const void *Src, size_t Count);
2479 
2489  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2490  this->memcpy(Dest, Src, Count * sizeof(T));
2491  }
2492 
2501  void memset(void *Dest, int Value, size_t Count);
2502 
2509  void prefetch(const void *Ptr, size_t Count);
2510 
2517  void mem_advise(const void *Ptr, size_t Length, int Advice);
2518 
2519 private:
2520  std::shared_ptr<detail::handler_impl> MImpl;
2521  std::shared_ptr<detail::queue_impl> MQueue;
2526  std::vector<std::vector<char>> MArgsStorage;
2527  std::vector<detail::AccessorImplPtr> MAccStorage;
2528  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
2529  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
2530  mutable std::vector<std::shared_ptr<const void>> MSharedPtrStorage;
2532  std::vector<detail::ArgDesc> MArgs;
2536  std::vector<detail::ArgDesc> MAssociatedAccesors;
2538  std::vector<detail::AccessorImplHost *> MRequirements;
2540  detail::NDRDescT MNDRDesc;
2541  std::string MKernelName;
2543  std::shared_ptr<detail::kernel_impl> MKernel;
2547  detail::CG::CGTYPE MCGType = detail::CG::None;
2549  void *MSrcPtr = nullptr;
2551  void *MDstPtr = nullptr;
2553  size_t MLength = 0;
2555  std::vector<char> MPattern;
2557  std::unique_ptr<detail::HostKernelBase> MHostKernel;
2559  std::unique_ptr<detail::HostTask> MHostTask;
2560  detail::OSModuleHandle MOSModuleHandle = detail::OSUtil::ExeModuleHandle;
2561  // Storage for a lambda or function when using InteropTasks
2562  std::unique_ptr<detail::InteropTask> MInteropTask;
2564  std::vector<detail::EventImplPtr> MEvents;
2567  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
2568 
2569  bool MIsHost = false;
2570 
2571  detail::code_location MCodeLoc = {};
2572  bool MIsFinalized = false;
2573  event MLastEvent;
2574 
2575  // Make queue_impl class friend to be able to call finalize method.
2576  friend class detail::queue_impl;
2577  // Make accessor class friend to keep the list of associated accessors.
2578  template <typename DataT, int Dims, access::mode AccMode,
2579  access::target AccTarget, access::placeholder isPlaceholder,
2580  typename PropertyListT>
2581  friend class accessor;
2582  friend class detail::AccessorBaseHost;
2583 
2584  template <typename DataT, int Dimensions, access::mode AccessMode,
2585  access::target AccessTarget, access::placeholder IsPlaceholder>
2586  friend class detail::image_accessor;
2587  // Make stream class friend to be able to keep the list of associated streams
2588  friend class stream;
2589  friend class detail::stream_impl;
2590  // Make reduction friends to store buffers and arrays created for it
2591  // in handler from reduction methods.
2592  template <typename T, class BinaryOperation, int Dims, size_t Extent,
2593  typename RedOutVar>
2594  friend class detail::reduction_impl_algo;
2595 
2596 #ifndef __SYCL_DEVICE_ONLY__
2597  friend void detail::associateWithHandler(handler &,
2598  detail::AccessorBaseHost *,
2599  access::target);
2600 #endif
2601 
2602  friend class ::MockHandler;
2603  friend class detail::queue_impl;
2604 
2605  bool DisableRangeRounding();
2606 
2607  bool RangeRoundingTrace();
2608 
2609  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
2610  size_t &MinRange);
2611 
2612  template <typename WrapperT, typename TransformedArgType, int Dims,
2613  typename KernelType,
2614  detail::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
2615  KernelType, TransformedArgType>::value> * = nullptr>
2616  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2617  range<Dims> NumWorkItems) {
2618  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
2619  KernelType>(NumWorkItems,
2620  KernelFunc);
2621  }
2622 
2623  template <typename WrapperT, typename TransformedArgType, int Dims,
2624  typename KernelType,
2625  detail::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
2626  KernelType, TransformedArgType>::value> * = nullptr>
2627  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2628  range<Dims> NumWorkItems) {
2629  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>(
2630  NumWorkItems, KernelFunc);
2631  }
2632 };
2633 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
2634 } // namespace sycl