DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.hpp
1 //==-------- handler.hpp --- SYCL command group handler --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
12 #include <CL/sycl/accessor.hpp>
13 #include <CL/sycl/context.hpp>
14 #include <CL/sycl/detail/cg.hpp>
16 #include <CL/sycl/detail/cl.h>
20 #include <CL/sycl/event.hpp>
21 #include <CL/sycl/id.hpp>
23 #include <CL/sycl/item.hpp>
24 #include <CL/sycl/kernel.hpp>
27 #include <CL/sycl/nd_item.hpp>
28 #include <CL/sycl/nd_range.hpp>
30 #include <CL/sycl/sampler.hpp>
31 #include <CL/sycl/stl.hpp>
32 
33 #include <functional>
34 #include <limits>
35 #include <memory>
36 #include <tuple>
37 #include <type_traits>
38 
39 // SYCL_LANGUAGE_VERSION is a 4-digit year followed by a 2-digit revision.
40 #if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
41 #define __SYCL_NONCONST_FUNCTOR__
42 #endif
43 
44 template <typename DataT, int Dimensions, cl::sycl::access::mode AccessMode,
45  cl::sycl::access::target AccessTarget,
46  cl::sycl::access::placeholder IsPlaceholder>
47 class __fill;
48 
49 template <typename T> class __usmfill;
50 
51 template <typename T_Src, typename T_Dst, int Dims,
52  cl::sycl::access::mode AccessMode,
53  cl::sycl::access::target AccessTarget,
54  cl::sycl::access::placeholder IsPlaceholder>
55 class __copyAcc2Ptr;
56 
57 template <typename T_Src, typename T_Dst, int Dims,
58  cl::sycl::access::mode AccessMode,
59  cl::sycl::access::target AccessTarget,
60  cl::sycl::access::placeholder IsPlaceholder>
61 class __copyPtr2Acc;
62 
63 template <typename T_Src, int Dims_Src, cl::sycl::access::mode AccessMode_Src,
64  cl::sycl::access::target AccessTarget_Src, typename T_Dst,
65  int Dims_Dst, cl::sycl::access::mode AccessMode_Dst,
66  cl::sycl::access::target AccessTarget_Dst,
67  cl::sycl::access::placeholder IsPlaceholder_Src,
68  cl::sycl::access::placeholder IsPlaceholder_Dst>
69 class __copyAcc2Acc;
70 
71 // For unit testing purposes
72 class MockHandler;
73 
74 __SYCL_INLINE_NAMESPACE(cl) {
75 namespace sycl {
76 
77 // Forward declaration
78 
79 class handler;
80 template <typename T, int Dimensions, typename AllocatorT, typename Enable>
81 class buffer;
82 namespace detail {
83 
84 class handler_impl;
85 class kernel_impl;
86 class queue_impl;
87 class stream_impl;
88 template <typename DataT, int Dimensions, access::mode AccessMode,
89  access::target AccessTarget, access::placeholder IsPlaceholder>
90 class image_accessor;
91 template <typename RetType, typename Func, typename Arg>
92 static Arg member_ptr_helper(RetType (Func::*)(Arg) const);
93 
94 // Non-const version of the above template to match functors whose 'operator()'
95 // is declared w/o the 'const' qualifier.
96 template <typename RetType, typename Func, typename Arg>
97 static Arg member_ptr_helper(RetType (Func::*)(Arg));
98 
99 // template <typename RetType, typename Func>
100 // static void member_ptr_helper(RetType (Func::*)() const);
101 
102 // template <typename RetType, typename Func>
103 // static void member_ptr_helper(RetType (Func::*)());
104 
105 template <typename F, typename SuggestedArgType>
106 decltype(member_ptr_helper(&F::operator())) argument_helper(int);
107 
108 template <typename F, typename SuggestedArgType>
109 SuggestedArgType argument_helper(...);
110 
111 template <typename F, typename SuggestedArgType>
112 using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
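// Illustrative sketch (not part of this header): for a functor with a single,
// non-templated 'operator()', lambda_arg_type yields that operator's argument
// type; for a generic lambda, '&F::operator()' does not resolve, so the
// variadic argument_helper overload is selected and SuggestedArgType is used:
//
//   struct Functor {
//     void operator()(item<1>) const {}
//   };
//   static_assert(std::is_same<lambda_arg_type<Functor, id<1>>, item<1>>::value,
//                 "deduced from operator()");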
113 
114 // Used when a parallel_for range is rounded up.
115 template <typename Name> class __pf_kernel_wrapper;
116 
117 template <typename Type> struct get_kernel_wrapper_name_t {
118   using name = __pf_kernel_wrapper<Type>;
119 };
120 
121 __SYCL_EXPORT device getDeviceFromHandler(handler &);
122 
123 #if __SYCL_ID_QUERIES_FIT_IN_INT__
124 template <typename T> struct NotIntMsg;
125 
126 template <int Dims> struct NotIntMsg<range<Dims>> {
127  constexpr static const char *Msg =
128  "Provided range is out of integer limits. Pass "
129  "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
130 };
131 
132 template <int Dims> struct NotIntMsg<id<Dims>> {
133  constexpr static const char *Msg =
134  "Provided offset is out of integer limits. Pass "
135  "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
136 };
137 #endif
138 
139 #if __SYCL_ID_QUERIES_FIT_IN_INT__
140 template <typename T, typename ValT>
141 typename detail::enable_if_t<std::is_same<ValT, size_t>::value ||
142  std::is_same<ValT, unsigned long long>::value>
143 checkValueRangeImpl(ValT V) {
144  static constexpr size_t Limit =
145  static_cast<size_t>((std::numeric_limits<int>::max)());
146  if (V > Limit)
147  throw runtime_error(NotIntMsg<T>::Msg, PI_ERROR_INVALID_VALUE);
148 }
149 #endif
150 
151 template <int Dims, typename T>
152 typename detail::enable_if_t<std::is_same<T, range<Dims>>::value ||
153  std::is_same<T, id<Dims>>::value>
154 checkValueRange(const T &V) {
155 #if __SYCL_ID_QUERIES_FIT_IN_INT__
156  for (size_t Dim = 0; Dim < Dims; ++Dim)
157  checkValueRangeImpl<T>(V[Dim]);
158 
159  {
160  unsigned long long Product = 1;
161  for (size_t Dim = 0; Dim < Dims; ++Dim) {
162  Product *= V[Dim];
163       // Check the running product now to prevent it from overflowing later.
164  checkValueRangeImpl<T>(Product);
165  }
166  }
167 #else
168  (void)V;
169 #endif
170 }
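// Worked example (illustrative): with __SYCL_ID_QUERIES_FIT_IN_INT__ set,
//
//   checkValueRange<2>(range<2>{70000, 70000});
//
// does not throw on either component (70000 < INT_MAX), but throws on the
// running product, since 70000ULL * 70000 == 4'900'000'000 > INT_MAX.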
171 
172 template <int Dims>
173 void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
174 #if __SYCL_ID_QUERIES_FIT_IN_INT__
175  checkValueRange<Dims>(R);
176  checkValueRange<Dims>(O);
177 
178  for (size_t Dim = 0; Dim < Dims; ++Dim) {
179  unsigned long long Sum = R[Dim] + O[Dim];
180 
181  checkValueRangeImpl<range<Dims>>(Sum);
182  }
183 #else
184  (void)R;
185  (void)O;
186 #endif
187 }
188 
189 template <int Dims, typename T>
190 typename detail::enable_if_t<std::is_same<T, nd_range<Dims>>::value>
191 checkValueRange(const T &V) {
192 #if __SYCL_ID_QUERIES_FIT_IN_INT__
193  checkValueRange<Dims>(V.get_global_range());
194  checkValueRange<Dims>(V.get_local_range());
195  checkValueRange<Dims>(V.get_offset());
196 
197  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
198 #else
199  (void)V;
200 #endif
201 }
202 
203 template <typename TransformedArgType, int Dims, typename KernelType>
204 class RoundedRangeKernel {
205 public:
206  RoundedRangeKernel(range<Dims> NumWorkItems, KernelType KernelFunc)
207  : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}
208 
209  void operator()(TransformedArgType Arg) const {
210  if (Arg[0] >= NumWorkItems[0])
211  return;
212  Arg.set_allowed_range(NumWorkItems);
213  KernelFunc(Arg);
214  }
215 
216 private:
217  range<Dims> NumWorkItems;
218  KernelType KernelFunc;
219 };
220 
221 template <typename TransformedArgType, int Dims, typename KernelType>
222 class RoundedRangeKernelWithKH {
223 public:
224   RoundedRangeKernelWithKH(range<Dims> NumWorkItems, KernelType KernelFunc)
225       : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}
226 
227  void operator()(TransformedArgType Arg, kernel_handler KH) const {
228  if (Arg[0] >= NumWorkItems[0])
229  return;
230  Arg.set_allowed_range(NumWorkItems);
231  KernelFunc(Arg, KH);
232  }
233 
234 private:
235  range<Dims> NumWorkItems;
236  KernelType KernelFunc;
237 };
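// Illustrative sketch (not part of this header): when a 1000-item range is
// rounded up to 1024, the launched kernel is one of the wrappers above, e.g.
//
//   RoundedRangeKernel<item<1>, 1, decltype(UserLambda)> Wrapped{
//       range<1>{1000}, UserLambda};
//
// Items 1000..1023 return early; the remaining items see get_range() == 1000
// through set_allowed_range(). 'UserLambda' is an assumed user kernel functor.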
238 
239 } // namespace detail
240 
241 namespace ext {
242 namespace oneapi {
243 namespace detail {
244 template <typename T, class BinaryOperation, int Dims, size_t Extent,
245           class Algorithm>
246 class reduction_impl;
247 
248 using cl::sycl::detail::enable_if_t;
249 using cl::sycl::detail::queue_impl;
250 
251 template <typename KernelName, typename KernelType, int Dims, class Reduction>
252 void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
253  size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
254  Reduction &Redu);
255 
256 template <typename KernelName, typename KernelType, int Dims, class Reduction>
257 enable_if_t<Reduction::has_atomic_add_float64>
258 reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
259  const nd_range<Dims> &Range, Reduction &Redu);
260 
261 template <typename KernelName, typename KernelType, int Dims, class Reduction>
262 enable_if_t<Reduction::has_fast_atomics>
263 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
264  Reduction &Redu);
265 
266 template <typename KernelName, typename KernelType, int Dims, class Reduction>
267 enable_if_t<!Reduction::has_fast_atomics>
268 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
269  Reduction &Redu);
270 
271 template <typename KernelName, typename KernelType, class Reduction>
272 enable_if_t<!Reduction::has_fast_atomics, size_t>
273 reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
274  Reduction &Redu);
275 
276 template <typename KernelName, typename KernelType, int Dims,
277  typename... Reductions, size_t... Is>
278 void reduCGFunc(handler &CGH, KernelType KernelFunc,
279  const nd_range<Dims> &Range,
280  std::tuple<Reductions...> &ReduTuple,
281  std::index_sequence<Is...>);
282 
283 template <typename KernelName, typename KernelType, typename... Reductions,
284  size_t... Is>
285 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
286  std::tuple<Reductions...> &ReduTuple,
287  std::index_sequence<Is...>);
288 
289 template <typename KernelName, class Reduction>
290 std::enable_if_t<!Reduction::is_usm>
291 reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);
292 
293 template <typename KernelName, class Reduction>
294 std::enable_if_t<Reduction::is_usm>
295 reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);
296 
297 template <typename... Reduction, size_t... Is>
298 std::shared_ptr<event>
299 reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
300  bool IsHost, std::tuple<Reduction...> &ReduTuple,
301  std::index_sequence<Is...>);
302 
303 template <typename Reduction, typename... RestT>
304 std::enable_if_t<!Reduction::is_usm>
305 reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
306  std::shared_ptr<detail::queue_impl> Queue,
307  bool IsHost, Reduction &Redu, RestT... Rest);
308 
309 __SYCL_EXPORT uint32_t
310 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
311 
312 __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
313  size_t LocalMemBytesPerWorkItem);
314 
315 template <typename... ReductionT, size_t... Is>
316 size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
317  std::index_sequence<Is...>);
318 
319 template <typename TupleT, std::size_t... Is>
320 std::tuple<std::tuple_element_t<Is, TupleT>...>
321 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);
322 
323 template <typename FirstT, typename... RestT> struct AreAllButLastReductions;
324 
325 } // namespace detail
326 } // namespace oneapi
327 } // namespace ext
328 
362 class __SYCL_EXPORT handler {
363 private:
368  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);
369 
379  handler(std::shared_ptr<detail::queue_impl> Queue,
380  std::shared_ptr<detail::queue_impl> PrimaryQueue,
381  std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);
382 
384   template <typename T, typename F = typename detail::remove_const_t<
385                             detail::remove_reference_t<T>>>
386   F *storePlainArg(T &&Arg) {
387  MArgsStorage.emplace_back(sizeof(T));
388  auto Storage = reinterpret_cast<F *>(MArgsStorage.back().data());
389  *Storage = Arg;
390  return Storage;
391  }
392 
393  void setType(detail::CG::CGTYPE Type) {
394  constexpr detail::CG::CG_VERSION Version = detail::CG::CG_VERSION::V1;
395  MCGType = static_cast<detail::CG::CGTYPE>(
396  getVersionedCGType(Type, static_cast<int>(Version)));
397  }
398 
399  detail::CG::CGTYPE getType() {
400  return static_cast<detail::CG::CGTYPE>(getUnversionedCGType(MCGType));
401  }
402 
403  void throwIfActionIsCreated() {
404  if (detail::CG::None != getType())
405  throw sycl::runtime_error("Attempt to set multiple actions for the "
406  "command group. Command group must consist of "
407  "a single kernel or explicit memory operation.",
408  PI_ERROR_INVALID_OPERATION);
409  }
410 
414  void
415  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
416  const detail::kernel_param_desc_t *KernelArgs);
417 
420  void
421  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
422  const detail::kernel_param_desc_t *KernelArgs,
423  bool IsESIMD);
424 
426  void extractArgsAndReqs();
427 
429  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
430  const int Size, const size_t Index, size_t &IndexShift,
431  bool IsKernelCreatedFromSource);
432 
433  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
434  const int Size, const size_t Index, size_t &IndexShift,
435  bool IsKernelCreatedFromSource, bool IsESIMD);
436 
438  std::string getKernelName();
439 
440  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
441     // TODO It is unclear whether the kernel and the lambda/functor must have
442     // equal names when parallel_for is used with a sycl::kernel and a
443     // lambda/functor together. For now, if they are equal, we extract the
444     // arguments from the lambda/functor for the kernel. Otherwise, set_arg(s)
445     // must be used to resolve the order and values of the kernel arguments.
446  assert(MKernel && "MKernel is not initialized");
447  const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
448  const std::string KernelName = getKernelName();
449  return LambdaName == KernelName;
450  }
451 
454  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
455 
462  event finalize();
463 
469  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
470  MStreamStorage.push_back(Stream);
471  }
472 
478  void addReduction(const std::shared_ptr<const void> &ReduObj);
479 
480  ~handler() = default;
481 
482  bool is_host() { return MIsHost; }
483 
484   void associateWithHandler(detail::AccessorBaseHost *AccBase,
485                             access::target AccTarget);
486 
487  // Recursively calls itself until arguments pack is fully processed.
488  // The version for regular(standard layout) argument.
489  template <typename T, typename... Ts>
490  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... Args) {
491  set_arg(ArgIndex, std::move(Arg));
492  setArgsHelper(++ArgIndex, std::move(Args)...);
493  }
494 
495  void setArgsHelper(int) {}
496 
497  // setArgHelper for local accessor argument.
498  template <typename DataT, int Dims, access::mode AccessMode,
499  access::placeholder IsPlaceholder>
500  void setArgHelper(int ArgIndex,
501  accessor<DataT, Dims, AccessMode, access::target::local,
502  IsPlaceholder> &&Arg) {
503     detail::LocalAccessorBaseHost *LocalAccBase =
504         (detail::LocalAccessorBaseHost *)&Arg;
505     detail::LocalAccessorImplPtr LocalAccImpl =
506  detail::getSyclObjImpl(*LocalAccBase);
507  detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
508  MLocalAccStorage.push_back(std::move(LocalAccImpl));
509  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
510  static_cast<int>(access::target::local), ArgIndex);
511  }
512 
513  // setArgHelper for non local accessor argument.
514  template <typename DataT, int Dims, access::mode AccessMode,
515             access::target AccessTarget, access::placeholder IsPlaceholder>
516   typename detail::enable_if_t<AccessTarget != access::target::local, void>
517   setArgHelper(
518       int ArgIndex,
519       accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
520     detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
521     detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
522     detail::Requirement *Req = AccImpl.get();
523  // Add accessor to the list of requirements.
524  MRequirements.push_back(Req);
525  // Store copy of the accessor.
526  MAccStorage.push_back(std::move(AccImpl));
527  // Add accessor to the list of arguments.
528  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
529  static_cast<int>(AccessTarget), ArgIndex);
530  }
531 
532  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
533  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
534 
535  if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
536  MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
537  sizeof(T), ArgIndex);
538  } else {
539  MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
540  StoredArg, sizeof(T), ArgIndex);
541  }
542  }
543 
544  void setArgHelper(int ArgIndex, sampler &&Arg) {
545  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
546  MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
547  sizeof(sampler), ArgIndex);
548  }
549 
550  void verifyKernelInvoc(const kernel &Kernel) {
551  if (is_host()) {
552  throw invalid_object_error(
553  "This kernel invocation method cannot be used on the host",
554  PI_ERROR_INVALID_DEVICE);
555  }
556  if (Kernel.is_host()) {
557  throw invalid_object_error("Invalid kernel type, OpenCL expected",
558  PI_ERROR_INVALID_KERNEL);
559  }
560  }
561 
562   /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
563   * its argument. Since esimd plugin directly invokes the kernel (doesn't use
564   * piKernelSetArg), the kernel argument type must be known to the plugin.
565   * However, passing kernel argument type to the plugin requires changing ABI
566   * in HostKernel class. To overcome this problem, helpers below wrap the
567   * "original" kernel with a functor that always takes an nd_item as argument.
568   * A functor is used instead of a lambda because extractArgsAndReqsFromLambda
569   * needs access to the "original" kernel and keeps references to its internal
570   * data, i.e. the kernel passed as argument cannot be local in scope. The
571   * functor itself is again encapsulated in a std::function since functor's
572   * type is unknown to the plugin.
573   */
574 
575  // For 'id, item w/wo offset, nd_item' kernel arguments
576  template <class KernelType, class NormalizedKernelType, int Dims>
577  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
578  NormalizedKernelType NormalizedKernel(KernelFunc);
579  auto NormalizedKernelFunc =
580  std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
581  auto HostKernelPtr =
582  new detail::HostKernel<decltype(NormalizedKernelFunc),
583  sycl::nd_item<Dims>, Dims>(NormalizedKernelFunc);
584  MHostKernel.reset(HostKernelPtr);
585  return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
586  ->MKernelFunc;
587  }
588 
589  // For 'sycl::id<Dims>' kernel argument
590  template <class KernelType, typename ArgT, int Dims>
591  typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value,
592  KernelType *>::type
593  ResetHostKernel(const KernelType &KernelFunc) {
594  struct NormalizedKernelType {
595  KernelType MKernelFunc;
596  NormalizedKernelType(const KernelType &KernelFunc)
597  : MKernelFunc(KernelFunc) {}
598  void operator()(const nd_item<Dims> &Arg) {
599  detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
600  }
601  };
602  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
603  KernelFunc);
604  }
605 
606  // For 'sycl::nd_item<Dims>' kernel argument
607  template <class KernelType, typename ArgT, int Dims>
608  typename std::enable_if<std::is_same<ArgT, sycl::nd_item<Dims>>::value,
609  KernelType *>::type
610  ResetHostKernel(const KernelType &KernelFunc) {
611  struct NormalizedKernelType {
612  KernelType MKernelFunc;
613  NormalizedKernelType(const KernelType &KernelFunc)
614  : MKernelFunc(KernelFunc) {}
615  void operator()(const nd_item<Dims> &Arg) {
616  detail::runKernelWithArg(MKernelFunc, Arg);
617  }
618  };
619  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
620  KernelFunc);
621  }
622 
623  // For 'sycl::item<Dims, without_offset>' kernel argument
624  template <class KernelType, typename ArgT, int Dims>
625  typename std::enable_if<std::is_same<ArgT, sycl::item<Dims, false>>::value,
626  KernelType *>::type
627  ResetHostKernel(const KernelType &KernelFunc) {
628  struct NormalizedKernelType {
629  KernelType MKernelFunc;
630  NormalizedKernelType(const KernelType &KernelFunc)
631  : MKernelFunc(KernelFunc) {}
632  void operator()(const nd_item<Dims> &Arg) {
633  sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
634  Arg.get_global_range(), Arg.get_global_id());
635  detail::runKernelWithArg(MKernelFunc, Item);
636  }
637  };
638  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
639  KernelFunc);
640  }
641 
642  // For 'sycl::item<Dims, with_offset>' kernel argument
643  template <class KernelType, typename ArgT, int Dims>
644  typename std::enable_if<std::is_same<ArgT, sycl::item<Dims, true>>::value,
645  KernelType *>::type
646  ResetHostKernel(const KernelType &KernelFunc) {
647  struct NormalizedKernelType {
648  KernelType MKernelFunc;
649  NormalizedKernelType(const KernelType &KernelFunc)
650  : MKernelFunc(KernelFunc) {}
651  void operator()(const nd_item<Dims> &Arg) {
652  sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
653  Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
654  detail::runKernelWithArg(MKernelFunc, Item);
655  }
656  };
657  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
658  KernelFunc);
659  }
660 
661  // For 'void' kernel argument (single_task)
662  template <class KernelType, typename ArgT, int Dims>
663  typename std::enable_if_t<std::is_same<ArgT, void>::value, KernelType *>
664  ResetHostKernel(const KernelType &KernelFunc) {
665  struct NormalizedKernelType {
666  KernelType MKernelFunc;
667  NormalizedKernelType(const KernelType &KernelFunc)
668  : MKernelFunc(KernelFunc) {}
669  void operator()(const nd_item<Dims> &Arg) {
670  (void)Arg;
671  detail::runKernelWithoutArg(MKernelFunc);
672  }
673  };
674  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
675  KernelFunc);
676  }
677 
678  // For 'sycl::group<Dims>' kernel argument
679  // 'wrapper'-based approach using 'NormalizedKernelType' struct is not used
680  // for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is not
681  // supported in ESIMD.
682  template <class KernelType, typename ArgT, int Dims>
683  typename std::enable_if<std::is_same<ArgT, sycl::group<Dims>>::value,
684  KernelType *>::type
685  ResetHostKernel(const KernelType &KernelFunc) {
686     MHostKernel.reset(
687         new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
688     return (KernelType *)(MHostKernel->getPtr());
689  }
690 
698  void verifyUsedKernelBundle(const std::string &KernelName);
699 
706  template <typename KernelName, typename KernelType, int Dims,
707  typename LambdaArgType>
708   void StoreLambda(KernelType KernelFunc) {
709     using KI = detail::KernelInfo<KernelName>;
710 
711     constexpr bool IsCallableWithKernelHandler =
712         detail::KernelLambdaHasKernelHandlerArgT<KernelType,
713                                                  LambdaArgType>::value;
714 
715  if (IsCallableWithKernelHandler && MIsHost) {
716  throw cl::sycl::feature_not_supported(
717  "kernel_handler is not yet supported by host device.",
718  PI_ERROR_INVALID_OPERATION);
719  }
720  KernelType *KernelPtr =
721  ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);
722 
724  // Empty name indicates that the compilation happens without integration
725  // header, so don't perform things that require it.
726  if (KI::getName() != nullptr && KI::getName()[0] != '\0') {
727  // TODO support ESIMD in no-integration-header case too.
728  MArgs.clear();
729  extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
730  KI::getNumParams(), &KI::getParamDesc(0),
731  KI::isESIMD());
732  MKernelName = KI::getName();
733  MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName());
734  } else {
735       // Without the integration header it is necessary to process the
736       // accessors from the list (which are associated with this handler) as
737       // the kernel arguments.
738  MArgs = std::move(MAssociatedAccesors);
739  }
740 
741  // If the kernel lambda is callable with a kernel_handler argument, manifest
742  // the associated kernel handler.
743  if (IsCallableWithKernelHandler) {
744  getOrInsertHandlerKernelBundle(/*Insert=*/true);
745  }
746  }
747 
752  template <int Dims_Src, int Dims_Dst>
753  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
754  const range<Dims_Dst> Dst) {
755  if (Dims_Src > Dims_Dst)
756  return false;
757  for (size_t I = 0; I < Dims_Src; ++I)
758  if (Src[I] > Dst[I])
759  return false;
760  return true;
761  }
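  // For example (illustrative): a {4, 4} source region fits into an {8, 8}
  // destination, so the rectangular copy path can be used; a {4, 16} source
  // does not fit into {8, 8} (16 > 8), and neither does a 3-D source going to
  // a 2-D destination, so those cases fall back to a linearized copy kernel.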
762 
763   // TODO: Delete these functions when ABI-breaking changes are allowed.
764   // These functions are currently unused, but they are static members of
765   // the exported class 'handler', made their way into the SYCL library some
766   // time ago, and must stay there for a while.
767  static id<1> getDelinearizedIndex(const range<1> Range, const size_t Index) {
768  return detail::getDelinearizedId(Range, Index);
769  }
770  static id<2> getDelinearizedIndex(const range<2> Range, const size_t Index) {
771  return detail::getDelinearizedId(Range, Index);
772  }
773  static id<3> getDelinearizedIndex(const range<3> Range, const size_t Index) {
774  return detail::getDelinearizedId(Range, Index);
775  }
776 
782  template <typename TSrc, int DimSrc, access::mode ModeSrc,
783  access::target TargetSrc, typename TDst, int DimDst,
784  access::mode ModeDst, access::target TargetDst,
785  access::placeholder IsPHSrc, access::placeholder IsPHDst>
786   detail::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
787   copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
788                      accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
789     if (!MIsHost &&
790  IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
791  return false;
792 
793  range<1> LinearizedRange(Src.size());
794  parallel_for<class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc,
795  TDst, DimDst, ModeDst, TargetDst,
796  IsPHSrc, IsPHDst>>
797  (LinearizedRange, [=](id<1> Id) {
798  size_t Index = Id[0];
799  id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
800  id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
801  Dst[DstId] = Src[SrcId];
802  });
803  return true;
804  }
805 
813  template <typename TSrc, int DimSrc, access::mode ModeSrc,
814  access::target TargetSrc, typename TDst, int DimDst,
815  access::mode ModeDst, access::target TargetDst,
816             access::placeholder IsPHSrc, access::placeholder IsPHDst>
817   detail::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
818   copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
819                      accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
820     if (!MIsHost)
821  return false;
822 
823  single_task<class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc,
824  TDst, DimDst, ModeDst, TargetDst,
825  IsPHSrc, IsPHDst>> ([=]() {
826  *(Dst.get_pointer()) = *(Src.get_pointer());
827  });
828  return true;
829  }
830 
831 #ifndef __SYCL_DEVICE_ONLY__
832  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
838  access::target AccTarget, access::placeholder IsPH>
839   detail::enable_if_t<(Dim > 0)>
840   copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
841                    TDst *Dst) {
842  range<Dim> Range = Src.get_range();
843  parallel_for<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
844  (Range, [=](id<Dim> Index) {
845  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
846  using TSrcNonConst = typename detail::remove_const_t<TSrc>;
847  (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
848  });
849  }
850 
856  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
857             access::target AccTarget, access::placeholder IsPH>
858   detail::enable_if_t<Dim == 0>
859   copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
860                    TDst *Dst) {
861  single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
862  ([=]() {
863  using TSrcNonConst = typename detail::remove_const_t<TSrc>;
864  *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
865  });
866  }
867 
872  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
873  access::target AccTarget, access::placeholder IsPH>
874  detail::enable_if_t<(Dim > 0)>
875   copyPtrToAccHost(TSrc *Src,
876                    accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
877     range<Dim> Range = Dst.get_range();
878  parallel_for<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
879  (Range, [=](id<Dim> Index) {
880  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
881  Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
882  });
883  }
884 
890  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
891  access::target AccTarget, access::placeholder IsPH>
892   detail::enable_if_t<Dim == 0>
893   copyPtrToAccHost(TSrc *Src,
894                    accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
895     single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
896  ([=]() {
897  *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
898  });
899  }
900 #endif // __SYCL_DEVICE_ONLY__
901 
902  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
903  return AccessTarget == access::target::device ||
904  AccessTarget == access::target::constant_buffer;
905  }
906 
907  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
908  return AccessTarget == access::target::image ||
909  AccessTarget == access::target::image_array;
910  }
911 
912  constexpr static bool
913  isValidTargetForExplicitOp(access::target AccessTarget) {
914  return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
915  }
916 
917  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
918  return AccessMode == access::mode::read ||
919  AccessMode == access::mode::read_write;
920  }
921 
922  constexpr static bool
923  isValidModeForDestinationAccessor(access::mode AccessMode) {
924  return AccessMode == access::mode::write ||
925  AccessMode == access::mode::read_write ||
926  AccessMode == access::mode::discard_write ||
927  AccessMode == access::mode::discard_read_write;
928  }
929 
930  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
931  using type = typename std::conditional<
932  std::is_convertible<nd_item<Dims>, LambdaArgType>::value, nd_item<Dims>,
933  typename std::conditional<
934  std::is_convertible<item<Dims>, LambdaArgType>::value, item<Dims>,
935  LambdaArgType>::type>::type;
936  };
937 
949  template <typename KernelName, typename KernelType, int Dims>
950  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
951  KernelType KernelFunc) {
952     throwIfActionIsCreated();
953     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
954 
955  // If 1D kernel argument is an integral type, convert it to sycl::item<1>
956  // If user type is convertible from sycl::item/sycl::nd_item, use
957  // sycl::item/sycl::nd_item to transport item information
958  using TransformedArgType = typename std::conditional<
959  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
960  typename TransformUserItemType<Dims, LambdaArgType>::type>::type;
961 
962     using NameT =
963         typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
964 
965  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
966 
967  // Range rounding can be disabled by the user.
968  // Range rounding is not done on the host device.
969  // Range rounding is supported only for newer SYCL standards.
970 #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \
971  !defined(DPCPP_HOST_DEVICE_OPENMP) && \
972  !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
973  // Range should be a multiple of this for reasonable performance.
974  size_t MinFactorX = 16;
975  // Range should be a multiple of this for improved performance.
976  size_t GoodFactorX = 32;
977  // Range should be at least this to make rounding worthwhile.
978  size_t MinRangeX = 1024;
979 
980  // Check if rounding parameters have been set through environment:
981  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
982  this->GetRangeRoundingSettings(MinFactorX, GoodFactorX, MinRangeX);
983 
984  // Disable the rounding-up optimizations under these conditions:
985  // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
986  // 2. The kernel is provided via an interoperability method.
987  // 3. The range is already a multiple of the rounding factor.
988  //
989  // Cases 2 and 3 could be supported with extra effort.
990  // As an optimization for the common case it is an
991  // implementation choice to not support those scenarios.
992  // Note that "this_item" is a free function, i.e. not tied to any
993  // specific id or item. When concurrent parallel_fors are executing
994  // on a device it is difficult to tell which parallel_for the call is
995  // being made from. One could replicate portions of the
996  // call-graph to make this_item calls kernel-specific but this is
997  // not considered worthwhile.
998 
999  // Get the kernel name to check condition 2.
1000  std::string KName = typeid(NameT *).name();
1001  using KI = detail::KernelInfo<KernelName>;
1002  bool DisableRounding =
1003  this->DisableRangeRounding() ||
1004  (KI::getName() == nullptr || KI::getName()[0] == '\0');
1005 
1006  // Perform range rounding if rounding-up is enabled
1007  // and there are sufficient work-items to need rounding
1008  // and the user-specified range is not a multiple of a "good" value.
1009  if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
1010  (NumWorkItems[0] % MinFactorX != 0)) {
1011  // It is sufficient to round up just the first dimension.
1012  // Multiplying the rounded-up value of the first dimension
1013  // by the values of the remaining dimensions (if any)
1014  // will yield a rounded-up value for the total range.
1015  size_t NewValX =
1016  ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
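      // E.g. (illustrative): NumWorkItems[0] == 1000 and GoodFactorX == 32
      // give NewValX == ((1000 + 31) / 32) * 32 == 1024.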
1017  if (this->RangeRoundingTrace())
1018  std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
1019  << " to " << NewValX << std::endl;
1020 
1021  using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
1022  auto Wrapper =
1023  getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
1024  KernelFunc, NumWorkItems);
1025 
1026  using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
1027  decltype(Wrapper), NameWT>;
1028 
1029  range<Dims> AdjustedRange = NumWorkItems;
1030  AdjustedRange.set_range_dim0(NewValX);
1031  kernel_parallel_for_wrapper<KName, TransformedArgType>(Wrapper);
1032 #ifndef __SYCL_DEVICE_ONLY__
1033  detail::checkValueRange<Dims>(AdjustedRange);
1034  MNDRDesc.set(std::move(AdjustedRange));
1035  StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
1036  std::move(Wrapper));
1037  setType(detail::CG::Kernel);
1038 #endif
1039  } else
1040 #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
1041  // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
1042  // SYCL_LANGUAGE_VERSION >= 202001
1043  {
1044  (void)NumWorkItems;
1045  kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
1046 #ifndef __SYCL_DEVICE_ONLY__
1047  detail::checkValueRange<Dims>(NumWorkItems);
1048  MNDRDesc.set(std::move(NumWorkItems));
1049  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1050  std::move(KernelFunc));
1051  setType(detail::CG::Kernel);
1052 #endif
1053  }
1054  }
1055 
1063  template <int Dims>
1064  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
1065  throwIfActionIsCreated();
1066  verifyKernelInvoc(Kernel);
1067  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1068  detail::checkValueRange<Dims>(NumWorkItems);
1069  MNDRDesc.set(std::move(NumWorkItems));
1070  setType(detail::CG::Kernel);
1071  extractArgsAndReqs();
1072  MKernelName = getKernelName();
1073  }
1074 
1075 #ifdef SYCL_LANGUAGE_VERSION
1076 #define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
1077 #else
1078 #define __SYCL_KERNEL_ATTR__
1079 #endif
1080  // NOTE: the name of this function - "kernel_single_task" - is used by the
1081  // Front End to determine kernel invocation kind.
1082   template <typename KernelName, typename KernelType>
1083   __SYCL_KERNEL_ATTR__ void
1084 #ifdef __SYCL_NONCONST_FUNCTOR__
1085  kernel_single_task(KernelType KernelFunc) {
1086 #else
1087  kernel_single_task(const KernelType &KernelFunc) {
1088 #endif
1089 #ifdef __SYCL_DEVICE_ONLY__
1090  KernelFunc();
1091 #else
1092  (void)KernelFunc;
1093 #endif
1094  }
1095 
1096  // NOTE: the name of this function - "kernel_single_task" - is used by the
1097  // Front End to determine kernel invocation kind.
1098   template <typename KernelName, typename KernelType>
1099   __SYCL_KERNEL_ATTR__ void
1100 #ifdef __SYCL_NONCONST_FUNCTOR__
1101  kernel_single_task(KernelType KernelFunc, kernel_handler KH) {
1102 #else
1103  kernel_single_task(const KernelType &KernelFunc, kernel_handler KH) {
1104 #endif
1105 #ifdef __SYCL_DEVICE_ONLY__
1106  KernelFunc(KH);
1107 #else
1108  (void)KernelFunc;
1109  (void)KH;
1110 #endif
1111  }
1112 
1113   // NOTE: the name of these functions - "kernel_parallel_for" - is used by
1114   // the Front End to determine kernel invocation kind.
1115   template <typename KernelName, typename ElementType, typename KernelType>
1116   __SYCL_KERNEL_ATTR__ void
1117 #ifdef __SYCL_NONCONST_FUNCTOR__
1118  kernel_parallel_for(KernelType KernelFunc) {
1119 #else
1120  kernel_parallel_for(const KernelType &KernelFunc) {
1121 #endif
1122 #ifdef __SYCL_DEVICE_ONLY__
1123  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1124 #else
1125  (void)KernelFunc;
1126 #endif
1127  }
1128 
1129   // NOTE: the name of these functions - "kernel_parallel_for" - is used by
1130   // the Front End to determine kernel invocation kind.
1131   template <typename KernelName, typename ElementType, typename KernelType>
1132   __SYCL_KERNEL_ATTR__ void
1133 #ifdef __SYCL_NONCONST_FUNCTOR__
1134  kernel_parallel_for(KernelType KernelFunc, kernel_handler KH) {
1135 #else
1136  kernel_parallel_for(const KernelType &KernelFunc, kernel_handler KH) {
1137 #endif
1138 #ifdef __SYCL_DEVICE_ONLY__
1139  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1140 #else
1141  (void)KernelFunc;
1142  (void)KH;
1143 #endif
1144  }
1145 
1146  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1147  // used by the Front End to determine kernel invocation kind.
1148   template <typename KernelName, typename ElementType, typename KernelType>
1149   __SYCL_KERNEL_ATTR__ void
1150 #ifdef __SYCL_NONCONST_FUNCTOR__
1151  kernel_parallel_for_work_group(KernelType KernelFunc) {
1152 #else
1153  kernel_parallel_for_work_group(const KernelType &KernelFunc) {
1154 #endif
1155 #ifdef __SYCL_DEVICE_ONLY__
1156  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1157 #else
1158  (void)KernelFunc;
1159 #endif
1160  }
1161 
1162  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1163  // used by the Front End to determine kernel invocation kind.
1164   template <typename KernelName, typename ElementType, typename KernelType>
1165   __SYCL_KERNEL_ATTR__ void
1166 #ifdef __SYCL_NONCONST_FUNCTOR__
1167  kernel_parallel_for_work_group(KernelType KernelFunc, kernel_handler KH) {
1168 #else
1169  kernel_parallel_for_work_group(const KernelType &KernelFunc,
1170  kernel_handler KH) {
1171 #endif
1172 #ifdef __SYCL_DEVICE_ONLY__
1173  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1174 #else
1175  (void)KernelFunc;
1176  (void)KH;
1177 #endif
1178  }
1179 
1180  // Wrappers for kernel_*** functions above with and without support of
1181  // additional kernel_handler argument.
1182 
1183  // NOTE: to support kernel_handler argument in kernel lambdas, only
1184  // kernel_***_wrapper functions must be called in this code
1185 
1186  // Wrappers for kernel_single_task(...)
1187 
1188  template <typename KernelName, typename KernelType>
1189  std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>
1190 #ifdef __SYCL_NONCONST_FUNCTOR__
1191  kernel_single_task_wrapper(KernelType KernelFunc) {
1192 #else
1193  kernel_single_task_wrapper(const KernelType &KernelFunc) {
1194 #endif
1195 #ifdef __SYCL_DEVICE_ONLY__
1196  detail::CheckDeviceCopyable<KernelType>();
1197 #endif // __SYCL_DEVICE_ONLY__
1198  kernel_handler KH;
1199  kernel_single_task<KernelName>(KernelFunc, KH);
1200  }
1201 
1202  template <typename KernelName, typename KernelType>
1203  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>
1204 #ifdef __SYCL_NONCONST_FUNCTOR__
1205  kernel_single_task_wrapper(KernelType KernelFunc) {
1206 #else
1207  kernel_single_task_wrapper(const KernelType &KernelFunc) {
1208 #endif
1209 #ifdef __SYCL_DEVICE_ONLY__
1210  detail::CheckDeviceCopyable<KernelType>();
1211 #endif // __SYCL_DEVICE_ONLY__
1212  kernel_single_task<KernelName>(KernelFunc);
1213  }
1214 
1215  // Wrappers for kernel_parallel_for(...)
1216 
1217   template <typename KernelName, typename ElementType, typename KernelType>
1218   std::enable_if_t<
1219       detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
1220 #ifdef __SYCL_NONCONST_FUNCTOR__
1221  kernel_parallel_for_wrapper(KernelType KernelFunc) {
1222 #else
1223  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
1224 #endif
1225 #ifdef __SYCL_DEVICE_ONLY__
1226  detail::CheckDeviceCopyable<KernelType>();
1227 #endif // __SYCL_DEVICE_ONLY__
1228  kernel_handler KH;
1229  kernel_parallel_for<KernelName, ElementType>(KernelFunc, KH);
1230  }
1231 
1232   template <typename KernelName, typename ElementType, typename KernelType>
1233   std::enable_if_t<
1234       !detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
1235 #ifdef __SYCL_NONCONST_FUNCTOR__
1236  kernel_parallel_for_wrapper(KernelType KernelFunc) {
1237 #else
1238  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
1239 #endif
1240 #ifdef __SYCL_DEVICE_ONLY__
1241  detail::CheckDeviceCopyable<KernelType>();
1242 #endif // __SYCL_DEVICE_ONLY__
1243  kernel_parallel_for<KernelName, ElementType>(KernelFunc);
1244  }
1245 
1246  // Wrappers for kernel_parallel_for_work_group(...)
1247 
1248   template <typename KernelName, typename ElementType, typename KernelType>
1249   std::enable_if_t<
1250       detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
1251 #ifdef __SYCL_NONCONST_FUNCTOR__
1252  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
1253 #else
1254  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
1255 #endif
1256 #ifdef __SYCL_DEVICE_ONLY__
1257  detail::CheckDeviceCopyable<KernelType>();
1258 #endif // __SYCL_DEVICE_ONLY__
1259  kernel_handler KH;
1260  kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc, KH);
1261  }
1262 
1263   template <typename KernelName, typename ElementType, typename KernelType>
1264   std::enable_if_t<
1265       !detail::KernelLambdaHasKernelHandlerArgT<KernelType, ElementType>::value>
1266 #ifdef __SYCL_NONCONST_FUNCTOR__
1267  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
1268 #else
1269  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
1270 #endif
1271 #ifdef __SYCL_DEVICE_ONLY__
1272  detail::CheckDeviceCopyable<KernelType>();
1273 #endif // __SYCL_DEVICE_ONLY__
1274  kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc);
1275  }
1276 
1277  std::shared_ptr<detail::handler_impl> getHandlerImpl() const;
1278  std::shared_ptr<detail::handler_impl> evictHandlerImpl() const;
1279 
1280  void setStateExplicitKernelBundle();
1281  void setStateSpecConstSet();
1282  bool isStateExplicitKernelBundle() const;
1283 
1284  std::shared_ptr<detail::kernel_bundle_impl>
1285  getOrInsertHandlerKernelBundle(bool Insert) const;
1286 
1287  void setHandlerKernelBundle(
1288  const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);
1289 
1290   template <typename FuncT>
1291   detail::enable_if_t<
1292       detail::check_fn_signature<detail::remove_reference_t<FuncT>,
1293  void()>::value ||
1294  detail::check_fn_signature<detail::remove_reference_t<FuncT>,
1295  void(interop_handle)>::value>
1296  host_task_impl(FuncT &&Func) {
1297  throwIfActionIsCreated();
1298 
1299  MNDRDesc.set(range<1>(1));
1300  MArgs = std::move(MAssociatedAccesors);
1301 
1302  MHostTask.reset(new detail::HostTask(std::move(Func)));
1303 
1304  setType(detail::CG::CodeplayHostTask);
1305  }
1306 
1307 public:
1308  handler(const handler &) = delete;
1309  handler(handler &&) = delete;
1310  handler &operator=(const handler &) = delete;
1311  handler &operator=(handler &&) = delete;
1312 
1313 #if __cplusplus >= 201703L
1314  template <auto &SpecName>
1315  void set_specialization_constant(
1316  typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {
1317 
1318  setStateSpecConstSet();
1319 
1320  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1321  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1322 
1323     detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1324         KernelBundleImplPtr)
1325         .set_specialization_constant<SpecName>(Value);
1326  }
1327 
1328  template <auto &SpecName>
1329  typename std::remove_reference_t<decltype(SpecName)>::value_type
1330  get_specialization_constant() const {
1331 
1332  if (isStateExplicitKernelBundle())
1333  throw sycl::exception(make_error_code(errc::invalid),
1334  "Specialization constants cannot be read after "
1335  "explicitly setting the used kernel bundle");
1336 
1337  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1338  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1339 
1340     return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1341                KernelBundleImplPtr)
1342         .get_specialization_constant<SpecName>();
1343  }
1344 
1345 #endif
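  // Usage sketch (illustrative; 'MySpec' and 'Q' are assumed names, not part
  // of this header):
  //
  //   constexpr sycl::specialization_id<int> MySpec{42};
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_specialization_constant<MySpec>(7);
  //     CGH.single_task<class UseSpec>([=](sycl::kernel_handler KH) {
  //       int V = KH.get_specialization_constant<MySpec>();
  //       (void)V;
  //     });
  //   });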
1346 
1347  void
1348  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
1349 
1357  template <typename DataT, int Dims, access::mode AccMode,
1358  access::target AccTarget>
1359   void
1360   require(accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t>
1361               Acc) {
1362 #ifndef __SYCL_DEVICE_ONLY__
1363  associateWithHandler(&Acc, AccTarget);
1364 #else
1365  (void)Acc;
1366 #endif
1367  }
1368 
1372  void depends_on(event Event);
1373 
1377  void depends_on(const std::vector<event> &Events);
1378 
1379  template <typename T>
1380   using remove_cv_ref_t =
1381       std::remove_cv_t<std::remove_reference_t<T>>;
1382 
1383  template <typename U, typename T>
1384  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;
1385 
1386  template <typename T> struct ShouldEnableSetArg {
1387  static constexpr bool value =
1388  std::is_trivially_copyable<detail::remove_reference_t<T>>::value
1389 #if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
1390  && std::is_standard_layout<detail::remove_reference_t<T>>::value
1391 #endif
1392         || is_same_type<sampler, T>::value // Sampler
1393         || (!is_same_type<cl_mem, T>::value &&
1394             std::is_pointer<remove_cv_ref_t<T>>::value) // USM
1395  || is_same_type<cl_mem, T>::value; // Interop
1396  };
1397 
1404   template <typename T>
1405   detail::enable_if_t<ShouldEnableSetArg<T>::value, void>
1406   set_arg(int ArgIndex, T &&Arg) {
1407  setArgHelper(ArgIndex, std::move(Arg));
1408  }
1409 
1410  template <typename DataT, int Dims, access::mode AccessMode,
1411  access::target AccessTarget, access::placeholder IsPlaceholder>
1412  void
1413   set_arg(int ArgIndex,
1414           accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
1415     setArgHelper(ArgIndex, std::move(Arg));
1416  }
1417 
1423  template <typename... Ts> void set_args(Ts &&... Args) {
1424  setArgsHelper(0, std::move(Args)...);
1425  }
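  // Usage sketch (illustrative; 'Q', 'K' - a prebuilt sycl::kernel - and
  // accessor 'Acc' are assumed names, not part of this header):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_args(Acc, 42); // becomes kernel arguments 0 and 1
  //     CGH.parallel_for(sycl::range<1>{64}, K);
  //   });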
1426 
1434  template <typename KernelName = detail::auto_name, typename KernelType>
1435 #ifdef __SYCL_NONCONST_FUNCTOR__
1436  void single_task(KernelType KernelFunc) {
1437 #else
1438  void single_task(const KernelType &KernelFunc) {
1439 #endif
1440  throwIfActionIsCreated();
1441     using NameT =
1442         typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
1443     verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1444  kernel_single_task_wrapper<NameT>(KernelFunc);
1445 #ifndef __SYCL_DEVICE_ONLY__
1446     // No need to check whether the range exceeds INT_MAX limits, as it is a
1447     // compile-time known constant.
1448  MNDRDesc.set(range<1>{1});
1449 
1450  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1451  setType(detail::CG::Kernel);
1452 #endif
1453  }
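  // Usage sketch (illustrative; 'Q' is an assumed sycl::queue):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.single_task<class InitOnce>([=]() {
  //       // executed exactly once as a 1-item kernel
  //     });
  //   });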
1454 
1455  template <typename KernelName = detail::auto_name, typename KernelType>
1456 #ifdef __SYCL_NONCONST_FUNCTOR__
1457  void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) {
1458 #else
1459  void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) {
1460 #endif
1461  parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
1462  }
1463 
1464  template <typename KernelName = detail::auto_name, typename KernelType>
1465 #ifdef __SYCL_NONCONST_FUNCTOR__
1466  void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) {
1467 #else
1468  void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) {
1469 #endif
1470  parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
1471  }
1472 
1473  template <typename KernelName = detail::auto_name, typename KernelType>
1474 #ifdef __SYCL_NONCONST_FUNCTOR__
1475  void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) {
1476 #else
1477  void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) {
1478 #endif
1479  parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
1480  }
1481 
1486   template <typename FuncT>
1487   __SYCL2020_DEPRECATED(
1488       "run_on_host_intel() is deprecated, use host_task() instead")
1489  void run_on_host_intel(FuncT Func) {
1490  throwIfActionIsCreated();
1491     // No need to check whether the range exceeds INT_MAX limits, as it is a
1492     // compile-time known constant.
1493  MNDRDesc.set(range<1>{1});
1494 
1495  MArgs = std::move(MAssociatedAccesors);
1496  MHostKernel.reset(new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
1497  setType(detail::CG::RunOnHostIntel);
1498  }
1499 
1501   template <typename FuncT>
1502   detail::enable_if_t<
1503       detail::check_fn_signature<detail::remove_reference_t<FuncT>,
1504                                  void()>::value ||
1505       detail::check_fn_signature<detail::remove_reference_t<FuncT>,
1506                                  void(interop_handle)>::value>
1507  host_task(FuncT &&Func) {
1508  host_task_impl(Func);
1509  }
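  // Usage sketch (illustrative): the callable runs on the host as a node in
  // the execution graph; a 'void(interop_handle)' signature is also accepted:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.host_task([=]() { /* host-side work between kernels */ });
  //   });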
1510 
1511 // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
1512 // or const KernelType &KernelFunc
1513 #ifdef __SYCL_NONCONST_FUNCTOR__
1514 #define _KERNELFUNCPARAM(a) KernelType a
1515 #else
1516 #define _KERNELFUNCPARAM(a) const KernelType &a
1517 #endif
1518 
1532  template <typename KernelName = detail::auto_name, typename KernelType,
1533  int Dims>
1534  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
1535   void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
1536                     _KERNELFUNCPARAM(KernelFunc)) {
1537     throwIfActionIsCreated();
1538     using NameT =
1539         typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
1540     verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1541     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1542     (void)NumWorkItems;
1543  (void)WorkItemOffset;
1544  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1545 #ifndef __SYCL_DEVICE_ONLY__
1546  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1547  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1548  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1549  setType(detail::CG::Kernel);
1550 #endif
1551  }
1552 
1565  template <typename KernelName = detail::auto_name, typename KernelType,
1566  int Dims>
1567   void parallel_for(nd_range<Dims> ExecutionRange,
1568                     _KERNELFUNCPARAM(KernelFunc)) {
1569     throwIfActionIsCreated();
1570     using NameT =
1571         typename detail::get_kernel_name_t<detail::auto_name, KernelName>::name;
1572     verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1573     using LambdaArgType =
1574         sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1575  // If user type is convertible from sycl::item/sycl::nd_item, use
1576  // sycl::item/sycl::nd_item to transport item information
1577  using TransformedArgType =
1578  typename TransformUserItemType<Dims, LambdaArgType>::type;
1579  (void)ExecutionRange;
1580  kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
1581 #ifndef __SYCL_DEVICE_ONLY__
1582  detail::checkValueRange<Dims>(ExecutionRange);
1583  MNDRDesc.set(std::move(ExecutionRange));
1584  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1585  std::move(KernelFunc));
1586  setType(detail::CG::Kernel);
1587 #endif
1588  }
1589 
1598  template <typename KernelName = detail::auto_name, typename KernelType,
1599  int Dims, typename Reduction>
1600   void parallel_for(range<Dims> Range, Reduction Redu,
1601                     _KERNELFUNCPARAM(KernelFunc)) {
1602     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1603 
1604  // Before running the kernels, check that device has enough local memory
1605  // to hold local arrays required for the tree-reduction algorithm.
1606  constexpr bool IsTreeReduction =
1607  !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
1608  size_t OneElemSize =
1609  IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
1610  uint32_t NumConcurrentWorkGroups =
1611 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
1612  __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
1613 #else
1614       ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
1615 #endif
1616  // TODO: currently the maximal work group size is determined for the given
1617  // queue/device, while it is safer to use queries to the kernel pre-compiled
1618  // for the device.
1619  size_t MaxWGSize =
1620  ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1621  ext::oneapi::detail::reduCGFunc<KernelName>(
1622  *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
1623  if (Reduction::is_usm ||
1624  (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
1625  (!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
1626  this->finalize();
1627  handler CopyHandler(QueueCopy, MIsHost);
1628  CopyHandler.saveCodeLoc(MCodeLoc);
1629  ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1630  Redu);
1631  MLastEvent = CopyHandler.finalize();
1632  }
1633  }
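  // Usage sketch (illustrative; 'Q', 'Buf' and 'N' are assumed names, not
  // part of this header). The 'Reduction' argument is typically produced by
  // ext::oneapi::reduction:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
  //     CGH.parallel_for<class CountAll>(
  //         sycl::range<1>{N},
  //         sycl::ext::oneapi::reduction(Acc, sycl::ext::oneapi::plus<int>()),
  //         [=](sycl::item<1> It, auto &Sum) { Sum += 1; });
  //   });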
1634 
1638  //
1639  // If the reduction variable must be initialized with the identity value
1640  // before the kernel run, then an additional working accessor is created,
1641  // initialized with the identity value and used in the kernel. That working
1642  // accessor is then copied to user's accessor or USM pointer after
1643  // the kernel run.
1644  // For USM pointers without initialize_to_identity properties the same scheme
1645  // with working accessor is used as re-using user's USM pointer in the kernel
1646  // would require creation of another variant of user's kernel, which does not
1647  // seem efficient.
1648  template <typename KernelName = detail::auto_name, typename KernelType,
1649  int Dims, typename Reduction>
1650   detail::enable_if_t<Reduction::has_fast_atomics>
1651   parallel_for(nd_range<Dims> Range, Reduction Redu,
1652                _KERNELFUNCPARAM(KernelFunc)) {
1653     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1654  ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1655 
1656  if (Reduction::is_usm || Redu.initializeToIdentity()) {
1657  this->finalize();
1658  handler CopyHandler(QueueCopy, MIsHost);
1659  CopyHandler.saveCodeLoc(MCodeLoc);
1660  ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1661  Redu);
1662  MLastEvent = CopyHandler.finalize();
1663  }
1664  }
1665 
1672  //
1673  // If the reduction variable must be initialized with the identity value
1674  // before the kernel run, then an additional working accessor is created,
1675  // initialized with the identity value and used in the kernel. That working
1676  // accessor is then copied to user's accessor or USM pointer after
1677  // the kernel run.
1678  // For USM pointers without initialize_to_identity properties the same scheme
1679  // with working accessor is used as re-using user's USM pointer in the kernel
1680  // would require creation of another variant of user's kernel, which does not
1681  // seem efficient.
1682  template <typename KernelName = detail::auto_name, typename KernelType,
1683  int Dims, typename Reduction>
1684   detail::enable_if_t<Reduction::has_atomic_add_float64>
1685   parallel_for(nd_range<Dims> Range, Reduction Redu,
1686                _KERNELFUNCPARAM(KernelFunc)) {
1687 
1688     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1689     device D = detail::getDeviceFromHandler(*this);
1690 
1691  if (D.has(aspect::atomic64)) {
1692 
1693  ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1694  Range, Redu);
1695 
1696  if (Reduction::is_usm || Redu.initializeToIdentity()) {
1697  this->finalize();
1698  handler CopyHandler(QueueCopy, MIsHost);
1699  CopyHandler.saveCodeLoc(MCodeLoc);
1700  ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1701  CopyHandler, Redu);
1702  MLastEvent = CopyHandler.finalize();
1703  }
1704  } else {
1705  parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1706  }
1707  }
1708 
1723  template <typename KernelName = detail::auto_name, typename KernelType,
1724  int Dims, typename Reduction>
1725  detail::enable_if_t<!Reduction::has_fast_atomics &&
1726  !Reduction::has_atomic_add_float64>
1727   parallel_for(nd_range<Dims> Range, Reduction Redu,
1728                _KERNELFUNCPARAM(KernelFunc)) {
1729 
1730  parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1731  }
1732 
1733  template <typename KernelName, typename KernelType, int Dims,
1734             typename Reduction>
1735   detail::enable_if_t<!Reduction::has_fast_atomics>
1736   parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1737  KernelType KernelFunc) {
1738  // This parallel_for() is lowered to the following sequence:
1739  // 1) Call a kernel that a) call user's lambda function and b) performs
1740  // one iteration of reduction, storing the partial reductions/sums
1741  // to either a newly created global buffer or to user's reduction
1742  // accessor. So, if the original 'Range' has totally
1743  // N1 elements and work-group size is W, then after the first iteration
1744  // there will be N2 partial sums where N2 = N1 / W.
1745  // If (N2 == 1) then the partial sum is written to user's accessor.
1746  // Otherwise, a new global buffer is created and partial sums are written
1747  // to it.
1748  // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
1749  // necessary to reduce all partial sums into one final sum.
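    // Worked example (illustrative): for nd_range<1>{1024, 64}, pass 1 runs
    // N1 = 1024 work-items in groups of W = 64 and produces N2 = 1024 / 64 =
    // 16 partial sums; a single aux-kernel pass then reduces those 16 values
    // to the final sum (16 work-items, one group).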
1750 
1751  // Before running the kernels, check that device has enough local memory
1752  // to hold local arrays that may be required for the reduction algorithm.
1753  // TODO: If the work-group-size is limited by the local memory, then
1754  // a special version of the main kernel may be created. The one that would
1755   // not use local accessors, which means it would not do the reduction in
1756   // the main kernel, but simply generate Range.get_global_range().size()
1757   // partial sums, leaving the reduction work to the additional/aux
1758   // kernels.
1759  constexpr bool HFR = Reduction::has_fast_reduce;
1760  size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1761  // TODO: currently the maximal work group size is determined for the given
1762  // queue/device, while it may be safer to use queries to the kernel compiled
1763  // for the device.
1764  size_t MaxWGSize =
1765  ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1766  if (Range.get_local_range().size() > MaxWGSize)
1767  throw sycl::runtime_error("The implementation handling parallel_for with"
1768  " reduction requires work group size not bigger"
1769  " than " +
1770  std::to_string(MaxWGSize),
1771  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1772 
1773  // 1. Call the kernel that includes user's lambda function.
1774  ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1775  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1776  this->finalize();
1777 
1778  // 2. Run the additional kernel as many times as needed to reduce
1779  // all partial sums into one scalar.
1780 
1781  // TODO: Create a special slow/sequential version of the kernel that would
1782  // handle the reduction instead of throwing the error below.
1783  if (MaxWGSize <= 1)
1784  throw sycl::runtime_error("The implementation handling parallel_for with "
1785  "reduction requires the maximal work group "
1786  "size to be greater than 1 to converge. "
1787  "The maximal work group size depends on the "
1788  "device and the size of the objects passed to "
1789  "the reduction.",
1790  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1791  size_t NWorkItems = Range.get_group_range().size();
1792  while (NWorkItems > 1) {
1793  handler AuxHandler(QueueCopy, MIsHost);
1794  AuxHandler.saveCodeLoc(MCodeLoc);
1795 
1796  NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1797  AuxHandler, NWorkItems, MaxWGSize, Redu);
1798  MLastEvent = AuxHandler.finalize();
1799  } // end while (NWorkItems > 1)
1800 
1801  if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
1802  handler CopyHandler(QueueCopy, MIsHost);
1803  CopyHandler.saveCodeLoc(MCodeLoc);
1804  ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1805  Redu);
1806  MLastEvent = CopyHandler.finalize();
1807  }
1808  }
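  // Worked example of the lowering above: for nd_range<1>{{4096}, {64}} the
  // main kernel sees N1 = 4096 work-items with work-group size W = 64 and
  // writes N2 = 4096 / 64 = 64 partial sums to a temporary buffer. The
  // while-loop in parallel_for_Impl() then runs one aux kernel over those 64
  // values (a single work-group), so NWorkItems becomes 1 and the final sum
  // is produced.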
1809 
1810  // This version of parallel_for may handle one or more reductions packed in
1811  // the \p Rest argument. Note though that the last element of the \p Rest
1812  // pack is the kernel function.
1813  // TODO: this variant is currently enabled for 2+ reductions only, as the
1814  // versions handling 1 reduction variable are more efficient right now.
1815  //
1816  // Algorithm:
1817  // 1) discard_write accessor (DWAcc), InitializeToIdentity = true:
1818  // a) Create uninitialized buffer and read_write accessor (RWAcc).
1819  // b) discard-write partial sums to RWAcc.
1820  // c) Repeat the steps (a) and (b) to get one final sum.
1821  // d) Copy RWAcc to DWAcc.
1822  // 2) read_write accessor (RWAcc), InitializeToIdentity = false:
1823  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1824  // re-use user's RWAcc (if #work-groups is 1).
1825  // b) discard-write to RWAcc (#WG > 1), or update-write (#WG == 1).
1826  // c) Repeat the steps (a) and (b) to get one final sum.
1827  // 3) read_write accessor (RWAcc), InitializeToIdentity = true:
1828  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1829  // re-use user's RWAcc (if #work-groups is 1).
1830  // b) discard-write to RWAcc.
1831  // c) Repeat the steps (a) and (b) to get one final sum.
1832  // 4) USM pointer, InitializeToIdentity = false:
1833  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1834  // re-use user's USM pointer (if #work-groups is 1).
1835  // b) discard-write to RWAcc (#WG > 1) or
1836  // update-write to USM pointer (#WG == 1).
1837  // c) Repeat the steps (a) and (b) to get one final sum.
1838  // 5) USM pointer, InitializeToIdentity = true:
1839  // a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
1840  // re-use user's USM pointer (if #work-groups is 1).
1841  // b) discard-write to RWAcc (#WG > 1) or
1842  // discard-write to USM pointer (#WG == 1).
1843  // c) Repeat the steps (a) and (b) to get one final sum.
1844  template <typename KernelName = detail::auto_name, int Dims,
1845  typename... RestT>
1846  std::enable_if_t<
1847  (sizeof...(RestT) >= 3 &&
1848  ext::oneapi::detail::AreAllButLastReductions<RestT...>::value)>
1849  parallel_for(nd_range<Dims> Range, RestT... Rest) {
1850  std::tuple<RestT...> ArgsTuple(Rest...);
1851  constexpr size_t NumArgs = sizeof...(RestT);
1852  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
1853  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
1854  auto ReduTuple =
1855  ext::oneapi::detail::tuple_select_elements(ArgsTuple, ReduIndices);
1856 
1857  size_t LocalMemPerWorkItem =
1858  ext::oneapi::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
1859  // TODO: currently the maximal work group size is determined for the given
1860  // queue/device, while it is safer to use queries to the kernel compiled
1861  // for the device.
1862  size_t MaxWGSize =
1863  ext::oneapi::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
1864  if (Range.get_local_range().size() > MaxWGSize)
1865  throw sycl::runtime_error("The implementation handling parallel_for with"
1866  " reduction requires work group size not bigger"
1867  " than " +
1868  std::to_string(MaxWGSize),
1869  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1870 
1871  ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1872  ReduTuple, ReduIndices);
1873  std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1874  this->finalize();
1875 
1876  size_t NWorkItems = Range.get_group_range().size();
1877  while (NWorkItems > 1) {
1878  handler AuxHandler(QueueCopy, MIsHost);
1879  AuxHandler.saveCodeLoc(MCodeLoc);
1880 
1881  NWorkItems =
1882  ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
1883  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1884  MLastEvent = AuxHandler.finalize();
1885  } // end while (NWorkItems > 1)
1886 
1887  auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
1888  QueueCopy, MIsHost, ReduTuple, ReduIndices);
1889  if (CopyEvent)
1890  MLastEvent = *CopyEvent;
1891  }
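  // Usage sketch for the variadic overload above (illustrative only, not part
  // of this header; assumes a queue Q, the sycl::ext::oneapi::reduction
  // factory, and USM scalars Sum and Max from malloc_shared). Two reductions
  // plus the kernel function give sizeof...(RestT) == 3:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto SumRedu = sycl::ext::oneapi::reduction(Sum, sycl::plus<float>());
  //     auto MaxRedu = sycl::ext::oneapi::reduction(Max, sycl::maximum<float>());
  //     CGH.parallel_for<class MultiRedu>(
  //         sycl::nd_range<1>{{1024}, {64}}, SumRedu, MaxRedu,
  //         [=](sycl::nd_item<1> It, auto &S, auto &M) {
  //           float V = static_cast<float>(It.get_global_linear_id());
  //           S += V;
  //           M.combine(V);
  //         });
  //   });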
1892 
1903  template <typename KernelName = detail::auto_name, typename KernelType,
1904  int Dims>
1905  void parallel_for_work_group(range<Dims> NumWorkGroups,
1906  _KERNELFUNCPARAM(KernelFunc)) {
1907  throwIfActionIsCreated();
1908  using NameT =
1909  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1910  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1911  using LambdaArgType =
1912  detail::lambda_arg_type<KernelType, group<Dims>>;
1913  (void)NumWorkGroups;
1914  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1915 #ifndef __SYCL_DEVICE_ONLY__
1916  detail::checkValueRange<Dims>(NumWorkGroups);
1917  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1918  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1919  setType(detail::CG::Kernel);
1920 #endif // __SYCL_DEVICE_ONLY__
1921  }
1922 
1935  template <typename KernelName = detail::auto_name, typename KernelType,
1936  int Dims>
1937  void parallel_for_work_group(range<Dims> NumWorkGroups,
1938  range<Dims> WorkGroupSize,
1939  _KERNELFUNCPARAM(KernelFunc)) {
1940  throwIfActionIsCreated();
1941  using NameT =
1942  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1943  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1944  using LambdaArgType =
1945  detail::lambda_arg_type<KernelType, group<Dims>>;
1946  (void)NumWorkGroups;
1947  (void)WorkGroupSize;
1948  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1949 #ifndef __SYCL_DEVICE_ONLY__
1950  nd_range<Dims> ExecRange =
1951  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1952  detail::checkValueRange<Dims>(ExecRange);
1953  MNDRDesc.set(std::move(ExecRange));
1954  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1955  setType(detail::CG::Kernel);
1956 #endif // __SYCL_DEVICE_ONLY__
1957  }
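  // Usage sketch for the overload above (illustrative only, not part of this
  // header; assumes a queue Q). Each of the 8 work-groups executes the outer
  // lambda once, and parallel_for_work_item() fans out over the 64 work-items
  // of that group:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group<class HierKernel>(
  //         sycl::range<1>{8},  // NumWorkGroups
  //         sycl::range<1>{64}, // WorkGroupSize
  //         [=](sycl::group<1> G) {
  //           G.parallel_for_work_item([&](sycl::h_item<1> Item) {
  //             // per-work-item work
  //           });
  //         });
  //   });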
1958 
1965  void single_task(kernel Kernel) {
1966  throwIfActionIsCreated();
1967  verifyKernelInvoc(Kernel);
1968  // Ignore any set kernel bundles and use the one associated with the kernel
1969  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
1970  // No need to check if the range is out of INT_MAX limits as it's a
1971  // compile-time known constant.
1972  MNDRDesc.set(range<1>{1});
1973  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1974  setType(detail::CG::Kernel);
1975  extractArgsAndReqs();
1976  MKernelName = getKernelName();
1977  }
1978 
1979  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
1980  parallel_for_impl(NumWorkItems, Kernel);
1981  }
1982 
1983  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
1984  parallel_for_impl(NumWorkItems, Kernel);
1985  }
1986 
1987  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
1988  parallel_for_impl(NumWorkItems, Kernel);
1989  }
1990 
1999  template <int Dims>
2000  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2001  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2002  kernel Kernel) {
2003  throwIfActionIsCreated();
2004  verifyKernelInvoc(Kernel);
2005  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2006  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2007  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2008  setType(detail::CG::Kernel);
2009  extractArgsAndReqs();
2010  MKernelName = getKernelName();
2011  }
2012 
2021  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
2022  throwIfActionIsCreated();
2023  verifyKernelInvoc(Kernel);
2024  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2025  detail::checkValueRange<Dims>(NDRange);
2026  MNDRDesc.set(std::move(NDRange));
2027  setType(detail::CG::Kernel);
2028  extractArgsAndReqs();
2029  MKernelName = getKernelName();
2030  }
2031 
2038  template <typename KernelName = detail::auto_name, typename KernelType>
2039  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
2040  throwIfActionIsCreated();
2041  // Ignore any set kernel bundles and use the one associated with the kernel
2042  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2043  using NameT =
2044  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2045  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2046  (void)Kernel;
2047  kernel_single_task<NameT>(KernelFunc);
2048 #ifndef __SYCL_DEVICE_ONLY__
2049  // No need to check if the range is out of INT_MAX limits as it's a
2050  // compile-time known constant.
2051  MNDRDesc.set(range<1>{1});
2052  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2053  setType(detail::CG::Kernel);
2054  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2055  extractArgsAndReqs();
2056  MKernelName = getKernelName();
2057  } else
2058  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
2059 #else
2060  detail::CheckDeviceCopyable<KernelType>();
2061 #endif
2062  }
2063 
2067  template <typename FuncT>
2068  __SYCL_DEPRECATED("interop_task() is deprecated, use host_task() instead")
2069  void interop_task(FuncT Func) {
2070 
2071  MInteropTask.reset(new detail::InteropTask(std::move(Func)));
2072  setType(detail::CG::CodeplayInteropTask);
2073  }
2074 
2082  template <typename KernelName = detail::auto_name, typename KernelType,
2083  int Dims>
2084  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2085  _KERNELFUNCPARAM(KernelFunc)) {
2086  throwIfActionIsCreated();
2087  // Ignore any set kernel bundles and use the one associated with the kernel
2088  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2089  using NameT =
2090  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2091  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2092  using LambdaArgType = detail::lambda_arg_type<KernelType, item<Dims>>;
2093  (void)Kernel;
2094  (void)NumWorkItems;
2095  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2096 #ifndef __SYCL_DEVICE_ONLY__
2097  detail::checkValueRange<Dims>(NumWorkItems);
2098  MNDRDesc.set(std::move(NumWorkItems));
2099  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2100  setType(detail::CG::Kernel);
2101  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2102  extractArgsAndReqs();
2103  MKernelName = getKernelName();
2104  } else
2105  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2106  std::move(KernelFunc));
2107 #endif
2108  }
2109 
2119  template <typename KernelName = detail::auto_name, typename KernelType,
2120  int Dims>
2121  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2122  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2123  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2124  throwIfActionIsCreated();
2125  // Ignore any set kernel bundles and use the one associated with the kernel
2126  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2127  using NameT =
2128  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2129  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2130  using LambdaArgType = detail::lambda_arg_type<KernelType, item<Dims>>;
2131  (void)Kernel;
2132  (void)NumWorkItems;
2133  (void)WorkItemOffset;
2134  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2135 #ifndef __SYCL_DEVICE_ONLY__
2136  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2137  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2138  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2139  setType(detail::CG::Kernel);
2140  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2141  extractArgsAndReqs();
2142  MKernelName = getKernelName();
2143  } else
2144  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2145  std::move(KernelFunc));
2146 #endif
2147  }
2148 
2158  template <typename KernelName = detail::auto_name, typename KernelType,
2159  int Dims>
2160  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2161  _KERNELFUNCPARAM(KernelFunc)) {
2162  throwIfActionIsCreated();
2163  // Ignore any set kernel bundles and use the one associated with the kernel
2164  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2165  using NameT =
2166  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2167  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2168  using LambdaArgType =
2169  detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2170  (void)Kernel;
2171  (void)NDRange;
2172  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2173 #ifndef __SYCL_DEVICE_ONLY__
2174  detail::checkValueRange<Dims>(NDRange);
2175  MNDRDesc.set(std::move(NDRange));
2176  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2177  setType(detail::CG::Kernel);
2178  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2179  extractArgsAndReqs();
2180  MKernelName = getKernelName();
2181  } else
2182  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2183  std::move(KernelFunc));
2184 #endif
2185  }
2186 
2200  template <typename KernelName = detail::auto_name, typename KernelType,
2201  int Dims>
2202  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2203  _KERNELFUNCPARAM(KernelFunc)) {
2204  throwIfActionIsCreated();
2205  // Ignore any set kernel bundles and use the one associated with the kernel
2206  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2207  using NameT =
2208  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2209  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2210  using LambdaArgType =
2211  detail::lambda_arg_type<KernelType, group<Dims>>;
2212  (void)Kernel;
2213  (void)NumWorkGroups;
2214  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2215 #ifndef __SYCL_DEVICE_ONLY__
2216  detail::checkValueRange<Dims>(NumWorkGroups);
2217  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2218  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2219  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2220  setType(detail::CG::Kernel);
2221 #endif // __SYCL_DEVICE_ONLY__
2222  }
2223 
2239  template <typename KernelName = detail::auto_name, typename KernelType,
2240  int Dims>
2241  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2242  range<Dims> WorkGroupSize,
2243  _KERNELFUNCPARAM(KernelFunc)) {
2244  throwIfActionIsCreated();
2245  // Ignore any set kernel bundles and use the one associated with the kernel
2246  setHandlerKernelBundle(detail::getSyclObjImpl(Kernel.get_kernel_bundle()));
2247  using NameT =
2248  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2249  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2250  using LambdaArgType =
2251  detail::lambda_arg_type<KernelType, group<Dims>>;
2252  (void)Kernel;
2253  (void)NumWorkGroups;
2254  (void)WorkGroupSize;
2255  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2256 #ifndef __SYCL_DEVICE_ONLY__
2257  nd_range<Dims> ExecRange =
2258  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2259  detail::checkValueRange<Dims>(ExecRange);
2260  MNDRDesc.set(std::move(ExecRange));
2261  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2262  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2263  setType(detail::CG::Kernel);
2264 #endif // __SYCL_DEVICE_ONLY__
2265  }
2266 
2267  // Clean up the _KERNELFUNCPARAM macro.
2268 #undef _KERNELFUNCPARAM
2269 
2270  // Explicit copy operations API
2271 
2279  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2280  access::target AccessTarget,
2281  access::placeholder IsPlaceholder = access::placeholder::false_t>
2282  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2283  std::shared_ptr<T_Dst> Dst) {
2284  throwIfActionIsCreated();
2285  static_assert(isValidTargetForExplicitOp(AccessTarget),
2286  "Invalid accessor target for the copy method.");
2287  static_assert(isValidModeForSourceAccessor(AccessMode),
2288  "Invalid accessor mode for the copy method.");
2289  // Make sure the data the shared_ptr points to is not released until
2290  // we finish working with it.
2291  MSharedPtrStorage.push_back(Dst);
2292  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2293  copy(Src, RawDstPtr);
2294  }
2295 
2303  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2304  access::target AccessTarget,
2305  access::placeholder IsPlaceholder = access::placeholder::false_t>
2306  void
2307  copy(std::shared_ptr<T_Src> Src,
2308  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2309  throwIfActionIsCreated();
2310  static_assert(isValidTargetForExplicitOp(AccessTarget),
2311  "Invalid accessor target for the copy method.");
2312  static_assert(isValidModeForDestinationAccessor(AccessMode),
2313  "Invalid accessor mode for the copy method.");
2314  // Make sure the data the shared_ptr points to is not released until
2315  // we finish working with it.
2316  MSharedPtrStorage.push_back(Src);
2317  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2318  copy(RawSrcPtr, Dst);
2319  }
2320 
2328  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2329  access::target AccessTarget,
2330  access::placeholder IsPlaceholder = access::placeholder::false_t>
2331  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2332  T_Dst *Dst) {
2333  throwIfActionIsCreated();
2334  static_assert(isValidTargetForExplicitOp(AccessTarget),
2335  "Invalid accessor target for the copy method.");
2336  static_assert(isValidModeForSourceAccessor(AccessMode),
2337  "Invalid accessor mode for the copy method.");
2338 #ifndef __SYCL_DEVICE_ONLY__
2339  if (MIsHost) {
2340  // TODO: Temporary implementation for host. Should be handled by memory
2341  // manager.
2342  copyAccToPtrHost(Src, Dst);
2343  return;
2344  }
2345 #endif
2346  setType(detail::CG::CopyAccToPtr);
2347 
2348  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2349  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2350 
2351  MRequirements.push_back(AccImpl.get());
2352  MSrcPtr = static_cast<void *>(AccImpl.get());
2353  MDstPtr = static_cast<void *>(Dst);
2354  // Store a copy of the accessor in local storage to make sure it stays
2355  // alive until we finish.
2356  MAccStorage.push_back(std::move(AccImpl));
2357  }
2358 
2366  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2367  access::target AccessTarget,
2368  access::placeholder IsPlaceholder = access::placeholder::false_t>
2369  void
2370  copy(const T_Src *Src,
2371  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2372  throwIfActionIsCreated();
2373  static_assert(isValidTargetForExplicitOp(AccessTarget),
2374  "Invalid accessor target for the copy method.");
2375  static_assert(isValidModeForDestinationAccessor(AccessMode),
2376  "Invalid accessor mode for the copy method.");
2377 #ifndef __SYCL_DEVICE_ONLY__
2378  if (MIsHost) {
2379  // TODO: Temporary implementation for host. Should be handled by memory
2380  // manager.
2381  copyPtrToAccHost(Src, Dst);
2382  return;
2383  }
2384 #endif
2385  setType(detail::CG::CopyPtrToAcc);
2386 
2387  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2388  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2389 
2390  MRequirements.push_back(AccImpl.get());
2391  MSrcPtr = const_cast<T_Src *>(Src);
2392  MDstPtr = static_cast<void *>(AccImpl.get());
2393  // Store a copy of the accessor in local storage to make sure it stays
2394  // alive until we finish.
2395  MAccStorage.push_back(std::move(AccImpl));
2396  }
2397 
2405  template <
2406  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2407  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2408  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2409  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2410  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2411  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2412  IsPlaceholder_Src>
2413  Src,
2414  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2415  IsPlaceholder_Dst>
2416  Dst) {
2417  throwIfActionIsCreated();
2418  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2419  "Invalid source accessor target for the copy method.");
2420  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2421  "Invalid destination accessor target for the copy method.");
2422  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2423  "Invalid source accessor mode for the copy method.");
2424  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2425  "Invalid destination accessor mode for the copy method.");
2426  if (Dst.get_size() < Src.get_size())
2427  throw sycl::invalid_object_error(
2428  "The destination accessor size is too small to copy the memory into.",
2429  PI_ERROR_INVALID_OPERATION);
2430 
2431  if (copyAccToAccHelper(Src, Dst))
2432  return;
2433  setType(detail::CG::CopyAccToAcc);
2434 
2435  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2436  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2437 
2438  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2439  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2440 
2441  MRequirements.push_back(AccImplSrc.get());
2442  MRequirements.push_back(AccImplDst.get());
2443  MSrcPtr = AccImplSrc.get();
2444  MDstPtr = AccImplDst.get();
2445  // Store copies of the accessors in local storage to make sure they stay
2446  // alive until we finish.
2447  MAccStorage.push_back(std::move(AccImplSrc));
2448  MAccStorage.push_back(std::move(AccImplDst));
2449  }
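  // Usage sketch for the accessor-to-accessor overload above (illustrative
  // only, not part of this header; assumes a queue Q and buffers SrcBuf and
  // DstBuf, where DstBuf is at least as large as SrcBuf):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto SrcAcc = SrcBuf.get_access<sycl::access::mode::read>(CGH);
  //     auto DstAcc = DstBuf.get_access<sycl::access::mode::write>(CGH);
  //     CGH.copy(SrcAcc, DstAcc);
  //   });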
2450 
2455  template <typename T, int Dims, access::mode AccessMode,
2456  access::target AccessTarget,
2457  access::placeholder IsPlaceholder = access::placeholder::false_t>
2458  void
2459  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2460  throwIfActionIsCreated();
2461  static_assert(isValidTargetForExplicitOp(AccessTarget),
2462  "Invalid accessor target for the update_host method.");
2463  setType(detail::CG::UpdateHost);
2464 
2465  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2466  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2467 
2468  MDstPtr = static_cast<void *>(AccImpl.get());
2469  MRequirements.push_back(AccImpl.get());
2470  MAccStorage.push_back(std::move(AccImpl));
2471  }
2472 
2481  template <typename T, int Dims, access::mode AccessMode,
2482  access::target AccessTarget,
2483  access::placeholder IsPlaceholder = access::placeholder::false_t,
2484  typename PropertyListT = property_list>
2485  void
2486  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2487  Dst,
2488  const T &Pattern) {
2489  throwIfActionIsCreated();
2490  // TODO: add a check that T is an integral scalar value or a SYCL vector type.
2491  static_assert(isValidTargetForExplicitOp(AccessTarget),
2492  "Invalid accessor target for the fill method.");
2493  if (!MIsHost && (((Dims == 1) && isConstOrGlobal(AccessTarget)) ||
2494  isImageOrImageArray(AccessTarget))) {
2495  setType(detail::CG::Fill);
2496 
2497  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2498  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2499 
2500  MDstPtr = static_cast<void *>(AccImpl.get());
2501  MRequirements.push_back(AccImpl.get());
2502  MAccStorage.push_back(std::move(AccImpl));
2503 
2504  MPattern.resize(sizeof(T));
2505  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
2506  *PatternPtr = Pattern;
2507  } else {
2508 
2509  // TODO: Temporary implementation for host. Should be handled by memory
2510  // manager.
2511  range<Dims> Range = Dst.get_range();
2512  parallel_for<class __fill<T, Dims, AccessMode, AccessTarget,
2513  IsPlaceholder>>(Range, [=](id<Dims> Index) {
2514  Dst[Index] = Pattern;
2515  });
2516  }
2517  }
2518 
2525  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2526  throwIfActionIsCreated();
2527  static_assert(std::is_trivially_copyable<T>::value,
2528  "Pattern must be trivially copyable");
2529  parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2530  T *CastedPtr = static_cast<T *>(Ptr);
2531  CastedPtr[Index] = Pattern;
2532  });
2533  }
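  // Usage sketch for the USM fill above (illustrative only, not part of this
  // header; assumes a queue Q). The pattern type T is deduced from the second
  // argument, so Count is in elements of T, not bytes:
  //
  //   float *Data = sycl::malloc_device<float>(1024, Q);
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.fill(Data, 1.0f, 1024);
  //   }).wait();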
2534 
2538  void ext_oneapi_barrier() {
2539  throwIfActionIsCreated();
2540  setType(detail::CG::Barrier);
2541  }
2542 
2546  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2547  void barrier() { ext_oneapi_barrier(); }
2548 
2555  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2556 
2563  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2564  void barrier(const std::vector<event> &WaitList);
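  // Usage sketch for the event-list barrier above (illustrative only, not
  // part of this header; assumes a queue Q and an event E1 from an earlier
  // submission). Commands submitted after the barrier wait only on E1 rather
  // than on everything previously submitted to Q:
  //
  //   sycl::event E1 = Q.submit([&](sycl::handler &CGH) { /* ... */ });
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.ext_oneapi_barrier(std::vector<sycl::event>{E1});
  //   });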
2565 
2575  void memcpy(void *Dest, const void *Src, size_t Count);
2576 
2586  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2587  this->memcpy(Dest, Src, Count * sizeof(T));
2588  }
2589 
2598  void memset(void *Dest, int Value, size_t Count);
2599 
2606  void prefetch(const void *Ptr, size_t Count);
2607 
2614  void mem_advise(const void *Ptr, size_t Length, int Advice);
2615 
2616 private:
2617  std::shared_ptr<detail::queue_impl> MQueue;
2622  std::vector<std::vector<char>> MArgsStorage;
2623  std::vector<detail::AccessorImplPtr> MAccStorage;
2624  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
2625  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
2626  mutable std::vector<std::shared_ptr<const void>> MSharedPtrStorage;
2628  std::vector<detail::ArgDesc> MArgs;
2632  std::vector<detail::ArgDesc> MAssociatedAccesors;
2634  std::vector<detail::Requirement *> MRequirements;
2636  detail::NDRDescT MNDRDesc;
2637  std::string MKernelName;
2639  std::shared_ptr<detail::kernel_impl> MKernel;
2643  detail::CG::CGTYPE MCGType = detail::CG::None;
2645  void *MSrcPtr = nullptr;
2647  void *MDstPtr = nullptr;
2649  size_t MLength = 0;
2651  std::vector<char> MPattern;
2653  std::unique_ptr<detail::HostKernelBase> MHostKernel;
2655  std::unique_ptr<detail::HostTask> MHostTask;
2656  detail::OSModuleHandle MOSModuleHandle = detail::OSUtil::ExeModuleHandle;
2657  // Storage for a lambda or function when using InteropTasks
2658  std::unique_ptr<detail::InteropTask> MInteropTask;
2660  std::vector<detail::EventImplPtr> MEvents;
2663  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
2664 
2665  bool MIsHost = false;
2666 
2667  detail::code_location MCodeLoc = {};
2668  bool MIsFinalized = false;
2669  event MLastEvent;
2670 
2671  // Make queue_impl class friend to be able to call finalize method.
2672  friend class detail::queue_impl;
2673  // Make accessor class friend to keep the list of associated accessors.
2674  template <typename DataT, int Dims, access::mode AccMode,
2675  access::target AccTarget, access::placeholder isPlaceholder,
2676  typename PropertyListT>
2677  friend class accessor;
2678  friend device detail::getDeviceFromHandler(handler &);
2679 
2680  template <typename DataT, int Dimensions, access::mode AccessMode,
2681  access::target AccessTarget, access::placeholder IsPlaceholder>
2682  friend class detail::image_accessor;
2683  // Make stream class friend to be able to keep the list of associated streams
2684  friend class stream;
2685  friend class detail::stream_impl;
2686  // Make reductions friends so that the buffers and arrays they create can
2687  // be stored in the handler from the reduction methods.
2688  template <typename T, class BinaryOperation, int Dims, size_t Extent,
2689  class Algorithm>
2690  friend class ext::oneapi::detail::reduction_impl_algo;
2691 
2692  // This method needs to call finalize() and also needs access to the
2693  // private ctor/dtor.
2694  template <typename Reduction, typename... RestT>
2695  std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
2696  reduSaveFinalResultToUserMemHelper(
2697  std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2698  bool IsHost, Reduction &, RestT...);
2699 
2700  friend void detail::associateWithHandler(handler &,
2701  detail::AccessorBaseHost *,
2702  access::target);
2703 
2704  friend class ::MockHandler;
2705  friend class detail::queue_impl;
2706 
2707  bool DisableRangeRounding();
2708 
2709  bool RangeRoundingTrace();
2710 
2711  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
2712  size_t &MinRange);
2713 
2714  template <typename WrapperT, typename TransformedArgType, int Dims,
2715  typename KernelType,
2716  detail::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
2717  KernelType, TransformedArgType>::value> * = nullptr>
2718  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2719  range<Dims> NumWorkItems) {
2720  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
2721  KernelType>(NumWorkItems,
2722  KernelFunc);
2723  }
2724 
2725  template <typename WrapperT, typename TransformedArgType, int Dims,
2726  typename KernelType,
2727  detail::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
2728  KernelType, TransformedArgType>::value> * = nullptr>
2729  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2730  range<Dims> NumWorkItems) {
2731  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>(
2732  NumWorkItems, KernelFunc);
2733  }
2734 };
2735 } // namespace sycl
2736 } // __SYCL_INLINE_NAMESPACE(cl)