DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.hpp
Go to the documentation of this file.
1 //==-------- handler.hpp --- SYCL command group handler --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <sycl/access/access.hpp>
12 #include <sycl/accessor.hpp>
13 #include <sycl/context.hpp>
14 #include <sycl/detail/cg.hpp>
15 #include <sycl/detail/cg_types.hpp>
16 #include <sycl/detail/cl.h>
17 #include <sycl/detail/export.hpp>
19 #include <sycl/detail/os_util.hpp>
20 #include <sycl/event.hpp>
26 #include <sycl/id.hpp>
27 #include <sycl/interop_handle.hpp>
28 #include <sycl/item.hpp>
29 #include <sycl/kernel.hpp>
30 #include <sycl/kernel_bundle.hpp>
31 #include <sycl/kernel_handler.hpp>
32 #include <sycl/nd_item.hpp>
33 #include <sycl/nd_range.hpp>
34 #include <sycl/property_list.hpp>
36 #include <sycl/sampler.hpp>
37 #include <sycl/stl.hpp>
39 
#include <functional>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>
#include <utility>
45 
46 // SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision
47 #if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
48 #define __SYCL_NONCONST_FUNCTOR__
49 #endif
50 
51 // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
52 // or const KernelType &KernelFunc
53 #ifdef __SYCL_NONCONST_FUNCTOR__
54 #define _KERNELFUNCPARAMTYPE KernelType
55 #else
56 #define _KERNELFUNCPARAMTYPE const KernelType &
57 #endif
58 #define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a
59 
60 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
61  sycl::access::target AccessTarget,
63 class __fill;
64 
65 template <typename T> class __usmfill;
66 template <typename T> class __usmfill2d;
67 template <typename T> class __usmmemcpy2d;
68 
69 template <typename T_Src, typename T_Dst, int Dims,
73 
74 template <typename T_Src, typename T_Dst, int Dims,
78 
79 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
80  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
81  sycl::access::mode AccessMode_Dst,
82  sycl::access::target AccessTarget_Dst,
83  sycl::access::placeholder IsPlaceholder_Src,
84  sycl::access::placeholder IsPlaceholder_Dst>
86 
87 // For unit testing purposes
88 class MockHandler;
89 
90 namespace sycl {
92 
93 // Forward declaration
94 
95 class handler;
96 template <typename T, int Dimensions, typename AllocatorT, typename Enable>
97 class buffer;
98 
99 namespace ext::intel::experimental {
100 template <class _name, class _dataT, int32_t _min_capacity, class _propertiesT,
101  class>
102 class pipe;
103 }
104 
105 namespace detail {
106 
107 class handler_impl;
108 class kernel_impl;
109 class queue_impl;
110 class stream_impl;
111 template <typename DataT, int Dimensions, access::mode AccessMode,
113 class image_accessor;
114 template <typename RetType, typename Func, typename Arg>
115 static Arg member_ptr_helper(RetType (Func::*)(Arg) const);
116 
117 // Non-const version of the above template to match functors whose 'operator()'
118 // is declared w/o the 'const' qualifier.
119 template <typename RetType, typename Func, typename Arg>
120 static Arg member_ptr_helper(RetType (Func::*)(Arg));
121 
122 // template <typename RetType, typename Func>
123 // static void member_ptr_helper(RetType (Func::*)() const);
124 
125 // template <typename RetType, typename Func>
126 // static void member_ptr_helper(RetType (Func::*)());
127 
128 template <typename F, typename SuggestedArgType>
129 decltype(member_ptr_helper(&F::operator())) argument_helper(int);
130 
131 template <typename F, typename SuggestedArgType>
132 SuggestedArgType argument_helper(...);
133 
134 template <typename F, typename SuggestedArgType>
135 using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
136 
137 // Used when parallel_for range is rounded-up.
138 template <typename Name> class __pf_kernel_wrapper;
139 
140 template <typename Type> struct get_kernel_wrapper_name_t {
142 };
143 
144 __SYCL_EXPORT device getDeviceFromHandler(handler &);
145 
146 // Checks if a device_global has any registered kernel usage.
147 __SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr);
148 
149 #if __SYCL_ID_QUERIES_FIT_IN_INT__
150 template <typename T> struct NotIntMsg;
151 
// Diagnostic text used when a range component (or its linearized product)
// does not fit in an int under -fsycl-id-queries-fit-in-int.
template <int Dims> struct NotIntMsg<range<Dims>> {
  // Referenced by checkValueRangeImpl when throwing.
  constexpr static const char *Msg =
      "Provided range is out of integer limits. Pass "
      "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
};
157 
// Diagnostic text used when an offset (id) component does not fit in an int
// under -fsycl-id-queries-fit-in-int.
template <int Dims> struct NotIntMsg<id<Dims>> {
  // Referenced by checkValueRangeImpl when throwing.
  constexpr static const char *Msg =
      "Provided offset is out of integer limits. Pass "
      "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
};
163 #endif
164 
165 // Helper for merging properties with ones defined in an optional kernel functor
166 // getter.
167 template <typename KernelType, typename PropertiesT, typename Cond = void>
169  using type = PropertiesT;
170 };
171 template <typename KernelType, typename PropertiesT>
173  KernelType, PropertiesT,
174  std::enable_if_t<ext::oneapi::experimental::detail::
175  HasKernelPropertiesGetMethod<KernelType>::value>> {
176  using get_method_properties =
178  KernelType>::properties_t;
179  static_assert(
181  "get(sycl::ext::oneapi::experimental::properties_tag) member in kernel "
182  "functor class must return a valid property list.");
184  PropertiesT, get_method_properties>;
185 };
186 
#if __SYCL_ID_QUERIES_FIT_IN_INT__
/// Throws if \p V cannot be represented as a non-negative int.
/// \tparam T range/id type, used only to select the diagnostic message.
template <typename T, typename ValT>
typename std::enable_if_t<std::is_same<ValT, size_t>::value ||
                          std::is_same<ValT, unsigned long long>::value>
checkValueRangeImpl(ValT V) {
  // Anything above INT_MAX cannot round-trip through the int-based queries.
  constexpr ValT MaxFittingValue =
      static_cast<ValT>((std::numeric_limits<int>::max)());
  const bool FitsInInt = V <= MaxFittingValue;
  if (!FitsInInt)
    throw runtime_error(NotIntMsg<T>::Msg, PI_ERROR_INVALID_VALUE);
}
#endif
198 
199 template <int Dims, typename T>
200 typename std::enable_if_t<std::is_same_v<T, range<Dims>> ||
201  std::is_same_v<T, id<Dims>>>
202 checkValueRange(const T &V) {
203 #if __SYCL_ID_QUERIES_FIT_IN_INT__
204  for (size_t Dim = 0; Dim < Dims; ++Dim)
205  checkValueRangeImpl<T>(V[Dim]);
206 
207  {
208  unsigned long long Product = 1;
209  for (size_t Dim = 0; Dim < Dims; ++Dim) {
210  Product *= V[Dim];
211  // check value now to prevent product overflow in the end
212  checkValueRangeImpl<T>(Product);
213  }
214  }
215 #else
216  (void)V;
217 #endif
218 }
219 
220 template <int Dims>
221 void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
222 #if __SYCL_ID_QUERIES_FIT_IN_INT__
223  checkValueRange<Dims>(R);
224  checkValueRange<Dims>(O);
225 
226  for (size_t Dim = 0; Dim < Dims; ++Dim) {
227  unsigned long long Sum = R[Dim] + O[Dim];
228 
229  checkValueRangeImpl<range<Dims>>(Sum);
230  }
231 #else
232  (void)R;
233  (void)O;
234 #endif
235 }
236 
237 template <int Dims, typename T>
238 typename std::enable_if_t<std::is_same_v<T, nd_range<Dims>>>
239 checkValueRange(const T &V) {
240 #if __SYCL_ID_QUERIES_FIT_IN_INT__
241  checkValueRange<Dims>(V.get_global_range());
242  checkValueRange<Dims>(V.get_local_range());
243  checkValueRange<Dims>(V.get_offset());
244 
245  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
246 #else
247  (void)V;
248 #endif
249 }
250 
251 template <typename TransformedArgType, int Dims, typename KernelType>
253 public:
254  RoundedRangeKernel(range<Dims> NumWorkItems, KernelType KernelFunc)
255  : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}
256 
257  void operator()(TransformedArgType Arg) const {
258  if (Arg[0] >= NumWorkItems[0])
259  return;
260  Arg.set_allowed_range(NumWorkItems);
261  KernelFunc(Arg);
262  }
263 
264 private:
265  range<Dims> NumWorkItems;
266  KernelType KernelFunc;
267 };
268 
269 template <typename TransformedArgType, int Dims, typename KernelType>
271 public:
273  : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}
274 
275  void operator()(TransformedArgType Arg, kernel_handler KH) const {
276  if (Arg[0] >= NumWorkItems[0])
277  return;
278  Arg.set_allowed_range(NumWorkItems);
279  KernelFunc(Arg, KH);
280  }
281 
282 private:
283  range<Dims> NumWorkItems;
284  KernelType KernelFunc;
285 };
286 
287 using std::enable_if_t;
288 using sycl::detail::queue_impl;
289 
290 } // namespace detail
291 
325 class __SYCL_EXPORT handler {
326 private:
331  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);
332 
342  handler(std::shared_ptr<detail::queue_impl> Queue,
343  std::shared_ptr<detail::queue_impl> PrimaryQueue,
344  std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);
345 
347  template <typename T, typename F = typename std::remove_const_t<
348  typename std::remove_reference_t<T>>>
349  F *storePlainArg(T &&Arg) {
350  CGData.MArgsStorage.emplace_back(sizeof(T));
351  auto Storage = reinterpret_cast<F *>(CGData.MArgsStorage.back().data());
352  *Storage = Arg;
353  return Storage;
354  }
355 
  // Records the command-group operation kind for this handler.
  void setType(detail::CG::CGTYPE Type) { MCGType = Type; }

  // Returns the recorded command-group operation kind
  // (detail::CG::None while no action has been set).
  detail::CG::CGTYPE getType() { return MCGType; }
359 
360  void throwIfActionIsCreated() {
361  if (detail::CG::None != getType())
362  throw sycl::runtime_error("Attempt to set multiple actions for the "
363  "command group. Command group must consist of "
364  "a single kernel or explicit memory operation.",
365  PI_ERROR_INVALID_OPERATION);
366  }
367 
368  constexpr static int AccessTargetMask = 0x7ff;
372  template <typename KernelName, typename KernelType>
373  void throwOnLocalAccessorMisuse() const {
374  using NameT =
376  using KI = sycl::detail::KernelInfo<NameT>;
377 
378  auto *KernelArgs = &KI::getParamDesc(0);
379 
380  for (unsigned I = 0; I < KI::getNumParams(); ++I) {
381  const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
382  const access::target AccTarget =
383  static_cast<access::target>(KernelArgs[I].info & AccessTargetMask);
384  if ((Kind == detail::kernel_param_kind_t::kind_accessor) &&
385  (AccTarget == target::local))
386  throw sycl::exception(
387  make_error_code(errc::kernel_argument),
388  "A local accessor must not be used in a SYCL kernel function "
389  "that is invoked via single_task or via the simple form of "
390  "parallel_for that takes a range parameter.");
391  }
392  }
393 
396  void
397  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
398  const detail::kernel_param_desc_t *KernelArgs,
399  bool IsESIMD);
400 
402  void extractArgsAndReqs();
403 
404  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
405  const int Size, const size_t Index, size_t &IndexShift,
406  bool IsKernelCreatedFromSource, bool IsESIMD);
407 
409  std::string getKernelName();
410 
  // Returns true when the integration-header name of the lambda/functor
  // matches the name of the pre-created kernel object (MKernel).
  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
    // TODO It is unclear whether a kernel and a lambda/functor must be equal
    // for parallel_for with sycl::kernel and lambda/functor together.
    // Currently, if the names are equal we extract arguments from the
    // lambda/functor for the kernel; otherwise set_arg(s) must be used to
    // resolve the order and values of arguments for the kernel.
    assert(MKernel && "MKernel is not initialized");
    const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
    const std::string KernelName = getKernelName();
    return LambdaName == KernelName;
  }
422 
  // Remembers the caller-provided source location for this command group.
  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
426 
433  event finalize();
434 
  // Takes shared ownership of a stream object used by this command group;
  // the stream is kept in MStreamStorage.
  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
    MStreamStorage.push_back(Stream);
  }
443 
449  void addReduction(const std::shared_ptr<const void> &ReduObj);
450 
  ~handler() = default;

  // TODO: Private and unused. Remove when ABI break is allowed.
  bool is_host() { return MIsHost; }
455 
456 #ifdef __SYCL_DEVICE_ONLY__
457  // In device compilation accessor isn't inherited from AccessorBaseHost, so
458  // can't detect by it. Since we don't expect it to be ever called in device
459  // execution, just use blind void *.
460  void associateWithHandler(void *AccBase, access::target AccTarget);
461 #else
463  access::target AccTarget);
464 #endif
465 
466  // Recursively calls itself until arguments pack is fully processed.
467  // The version for regular(standard layout) argument.
468  template <typename T, typename... Ts>
469  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
470  set_arg(ArgIndex, std::move(Arg));
471  setArgsHelper(++ArgIndex, std::move(Args)...);
472  }
473 
474  void setArgsHelper(int) {}
475 
476  void setLocalAccessorArgHelper(int ArgIndex,
477  detail::LocalAccessorBaseHost &LocalAccBase) {
478  detail::LocalAccessorImplPtr LocalAccImpl =
479  detail::getSyclObjImpl(LocalAccBase);
480  detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
481  MLocalAccStorage.push_back(std::move(LocalAccImpl));
482  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
483  static_cast<int>(access::target::local), ArgIndex);
484  }
485 
486  // setArgHelper for local accessor argument (legacy accessor interface)
487  template <typename DataT, int Dims, access::mode AccessMode,
489  void setArgHelper(int ArgIndex,
490  accessor<DataT, Dims, AccessMode, access::target::local,
491  IsPlaceholder> &&Arg) {
492 #ifndef __SYCL_DEVICE_ONLY__
493  setLocalAccessorArgHelper(ArgIndex, Arg);
494 #endif
495  }
496 
  // setArgHelper for local accessor argument (up to date accessor interface).
  // Registration happens on the host side only; in device compilation the
  // body is compiled out.
  template <typename DataT, int Dims>
  void setArgHelper(int ArgIndex, local_accessor<DataT, Dims> &&Arg) {
#ifndef __SYCL_DEVICE_ONLY__
    setLocalAccessorArgHelper(ArgIndex, Arg);
#endif
  }
504 
505  // setArgHelper for non local accessor argument.
506  template <typename DataT, int Dims, access::mode AccessMode,
508  typename std::enable_if_t<AccessTarget != access::target::local, void>
509  setArgHelper(
510  int ArgIndex,
514  detail::AccessorImplHost *Req = AccImpl.get();
515  // Add accessor to the list of requirements.
516  CGData.MRequirements.push_back(Req);
517  // Store copy of the accessor.
518  CGData.MAccStorage.push_back(std::move(AccImpl));
519  // Add accessor to the list of arguments.
520  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
521  static_cast<int>(AccessTarget), ArgIndex);
522  }
523 
524  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
525  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
526 
527  if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
528  MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
529  sizeof(T), ArgIndex);
530  } else {
531  MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
532  StoredArg, sizeof(T), ArgIndex);
533  }
534  }
535 
  // setArgHelper for a sampler argument: copy it into handler-owned storage
  // and record it with kind_sampler.
  void setArgHelper(int ArgIndex, sampler &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
                       sizeof(sampler), ArgIndex);
  }
541 
  // TODO: Unused. Remove when ABI break is allowed.
  void verifyKernelInvoc(const kernel &Kernel) {
    // Deliberate no-op; the parameter is consumed only to silence warnings.
    std::ignore = Kernel;
    return;
  }
547 
548  /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
549  * its argument. Since esimd plugin directly invokes the kernel (doesn’t use
550  * piKernelSetArg), the kernel argument type must be known to the plugin.
551  * However, passing kernel argument type to the plugin requires changing ABI
552  * in HostKernel class. To overcome this problem, helpers below wrap the
553  * “original” kernel with a functor that always takes an nd_item as argument.
554  * A functor is used instead of a lambda because extractArgsAndReqsFromLambda
555  * needs access to the “original” kernel and keeps references to its internal
556  * data, i.e. the kernel passed as argument cannot be local in scope. The
557  * functor itself is again encapsulated in a std::function since functor’s
558  * type is unknown to the plugin.
559  */
560 
561  // For 'id, item w/wo offset, nd_item' kernel arguments
562  template <class KernelType, class NormalizedKernelType, int Dims>
563  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
564  NormalizedKernelType NormalizedKernel(KernelFunc);
565  auto NormalizedKernelFunc =
566  std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
567  auto HostKernelPtr =
568  new detail::HostKernel<decltype(NormalizedKernelFunc),
569  sycl::nd_item<Dims>, Dims>(NormalizedKernelFunc);
570  MHostKernel.reset(HostKernelPtr);
571  return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
572  ->MKernelFunc;
573  }
574 
575  // For 'sycl::id<Dims>' kernel argument
576  template <class KernelType, typename ArgT, int Dims>
577  std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>, KernelType *>
578  ResetHostKernel(const KernelType &KernelFunc) {
579  struct NormalizedKernelType {
580  KernelType MKernelFunc;
581  NormalizedKernelType(const KernelType &KernelFunc)
582  : MKernelFunc(KernelFunc) {}
583  void operator()(const nd_item<Dims> &Arg) {
584  detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
585  }
586  };
587  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
588  KernelFunc);
589  }
590 
591  // For 'sycl::nd_item<Dims>' kernel argument
592  template <class KernelType, typename ArgT, int Dims>
593  std::enable_if_t<std::is_same_v<ArgT, sycl::nd_item<Dims>>, KernelType *>
594  ResetHostKernel(const KernelType &KernelFunc) {
595  struct NormalizedKernelType {
596  KernelType MKernelFunc;
597  NormalizedKernelType(const KernelType &KernelFunc)
598  : MKernelFunc(KernelFunc) {}
599  void operator()(const nd_item<Dims> &Arg) {
600  detail::runKernelWithArg(MKernelFunc, Arg);
601  }
602  };
603  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
604  KernelFunc);
605  }
606 
607  // For 'sycl::item<Dims, without_offset>' kernel argument
608  template <class KernelType, typename ArgT, int Dims>
609  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, false>>, KernelType *>
610  ResetHostKernel(const KernelType &KernelFunc) {
611  struct NormalizedKernelType {
612  KernelType MKernelFunc;
613  NormalizedKernelType(const KernelType &KernelFunc)
614  : MKernelFunc(KernelFunc) {}
615  void operator()(const nd_item<Dims> &Arg) {
616  sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
617  Arg.get_global_range(), Arg.get_global_id());
618  detail::runKernelWithArg(MKernelFunc, Item);
619  }
620  };
621  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
622  KernelFunc);
623  }
624 
625  // For 'sycl::item<Dims, with_offset>' kernel argument
626  template <class KernelType, typename ArgT, int Dims>
627  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, true>>, KernelType *>
628  ResetHostKernel(const KernelType &KernelFunc) {
629  struct NormalizedKernelType {
630  KernelType MKernelFunc;
631  NormalizedKernelType(const KernelType &KernelFunc)
632  : MKernelFunc(KernelFunc) {}
633  void operator()(const nd_item<Dims> &Arg) {
634  sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
635  Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
636  detail::runKernelWithArg(MKernelFunc, Item);
637  }
638  };
639  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
640  KernelFunc);
641  }
642 
643  // For 'void' kernel argument (single_task)
644  template <class KernelType, typename ArgT, int Dims>
645  typename std::enable_if_t<std::is_same_v<ArgT, void>, KernelType *>
646  ResetHostKernel(const KernelType &KernelFunc) {
647  struct NormalizedKernelType {
648  KernelType MKernelFunc;
649  NormalizedKernelType(const KernelType &KernelFunc)
650  : MKernelFunc(KernelFunc) {}
651  void operator()(const nd_item<Dims> &Arg) {
652  (void)Arg;
653  detail::runKernelWithoutArg(MKernelFunc);
654  }
655  };
656  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
657  KernelFunc);
658  }
659 
660  // For 'sycl::group<Dims>' kernel argument
661  // 'wrapper'-based approach using 'NormalizedKernelType' struct is not used
662  // for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is not
663  // supported in ESIMD.
664  template <class KernelType, typename ArgT, int Dims>
665  std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>, KernelType *>
666  ResetHostKernel(const KernelType &KernelFunc) {
667  MHostKernel.reset(
669  return (KernelType *)(MHostKernel->getPtr());
670  }
671 
679  void verifyUsedKernelBundle(const std::string &KernelName);
680 
687  template <typename KernelName, typename KernelType, int Dims,
688  typename LambdaArgType>
689  void StoreLambda(KernelType KernelFunc) {
691  constexpr bool IsCallableWithKernelHandler =
693  LambdaArgType>::value;
694 
695  if (IsCallableWithKernelHandler && MIsHost) {
696  throw sycl::feature_not_supported(
697  "kernel_handler is not yet supported by host device.",
698  PI_ERROR_INVALID_OPERATION);
699  }
700 
701  KernelType *KernelPtr =
702  ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);
703 
704  constexpr bool KernelHasName =
705  KI::getName() != nullptr && KI::getName()[0] != '\0';
706 
707  // Some host compilers may have different captures from Clang. Currently
708  // there is no stable way of handling this when extracting the captures, so
709  // a static assert is made to fail for incompatible kernel lambdas.
710  static_assert(
711  !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
712  "Unexpected kernel lambda size. This can be caused by an "
713  "external host compiler producing a lambda with an "
714  "unexpected layout. This is a limitation of the compiler."
715  "In many cases the difference is related to capturing constexpr "
716  "variables. In such cases removing constexpr specifier aligns the "
717  "captures between the host compiler and the device compiler."
718  "\n"
719  "In case of MSVC, passing "
720  "-fsycl-host-compiler-options='/std:c++latest' "
721  "might also help.");
722 
723  // Empty name indicates that the compilation happens without integration
724  // header, so don't perform things that require it.
725  if (KernelHasName) {
726  // TODO support ESIMD in no-integration-header case too.
727  MArgs.clear();
728  extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
729  KI::getNumParams(), &KI::getParamDesc(0),
730  KI::isESIMD());
731  MKernelName = KI::getName();
732  } else {
733  // In case w/o the integration header it is necessary to process
734  // accessors from the list(which are associated with this handler) as
735  // arguments.
736  MArgs = std::move(MAssociatedAccesors);
737  }
738 
739  // If the kernel lambda is callable with a kernel_handler argument, manifest
740  // the associated kernel handler.
741  if (IsCallableWithKernelHandler) {
742  getOrInsertHandlerKernelBundle(/*Insert=*/true);
743  }
744  }
745 
749  template <typename PropertiesT =
751  void processProperties(PropertiesT Props) {
752  static_assert(
754  "Template type is not a property list.");
755  if constexpr (PropertiesT::template has_property<
757  auto Config = Props.template get_property<
760  setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM);
761  } else if (Config == sycl::ext::intel::experimental::large_data) {
762  setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA);
763  }
764  } else {
765  std::ignore = Props;
766  }
767  }
768 
773  template <int Dims_Src, int Dims_Dst>
774  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
775  const range<Dims_Dst> Dst) {
776  if (Dims_Src > Dims_Dst)
777  return false;
778  for (size_t I = 0; I < Dims_Src; ++I)
779  if (Src[I] > Dst[I])
780  return false;
781  return true;
782  }
783 
789  template <typename TSrc, int DimSrc, access::mode ModeSrc,
790  access::target TargetSrc, typename TDst, int DimDst,
791  access::mode ModeDst, access::target TargetDst,
792  access::placeholder IsPHSrc, access::placeholder IsPHDst>
793  std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
796  if (!MIsHost &&
797  IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
798  return false;
799 
800  range<1> LinearizedRange(Src.size());
801  parallel_for<
802  class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
803  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
804  LinearizedRange, [=](id<1> Id) {
805  size_t Index = Id[0];
806  id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
807  id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
808  Dst[DstId] = Src[SrcId];
809  });
810  return true;
811  }
812 
820  template <typename TSrc, int DimSrc, access::mode ModeSrc,
821  access::target TargetSrc, typename TDst, int DimDst,
822  access::mode ModeDst, access::target TargetDst,
823  access::placeholder IsPHSrc, access::placeholder IsPHDst>
824  std::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
827  if (!MIsHost)
828  return false;
829 
830  single_task<
831  class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
832  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
833  [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
834  return true;
835  }
836 
837 #ifndef __SYCL_DEVICE_ONLY__
838  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
844  access::target AccTarget, access::placeholder IsPH>
845  std::enable_if_t<(Dim > 0)>
847  TDst *Dst) {
848  range<Dim> Range = Src.get_range();
849  parallel_for<
850  class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
851  Range, [=](id<Dim> Index) {
852  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
853  using TSrcNonConst = typename std::remove_const_t<TSrc>;
854  (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
855  });
856  }
857 
863  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
864  access::target AccTarget, access::placeholder IsPH>
865  std::enable_if_t<Dim == 0>
867  TDst *Dst) {
868  single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
869  [=]() {
870  using TSrcNonConst = typename std::remove_const_t<TSrc>;
871  *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
872  });
873  }
874 
879  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
880  access::target AccTarget, access::placeholder IsPH>
881  std::enable_if_t<(Dim > 0)>
882  copyPtrToAccHost(TSrc *Src,
884  range<Dim> Range = Dst.get_range();
885  parallel_for<
886  class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
887  Range, [=](id<Dim> Index) {
888  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
889  Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
890  });
891  }
892 
898  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
899  access::target AccTarget, access::placeholder IsPH>
900  std::enable_if_t<Dim == 0>
901  copyPtrToAccHost(TSrc *Src,
903  single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
904  [=]() {
905  *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
906  });
907  }
908 #endif // __SYCL_DEVICE_ONLY__
909 
910  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
911  return AccessTarget == access::target::device ||
912  AccessTarget == access::target::constant_buffer;
913  }
914 
915  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
916  return AccessTarget == access::target::image ||
917  AccessTarget == access::target::image_array;
918  }
919 
920  constexpr static bool
921  isValidTargetForExplicitOp(access::target AccessTarget) {
922  return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
923  }
924 
925  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
926  return AccessMode == access::mode::read ||
928  }
929 
930  constexpr static bool
931  isValidModeForDestinationAccessor(access::mode AccessMode) {
932  return AccessMode == access::mode::write ||
934  AccessMode == access::mode::discard_write ||
935  AccessMode == access::mode::discard_read_write;
936  }
937 
938  // PI APIs only support select fill sizes: 1, 2, 4, 8, 16, 32, 64, 128
939  constexpr static bool isBackendSupportedFillSize(size_t Size) {
940  return Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16 ||
941  Size == 32 || Size == 64 || Size == 128;
942  }
943 
  // Maps the user-visible kernel argument type to the type used to transport
  // index information:
  //  - nd_item<Dims> if the user type is convertible from it,
  //  - otherwise item<Dims> if convertible from that,
  //  - otherwise the user's type unchanged.
  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
    using type = std::conditional_t<
        std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
        std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
                           item<Dims>, LambdaArgType>>;
  };
950 
962  template <typename KernelName, typename KernelType, int Dims,
963  typename PropertiesT =
965  void parallel_for_lambda_impl(range<Dims> NumWorkItems, PropertiesT Props,
966  KernelType KernelFunc) {
967  throwIfActionIsCreated();
968  throwOnLocalAccessorMisuse<KernelName, KernelType>();
969  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
970 
971  // If 1D kernel argument is an integral type, convert it to sycl::item<1>
972  // If user type is convertible from sycl::item/sycl::nd_item, use
973  // sycl::item/sycl::nd_item to transport item information
974  using TransformedArgType = std::conditional_t<
975  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
976  typename TransformUserItemType<Dims, LambdaArgType>::type>;
977 
978  // TODO: Properties may change the kernel function, so in order to avoid
979  // conflicts they should be included in the name.
980  using NameT =
982 
983  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
984 
985  // Range rounding can be disabled by the user.
986  // Range rounding is not done on the host device.
987  // Range rounding is supported only for newer SYCL standards.
988 #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \
989  !defined(DPCPP_HOST_DEVICE_OPENMP) && \
990  !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
991  // Range should be a multiple of this for reasonable performance.
992  size_t MinFactorX = 16;
993  // Range should be a multiple of this for improved performance.
994  size_t GoodFactorX = 32;
995  // Range should be at least this to make rounding worthwhile.
996  size_t MinRangeX = 1024;
997 
998  // Check if rounding parameters have been set through environment:
999  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1000  this->GetRangeRoundingSettings(MinFactorX, GoodFactorX, MinRangeX);
1001 
1002  // Disable the rounding-up optimizations under these conditions:
1003  // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
1004  // 2. The kernel is provided via an interoperability method.
1005  // 3. The range is already a multiple of the rounding factor.
1006  //
1007  // Cases 2 and 3 could be supported with extra effort.
1008  // As an optimization for the common case it is an
1009  // implementation choice to not support those scenarios.
1010  // Note that "this_item" is a free function, i.e. not tied to any
1011  // specific id or item. When concurrent parallel_fors are executing
1012  // on a device it is difficult to tell which parallel_for the call is
1013  // being made from. One could replicate portions of the
1014  // call-graph to make this_item calls kernel-specific but this is
1015  // not considered worthwhile.
1016 
1017  // Get the kernel name to check condition 2.
1018  std::string KName = typeid(NameT *).name();
1019  using KI = detail::KernelInfo<KernelName>;
1020  bool DisableRounding =
1021  this->DisableRangeRounding() ||
1022  (KI::getName() == nullptr || KI::getName()[0] == '\0');
1023 
1024  // Perform range rounding if rounding-up is enabled
1025  // and there are sufficient work-items to need rounding
1026  // and the user-specified range is not a multiple of a "good" value.
1027  if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
1028  (NumWorkItems[0] % MinFactorX != 0)) {
1029  // It is sufficient to round up just the first dimension.
1030  // Multiplying the rounded-up value of the first dimension
1031  // by the values of the remaining dimensions (if any)
1032  // will yield a rounded-up value for the total range.
1033  size_t NewValX =
1034  ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
1035  if (this->RangeRoundingTrace())
1036  std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
1037  << " to " << NewValX << std::endl;
1038 
1039  using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
1040  auto Wrapper =
1041  getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
1042  KernelFunc, NumWorkItems);
1043 
1044  using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
1045  decltype(Wrapper), NameWT>;
1046 
1047  range<Dims> AdjustedRange = NumWorkItems;
1048  AdjustedRange.set_range_dim0(NewValX);
1049  kernel_parallel_for_wrapper<KName, TransformedArgType, decltype(Wrapper),
1050  PropertiesT>(Wrapper);
1051 #ifndef __SYCL_DEVICE_ONLY__
1052  detail::checkValueRange<Dims>(AdjustedRange);
1053  MNDRDesc.set(std::move(AdjustedRange));
1054  StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
1055  std::move(Wrapper));
1056  setType(detail::CG::Kernel);
1057 #endif
1058  } else
1059 #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
1060  // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
1061  // SYCL_LANGUAGE_VERSION >= 202001
1062  {
1063  (void)NumWorkItems;
1064  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1065  PropertiesT>(KernelFunc);
1066 #ifndef __SYCL_DEVICE_ONLY__
1067  processProperties<PropertiesT>(Props);
1068  detail::checkValueRange<Dims>(NumWorkItems);
1069  MNDRDesc.set(std::move(NumWorkItems));
1070  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1071  std::move(KernelFunc));
1072  setType(detail::CG::Kernel);
1073 #endif
1074  }
1075  }
1076 
  // Hidden implementation of parallel_for over an nd_range: wraps the user
  // lambda as a device kernel and, on the host path, records range, lambda
  // and CG type on this handler.
  // NOTE(review): doxygen extraction stripped hyperlinked tokens here — the
  // trailing `_KERNELFUNCPARAM(KernelFunc)) {` parameter of the signature and
  // the right-hand side of `using NameT =` (upstream:
  // `typename detail::get_kernel_name_t<KernelName, KernelType>::name;`).
  // Confirm against the original handler.hpp before editing.
1090  template <typename KernelName, typename KernelType, int Dims,
1091  typename PropertiesT>
1092  void parallel_for_impl(nd_range<Dims> ExecutionRange, PropertiesT Props,
1094  throwIfActionIsCreated();
1095  // TODO: Properties may change the kernel function, so in order to avoid
1096  // conflicts they should be included in the name.
1097  using NameT =
1099  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1100  using LambdaArgType =
1101  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1102  // If user type is convertible from sycl::item/sycl::nd_item, use
1103  // sycl::item/sycl::nd_item to transport item information
1104  using TransformedArgType =
1105  typename TransformUserItemType<Dims, LambdaArgType>::type;
1106  (void)ExecutionRange;
1107  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1108  PropertiesT>(KernelFunc);
  // Host-side bookkeeping only; the device compiler only needs the wrapper
  // call above to emit the kernel.
1109 #ifndef __SYCL_DEVICE_ONLY__
1110  processProperties<PropertiesT>(Props);
1111  detail::checkValueRange<Dims>(ExecutionRange);
1112  MNDRDesc.set(std::move(ExecutionRange));
1113  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1114  std::move(KernelFunc));
1115  setType(detail::CG::Kernel);
1116 #endif
1117  }
1118 
1126  template <int Dims>
1127  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
1128  throwIfActionIsCreated();
1129  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1130  detail::checkValueRange<Dims>(NumWorkItems);
1131  MNDRDesc.set(std::move(NumWorkItems));
1132  setType(detail::CG::Kernel);
1133  extractArgsAndReqs();
1134  MKernelName = getKernelName();
1135  }
1136 
  // Hidden implementations of hierarchical parallel_for_work_group (lambda
  // variants): first overload takes only the number of work groups, second
  // additionally takes a work-group size.
  // NOTE(review): doxygen extraction stripped hyperlinked tokens — the
  // PropertiesT default argument, the `_KERNELFUNCPARAM(KernelFunc)) {`
  // parameter lines, and the right-hand side of `using NameT =`. Confirm
  // against the original handler.hpp before editing.
1147  template <typename KernelName, typename KernelType, int Dims,
1148  typename PropertiesT =
1150  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1151  PropertiesT Props,
1153  throwIfActionIsCreated();
1154  // TODO: Properties may change the kernel function, so in order to avoid
1155  // conflicts they should be included in the name.
1156  using NameT =
1158  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1159  using LambdaArgType =
1160  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1161  (void)NumWorkGroups;
1162  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1163  PropertiesT>(KernelFunc);
1164 #ifndef __SYCL_DEVICE_ONLY__
1165  processProperties<PropertiesT>(Props);
1166  detail::checkValueRange<Dims>(NumWorkGroups);
  // Only the number of work groups is known here; the runtime picks the
  // work-group size.
1167  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1168  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1169  setType(detail::CG::Kernel);
1170 #endif // __SYCL_DEVICE_ONLY__
1171  }
1172 
1185  template <typename KernelName, typename KernelType, int Dims,
1186  typename PropertiesT =
1188  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1190  PropertiesT Props,
1192  throwIfActionIsCreated();
1193  // TODO: Properties may change the kernel function, so in order to avoid
1194  // conflicts they should be included in the name.
1195  using NameT =
1197  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1198  using LambdaArgType =
1199  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1200  (void)NumWorkGroups;
1201  (void)WorkGroupSize;
1202  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1203  PropertiesT>(KernelFunc);
1204 #ifndef __SYCL_DEVICE_ONLY__
1205  processProperties<PropertiesT>(Props);
  // Global range is groups * group size; both are explicit here.
1206  nd_range<Dims> ExecRange =
1207  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1208  detail::checkValueRange<Dims>(ExecRange);
1209  MNDRDesc.set(std::move(ExecRange));
1210  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1211  setType(detail::CG::Kernel);
1212 #endif // __SYCL_DEVICE_ONLY__
1213  }
1214 
  // Marks the kernel entry points below so the device compiler treats them as
  // SYCL kernels; expands to nothing for non-SYCL compilations.
1215 #ifdef SYCL_LANGUAGE_VERSION
1216 #define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
1217 #else
1218 #define __SYCL_KERNEL_ATTR__
1219 #endif
1220 
  // Device-side kernel entry points. The exact names ("kernel_single_task",
  // "kernel_parallel_for", "kernel_parallel_for_work_group") are recognized
  // by the front end to determine the kernel invocation kind; each has a
  // plain variant and a variant taking an extra kernel_handler.
  // NOTE(review): the `__SYCL_KERNEL_ATTR__ void` return-type line preceding
  // each entry-point name appears to have been stripped by the doc extractor
  // (residual numbering jumps, e.g. 1230 -> 1232); confirm upstream.
1221  // NOTE: the name of this function - "kernel_single_task" - is used by the
1222  // Front End to determine kernel invocation kind.
1223  template <typename KernelName, typename KernelType, typename... Props>
1224 #ifdef __SYCL_DEVICE_ONLY__
1225  [[__sycl_detail__::add_ir_attributes_function(
1226  "sycl-single-task",
1227  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1228  nullptr,
1229  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1230 #endif
1232  kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
1233 #ifdef __SYCL_DEVICE_ONLY__
1234  KernelFunc();
1235 #else
  // Host compilation never calls this body; silence unused-parameter.
1236  (void)KernelFunc;
1237 #endif
1238  }
1239 
1240  // NOTE: the name of this function - "kernel_single_task" - is used by the
1241  // Front End to determine kernel invocation kind.
1242  template <typename KernelName, typename KernelType, typename... Props>
1243 #ifdef __SYCL_DEVICE_ONLY__
1244  [[__sycl_detail__::add_ir_attributes_function(
1245  "sycl-single-task",
1246  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1247  nullptr,
1248  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1249 #endif
1251  kernel_single_task(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
1252 #ifdef __SYCL_DEVICE_ONLY__
1253  KernelFunc(KH);
1254 #else
1255  (void)KernelFunc;
1256  (void)KH;
1257 #endif
1258  }
1259 
1260  // NOTE: the name of these functions - "kernel_parallel_for" - are used by the
1261  // Front End to determine kernel invocation kind.
1262  template <typename KernelName, typename ElementType, typename KernelType,
1263  typename... Props>
1264 #ifdef __SYCL_DEVICE_ONLY__
1265  [[__sycl_detail__::add_ir_attributes_function(
1266  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1267  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1268 #endif
1270  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
1271 #ifdef __SYCL_DEVICE_ONLY__
  // The front end rewrites this to pass the real item/id for each work item.
1272  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1273 #else
1274  (void)KernelFunc;
1275 #endif
1276  }
1277 
1278  // NOTE: the name of these functions - "kernel_parallel_for" - are used by the
1279  // Front End to determine kernel invocation kind.
1280  template <typename KernelName, typename ElementType, typename KernelType,
1281  typename... Props>
1282 #ifdef __SYCL_DEVICE_ONLY__
1283  [[__sycl_detail__::add_ir_attributes_function(
1284  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1285  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1286 #endif
1288  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
1289 #ifdef __SYCL_DEVICE_ONLY__
1290  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1291 #else
1292  (void)KernelFunc;
1293  (void)KH;
1294 #endif
1295  }
1296 
1297  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1298  // used by the Front End to determine kernel invocation kind.
1299  template <typename KernelName, typename ElementType, typename KernelType,
1300  typename... Props>
1301 #ifdef __SYCL_DEVICE_ONLY__
1302  [[__sycl_detail__::add_ir_attributes_function(
1303  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1304  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1305 #endif
1307  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc)) {
1308 #ifdef __SYCL_DEVICE_ONLY__
1309  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1310 #else
1311  (void)KernelFunc;
1312 #endif
1313  }
1314 
1315  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1316  // used by the Front End to determine kernel invocation kind.
1317  template <typename KernelName, typename ElementType, typename KernelType,
1318  typename... Props>
1319 #ifdef __SYCL_DEVICE_ONLY__
1320  [[__sycl_detail__::add_ir_attributes_function(
1321  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1322  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1323 #endif
1325  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc),
1326  kernel_handler KH) {
1327 #ifdef __SYCL_DEVICE_ONLY__
1328  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1329 #else
1330  (void)KernelFunc;
1331  (void)KH;
1332 #endif
1333  }
1334 
1335  template <typename... Props> struct KernelPropertiesUnpackerImpl {
1336  // Just pass extra Props... as template parameters to the underlying
1337  // Caller->* member functions. Don't have reflection so try to use
1338  // templates as much as possible to reduce the amount of boilerplate code
1339  // needed. All the type checks are expected to be done at the Caller's
1340  // methods side.
1341 
1342  template <typename... TypesToForward, typename... ArgsTy>
1343  static void kernel_single_task_unpack(handler *h, ArgsTy... Args) {
1344  h->kernel_single_task<TypesToForward..., Props...>(Args...);
1345  }
1346 
1347  template <typename... TypesToForward, typename... ArgsTy>
1348  static void kernel_parallel_for_unpack(handler *h, ArgsTy... Args) {
1349  h->kernel_parallel_for<TypesToForward..., Props...>(Args...);
1350  }
1351 
1352  template <typename... TypesToForward, typename... ArgsTy>
1353  static void kernel_parallel_for_work_group_unpack(handler *h,
1354  ArgsTy... Args) {
1355  h->kernel_parallel_for_work_group<TypesToForward..., Props...>(Args...);
1356  }
1357  };
1358 
  // Primary template: instantiating it with anything that is not a property
  // list trips the dependent static_assert below.
1359  template <typename PropertiesT>
1360  struct KernelPropertiesUnpacker : public KernelPropertiesUnpackerImpl<> {
1361  // This should always fail outside the specialization below but must be
1362  // dependent to avoid failing even if not instantiated.
1363  static_assert(
1364  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
1365  "Template type is not a property list.");
1366  };
1367 
  // Specialization for real property lists.
  // NOTE(review): the specialization argument line (upstream:
  // `ext::oneapi::experimental::detail::properties_t<Props...>>`) was
  // stripped by the doc extractor; confirm against the original header.
1368  template <typename... Props>
1369  struct KernelPropertiesUnpacker<
1371  : public KernelPropertiesUnpackerImpl<Props...> {};
1372 
1373  // Helper function to
1374  //
1375  // * Make use of the KernelPropertiesUnpacker above
1376  // * Decide if we need an extra kernel_handler parameter
1377  //
1378  // The interface uses a \p Lambda callback to propagate that information back
1379  // to the caller as we need the caller to communicate:
1380  //
1381  // * Name of the method to call
1382  // * Provide explicit template type parameters for the call
1383  //
1384  // Couldn't think of a better way to achieve both.
1385  template <typename KernelType, typename PropertiesT, bool HasKernelHandlerArg,
1386  typename FuncTy>
1387  void unpack(_KERNELFUNCPARAM(KernelFunc), FuncTy Lambda) {
1388 #ifdef __SYCL_DEVICE_ONLY__
1389  detail::CheckDeviceCopyable<KernelType>();
1390 #endif // __SYCL_DEVICE_ONLY__
1391  using MergedPropertiesT =
1392  typename detail::GetMergedKernelProperties<KernelType,
1393  PropertiesT>::type;
1394  using Unpacker = KernelPropertiesUnpacker<MergedPropertiesT>;
1395  // If there are properties provided by get method then process them.
1396  if constexpr (ext::oneapi::experimental::detail::
1397  HasKernelPropertiesGetMethod<
1398  _KERNELFUNCPARAMTYPE>::value) {
1399  processProperties(
1400  KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
1401  }
1402  if constexpr (HasKernelHandlerArg) {
1403  kernel_handler KH;
1404  Lambda(Unpacker{}, this, KernelFunc, KH);
1405  } else {
1406  Lambda(Unpacker{}, this, KernelFunc);
1407  }
1408  }
1409 
1410  // NOTE: to support kernel_handler argument in kernel lambdas, only
1411  // kernel_***_wrapper functions must be called in this code
1412 
  // Wrappers that detect whether the kernel lambda takes a kernel_handler and
  // route through unpack() to the matching kernel_* entry point.
  // NOTE(review): the default for PropertiesT was stripped by the doc
  // extractor on each wrapper (residual numbering jumps 1414 -> 1416 etc.);
  // confirm against the original header.
1413  template <typename KernelName, typename KernelType,
1414  typename PropertiesT =
1416  void kernel_single_task_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1417  unpack<KernelType, PropertiesT,
1418  detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>(
1419  KernelFunc, [&](auto Unpacker, auto... args) {
1420  Unpacker.template kernel_single_task_unpack<KernelName, KernelType>(
1421  args...);
1422  });
1423  }
1424 
1425  template <typename KernelName, typename ElementType, typename KernelType,
1426  typename PropertiesT =
1428  void kernel_parallel_for_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1429  unpack<KernelType, PropertiesT,
1430  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1431  ElementType>::value>(
1432  KernelFunc, [&](auto Unpacker, auto... args) {
1433  Unpacker.template kernel_parallel_for_unpack<KernelName, ElementType,
1434  KernelType>(args...);
1435  });
1436  }
1437 
1438  template <typename KernelName, typename ElementType, typename KernelType,
1439  typename PropertiesT =
1441  void kernel_parallel_for_work_group_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1442  unpack<KernelType, PropertiesT,
1443  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1444  ElementType>::value>(
1445  KernelFunc, [&](auto Unpacker, auto... args) {
1446  Unpacker.template kernel_parallel_for_work_group_unpack<
1447  KernelName, ElementType, KernelType>(args...);
1448  });
1449  }
1450 
  // Hidden implementation of single_task with a user lambda: emits the device
  // kernel via the wrapper and, on the host path, records a unit range, the
  // properties and the lambda on this handler.
  // NOTE(review): the PropertiesT default and the trailing
  // `_KERNELFUNCPARAM(KernelFunc)) {` parameter were stripped by the doc
  // extractor; confirm against the original header.
1458  template <typename KernelName, typename KernelType,
1459  typename PropertiesT =
1461  void single_task_lambda_impl(PropertiesT Props,
1463  throwIfActionIsCreated();
1464  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1465  // TODO: Properties may change the kernel function, so in order to avoid
1466  // conflicts they should be included in the name.
1467  using NameT =
1468  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1469  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1470  kernel_single_task_wrapper<NameT, KernelType, PropertiesT>(KernelFunc);
1471 #ifndef __SYCL_DEVICE_ONLY__
1472  // No need to check if range is out of INT_MAX limits as it's compile-time
1473  // known constant.
1474  MNDRDesc.set(range<1>{1});
1475  processProperties<PropertiesT>(Props);
1476  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1477  setType(detail::CG::Kernel);
1478 #endif
1479  }
1480 
  // Kernel-bundle state management (definitions live in the runtime library).
1481  void setStateExplicitKernelBundle();
1482  void setStateSpecConstSet();
1483  bool isStateExplicitKernelBundle() const;
1484 
  // Returns the bundle associated with this handler; when \p Insert is true a
  // bundle is created if none exists yet.
1485  std::shared_ptr<detail::kernel_bundle_impl>
1486  getOrInsertHandlerKernelBundle(bool Insert) const;
1487 
1488  void setHandlerKernelBundle(kernel Kernel);
1489 
1490  void setHandlerKernelBundle(
1491  const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);
1492 
1493  template <typename FuncT>
1494  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1495  void()>::value ||
1496  detail::check_fn_signature<std::remove_reference_t<FuncT>,
1497  void(interop_handle)>::value>
1498  host_task_impl(FuncT &&Func) {
1499  throwIfActionIsCreated();
1500 
1501  MNDRDesc.set(range<1>(1));
1502  MArgs = std::move(MAssociatedAccesors);
1503 
1504  MHostTask.reset(new detail::HostTask(std::move(Func)));
1505 
1506  setType(detail::CG::CodeplayHostTask);
1507  }
1508 
1509 public:
  // A handler is tied to a single command-group submission: neither copyable
  // nor movable.
1510  handler(const handler &) = delete;
1511  handler(handler &&) = delete;
1512  handler &operator=(const handler &) = delete;
1513  handler &operator=(handler &&) = delete;
1514 
  // Sets a specialization constant for kernels in this command group.
  // NOTE(review): the stripped line after the template header is the method
  // name (upstream: `void set_specialization_constant(`), and the stripped
  // line before `.set_specialization_constant<SpecName>` is the
  // `KernelBundleImplPtr)` argument; confirm against the original header.
1515  template <auto &SpecName>
1517  typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {
1518 
1519  setStateSpecConstSet();
1520 
1521  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1522  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1523 
1524  detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1526  .set_specialization_constant<SpecName>(Value);
1527  }
1528 
  // Reads back a specialization constant; disallowed after an explicit
  // kernel bundle was set. NOTE(review): the method-name line (upstream:
  // `get_specialization_constant() {`) was stripped by the doc extractor.
1529  template <auto &SpecName>
1530  typename std::remove_reference_t<decltype(SpecName)>::value_type
1532 
1533  if (isStateExplicitKernelBundle())
1534  throw sycl::exception(make_error_code(errc::invalid),
1535  "Specialization constants cannot be read after "
1536  "explicitly setting the used kernel bundle");
1537 
1538  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1539  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1540 
1541  return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1543  .get_specialization_constant<SpecName>();
1544  }
1545 
1546  void
1547  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
1548 
  // Requires access to a (possibly placeholder) accessor from this command
  // group. NOTE(review): the signature line carrying the accessor parameter
  // and opening brace was stripped by the doc extractor.
1557  template <typename DataT, int Dims, access::mode AccMode,
1558  access::target AccTarget, access::placeholder isPlaceholder>
1560  if (Acc.empty())
1561  throw sycl::exception(make_error_code(errc::invalid),
1562  "require() cannot be called on empty accessors");
1563  if (Acc.is_placeholder())
1564  associateWithHandler(&Acc, AccTarget);
1565  }
1566 
  // Registers event dependencies for this command group.
1570  void depends_on(event Event);
1571 
1575  void depends_on(const std::vector<event> &Events);
1576 
  // Helper aliases used by ShouldEnableSetArg below.
1577  template <typename T>
1578  using remove_cv_ref_t = typename std::remove_cv_t<std::remove_reference_t<T>>;
1579 
1580  template <typename U, typename T>
1581  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;
1582 
  // Trait gating the generic set_arg overload: trivially copyable values,
  // samplers, USM pointers and interop cl_mem handles are accepted.
  // NOTE(review): one disjunct line was stripped by the doc extractor
  // (residual numbering jumps 1589 -> 1591) — the clause that pairs with
  // `std::is_pointer_v<...>)` for the USM case; confirm upstream.
1583  template <typename T> struct ShouldEnableSetArg {
1584  static constexpr bool value =
1585  std::is_trivially_copyable_v<std::remove_reference_t<T>>
1586 #if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
1587  && std::is_standard_layout<std::remove_reference_t<T>>::value
1588 #endif
1589  || is_same_type<sampler, T>::value // Sampler
1591  std::is_pointer_v<remove_cv_ref_t<T>>) // USM
1592  || is_same_type<cl_mem, T>::value; // Interop
1593  };
1594 
  // Sets an argument for a kernel created from a pre-built kernel object.
1601  template <typename T>
1602  typename std::enable_if_t<ShouldEnableSetArg<T>::value, void>
1603  set_arg(int ArgIndex, T &&Arg) {
1604  setArgHelper(ArgIndex, std::move(Arg));
1605  }
1606 
  // Accessor overload. NOTE(review): the remaining template parameters and
  // the accessor parameter line were stripped by the doc extractor.
1607  template <typename DataT, int Dims, access::mode AccessMode,
1609  void
1610  set_arg(int ArgIndex,
1612  setArgHelper(ArgIndex, std::move(Arg));
1613  }
1614 
1615  template <typename DataT, int Dims>
1616  void set_arg(int ArgIndex, local_accessor<DataT, Dims> Arg) {
1617  setArgHelper(ArgIndex, std::move(Arg));
1618  }
1619 
  // Sets all kernel arguments at once, in order, starting from index 0.
1625  template <typename... Ts> void set_args(Ts &&...Args) {
1626  setArgsHelper(0, std::move(Args)...);
1627  }
1628 
  // Public lambda-kernel entry points forwarding to the *_impl helpers.
  // NOTE(review): the doc extractor stripped the hyperlinked signature lines
  // of these wrappers (function names, `_KERNELFUNCPARAM(KernelFunc)`
  // parameters, range<1/2/3> parameters); confirm against the original
  // header before editing.
1636  template <typename KernelName = detail::auto_name, typename KernelType>
1638  single_task_lambda_impl<KernelName>(
1640  }
1641 
1642  template <typename KernelName = detail::auto_name, typename KernelType>
1644  parallel_for_lambda_impl<KernelName>(
1646  std::move(KernelFunc));
1647  }
1648 
1649  template <typename KernelName = detail::auto_name, typename KernelType>
1651  parallel_for_lambda_impl<KernelName>(
1653  std::move(KernelFunc));
1654  }
1655 
1656  template <typename KernelName = detail::auto_name, typename KernelType>
1658  parallel_for_lambda_impl<KernelName>(
1660  std::move(KernelFunc));
1661  }
1662 
  // Deprecated: runs \p Func once on the host.
1667  template <typename FuncT>
1669  "run_on_host_intel() is deprecated, use host_task() instead")
1670  void run_on_host_intel(FuncT Func) {
1671  throwIfActionIsCreated();
1672  // No need to check if range is out of INT_MAX limits as it's compile-time
1673  // known constant
1674  MNDRDesc.set(range<1>{1});
1675 
1676  MArgs = std::move(MAssociatedAccesors);
1677  MHostKernel.reset(new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
1678  setType(detail::CG::RunOnHostIntel);
1679  }
1680 
  // Enqueues a host task; \p Func takes either no arguments or an
  // interop_handle. NOTE(review): one enable_if line (the second
  // check_fn_signature clause head) was stripped by the doc extractor.
1682  template <typename FuncT>
1683  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1684  void()>::value ||
1686  void(interop_handle)>::value>
1687  host_task(FuncT &&Func) {
1688  host_task_impl(Func);
1689  }
1690 
  // Deprecated offset variant of lambda parallel_for.
1704  template <typename KernelName = detail::auto_name, typename KernelType,
1705  int Dims>
1706  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
1707  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
1709  throwIfActionIsCreated();
1710  using NameT =
1712  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1713  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1714  (void)NumWorkItems;
1715  (void)WorkItemOffset;
1716  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1717 #ifndef __SYCL_DEVICE_ONLY__
1718  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1719  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1720  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1721  setType(detail::CG::Kernel);
1722 #endif
1723  }
1724 
  // Hierarchical parallel_for_work_group wrappers forwarding to the
  // *_lambda_impl helpers above.
1735  template <typename KernelName = detail::auto_name, typename KernelType,
1736  int Dims>
1739  parallel_for_work_group_lambda_impl<KernelName>(
1741  KernelFunc);
1742  }
1743 
1756  template <typename KernelName = detail::auto_name, typename KernelType,
1757  int Dims>
1761  parallel_for_work_group_lambda_impl<KernelName>(
1762  NumWorkGroups, WorkGroupSize,
1764  }
1765 
1772  void single_task(kernel Kernel) {
1773  throwIfActionIsCreated();
1774  // Ignore any set kernel bundles and use the one associated with the kernel
1775  setHandlerKernelBundle(Kernel);
1776  // No need to check if range is out of INT_MAX limits as it's compile-time
1777  // known constant
1778  MNDRDesc.set(range<1>{1});
1779  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1780  setType(detail::CG::Kernel);
1781  extractArgsAndReqs();
1782  MKernelName = getKernelName();
1783  }
1784 
  // Pre-built-kernel parallel_for overloads for 1/2/3 dimensions; each
  // forwards to the dimension-templated parallel_for_impl.
1785  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
1786  parallel_for_impl(NumWorkItems, Kernel);
1787  }
1788 
1789  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
1790  parallel_for_impl(NumWorkItems, Kernel);
1791  }
1792 
1793  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
1794  parallel_for_impl(NumWorkItems, Kernel);
1795  }
1796 
1805  template <int Dims>
1806  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
1807  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
1808  kernel Kernel) {
1809  throwIfActionIsCreated();
1810  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1811  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1812  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1813  setType(detail::CG::Kernel);
1814  extractArgsAndReqs();
1815  MKernelName = getKernelName();
1816  }
1817 
1826  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
1827  throwIfActionIsCreated();
1828  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1829  detail::checkValueRange<Dims>(NDRange);
1830  MNDRDesc.set(std::move(NDRange));
1831  setType(detail::CG::Kernel);
1832  extractArgsAndReqs();
1833  MKernelName = getKernelName();
1834  }
1835 
  // single_task taking both a pre-built kernel and a lambda: the lambda is
  // used when its kernel name matches; otherwise the kernel object's
  // arguments are extracted. NOTE(review): the signature line (upstream:
  // `void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {`) and
  // the right-hand side of `using NameT =` were stripped by the extractor.
1842  template <typename KernelName = detail::auto_name, typename KernelType>
1844  throwIfActionIsCreated();
1845  // Ignore any set kernel bundles and use the one associated with the kernel
1846  setHandlerKernelBundle(Kernel);
1847  using NameT =
1849  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1850  (void)Kernel;
1851  kernel_single_task<NameT>(KernelFunc);
1852 #ifndef __SYCL_DEVICE_ONLY__
1853  // No need to check if range is out of INT_MAX limits as it's compile-time
1854  // known constant
1855  MNDRDesc.set(range<1>{1});
1856  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1857  setType(detail::CG::Kernel);
1858  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1859  extractArgsAndReqs();
1860  MKernelName = getKernelName();
1861  } else
1862  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
1863 #else
1864  detail::CheckDeviceCopyable<KernelType>();
1865 #endif
1866  }
1867 
  // Deprecated: enqueues an interop task carrying \p Func.
1871  template <typename FuncT>
1872  __SYCL_DEPRECATED("interop_task() is deprecated, use host_task() instead")
1873  void interop_task(FuncT Func) {
1874 
1875  MInteropTask.reset(new detail::InteropTask(std::move(Func)));
1876  setType(detail::CG::CodeplayInteropTask);
1877  }
1878 
  // parallel_for / parallel_for_work_group overloads taking both a pre-built
  // kernel and a lambda. On the host path, the lambda is stored when its
  // kernel name matches the kernel object's; otherwise the kernel object's
  // arguments are extracted. NOTE(review): each overload lost its
  // `_KERNELFUNCPARAM(KernelFunc)) {` parameter line and the right-hand side
  // of `using NameT =` to the doc extractor; confirm against the original
  // header before editing.
1886  template <typename KernelName = detail::auto_name, typename KernelType,
1887  int Dims>
1888  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
1890  throwIfActionIsCreated();
1891  // Ignore any set kernel bundles and use the one associated with the kernel
1892  setHandlerKernelBundle(Kernel);
1893  using NameT =
1895  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1896  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1897  (void)Kernel;
1898  (void)NumWorkItems;
1899  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1900 #ifndef __SYCL_DEVICE_ONLY__
1901  detail::checkValueRange<Dims>(NumWorkItems);
1902  MNDRDesc.set(std::move(NumWorkItems));
1903  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1904  setType(detail::CG::Kernel);
1905  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1906  extractArgsAndReqs();
1907  MKernelName = getKernelName();
1908  } else
1909  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
1910  std::move(KernelFunc));
1911 #endif
1912  }
1913 
  // Deprecated offset variant.
1923  template <typename KernelName = detail::auto_name, typename KernelType,
1924  int Dims>
1925  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
1926  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
1927  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
1928  throwIfActionIsCreated();
1929  // Ignore any set kernel bundles and use the one associated with the kernel
1930  setHandlerKernelBundle(Kernel);
1931  using NameT =
1933  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1934  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1935  (void)Kernel;
1936  (void)NumWorkItems;
1937  (void)WorkItemOffset;
1938  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1939 #ifndef __SYCL_DEVICE_ONLY__
1940  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1941  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1942  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1943  setType(detail::CG::Kernel);
1944  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1945  extractArgsAndReqs();
1946  MKernelName = getKernelName();
1947  } else
1948  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
1949  std::move(KernelFunc));
1950 #endif
1951  }
1952 
  // nd_range variant.
1962  template <typename KernelName = detail::auto_name, typename KernelType,
1963  int Dims>
1964  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
1966  throwIfActionIsCreated();
1967  // Ignore any set kernel bundles and use the one associated with the kernel
1968  setHandlerKernelBundle(Kernel);
1969  using NameT =
1971  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1972  using LambdaArgType =
1973  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1974  (void)Kernel;
1975  (void)NDRange;
1976  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1977 #ifndef __SYCL_DEVICE_ONLY__
1978  detail::checkValueRange<Dims>(NDRange);
1979  MNDRDesc.set(std::move(NDRange));
1980  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1981  setType(detail::CG::Kernel);
1982  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1983  extractArgsAndReqs();
1984  MKernelName = getKernelName();
1985  } else
1986  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
1987  std::move(KernelFunc));
1988 #endif
1989  }
1990 
  // Hierarchical variant: number of work groups only.
2004  template <typename KernelName = detail::auto_name, typename KernelType,
2005  int Dims>
2006  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2008  throwIfActionIsCreated();
2009  // Ignore any set kernel bundles and use the one associated with the kernel
2010  setHandlerKernelBundle(Kernel);
2011  using NameT =
2013  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2014  using LambdaArgType =
2015  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2016  (void)Kernel;
2017  (void)NumWorkGroups;
2018  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2019 #ifndef __SYCL_DEVICE_ONLY__
2020  detail::checkValueRange<Dims>(NumWorkGroups);
2021  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2022  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2023  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2024  setType(detail::CG::Kernel);
2025 #endif // __SYCL_DEVICE_ONLY__
2026  }
2027 
  // Hierarchical variant: explicit work-group size as well.
2043  template <typename KernelName = detail::auto_name, typename KernelType,
2044  int Dims>
2045  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2048  throwIfActionIsCreated();
2049  // Ignore any set kernel bundles and use the one associated with the kernel
2050  setHandlerKernelBundle(Kernel);
2051  using NameT =
2053  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2054  using LambdaArgType =
2055  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2056  (void)Kernel;
2057  (void)NumWorkGroups;
2058  (void)WorkGroupSize;
2059  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2060 #ifndef __SYCL_DEVICE_ONLY__
2061  nd_range<Dims> ExecRange =
2062  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2063  detail::checkValueRange<Dims>(ExecRange);
2064  MNDRDesc.set(std::move(ExecRange));
2065  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2066  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2067  setType(detail::CG::Kernel);
2068 #endif // __SYCL_DEVICE_ONLY__
2069  }
2070 
  // Properties-taking overloads (sycl_ext_oneapi_kernel_properties): each is
  // SFINAE-gated on PropertiesT being a property list and forwards to the
  // corresponding *_impl. NOTE(review): the enable_if condition lines
  // (upstream: `ext::oneapi::experimental::is_property_list<PropertiesT>::
  // value>` and, for the reduction overload, the trailing property-list
  // check) were stripped by the doc extractor; confirm upstream.
2071  template <typename KernelName = detail::auto_name, typename KernelType,
2072  typename PropertiesT>
2073  std::enable_if_t<
2075  single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) {
2076  single_task_lambda_impl<KernelName, KernelType, PropertiesT>(Props,
2077  KernelFunc);
2078  }
2079 
2080  template <typename KernelName = detail::auto_name, typename KernelType,
2081  typename PropertiesT>
2082  std::enable_if_t<
2084  parallel_for(range<1> NumWorkItems, PropertiesT Props,
2086  parallel_for_lambda_impl<KernelName, KernelType, 1, PropertiesT>(
2087  NumWorkItems, Props, std::move(KernelFunc));
2088  }
2089 
2090  template <typename KernelName = detail::auto_name, typename KernelType,
2091  typename PropertiesT>
2092  std::enable_if_t<
2094  parallel_for(range<2> NumWorkItems, PropertiesT Props,
2096  parallel_for_lambda_impl<KernelName, KernelType, 2, PropertiesT>(
2097  NumWorkItems, Props, std::move(KernelFunc));
2098  }
2099 
2100  template <typename KernelName = detail::auto_name, typename KernelType,
2101  typename PropertiesT>
2102  std::enable_if_t<
2104  parallel_for(range<3> NumWorkItems, PropertiesT Props,
2106  parallel_for_lambda_impl<KernelName, KernelType, 3, PropertiesT>(
2107  NumWorkItems, Props, std::move(KernelFunc));
2108  }
2109 
2110  template <typename KernelName = detail::auto_name, typename KernelType,
2111  typename PropertiesT, int Dims>
2112  std::enable_if_t<
2114  parallel_for(nd_range<Dims> Range, PropertiesT Properties,
2116  parallel_for_impl<KernelName>(Range, Properties, std::move(KernelFunc));
2117  }
2118 
2120 
  // Reduction variant: all arguments but the last are reductions.
2121  template <typename KernelName = detail::auto_name, int Dims,
2122  typename PropertiesT, typename... RestT>
2123  std::enable_if_t<
2124  (sizeof...(RestT) > 1) &&
2125  detail::AreAllButLastReductions<RestT...>::value &&
2127  parallel_for(range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
2128  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2129  std::forward<RestT>(Rest)...);
2130  }
2131 
2132  template <typename KernelName = detail::auto_name, int Dims,
2133  typename... RestT>
2134  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2135  parallel_for(range<Dims> Range, RestT &&...Rest) {
2136  parallel_for<KernelName>(
2138  std::forward<RestT>(Rest)...);
2139  }
2140 
2141  template <typename KernelName = detail::auto_name, int Dims,
2142  typename PropertiesT, typename... RestT>
2143  std::enable_if_t<
2144  (sizeof...(RestT) > 1) &&
2145  detail::AreAllButLastReductions<RestT...>::value &&
2147  parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
2148  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2149  std::forward<RestT>(Rest)...);
2150  }
2151 
2152  template <typename KernelName = detail::auto_name, int Dims,
2153  typename... RestT>
2154  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2155  parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
2156  parallel_for<KernelName>(
2158  std::forward<RestT>(Rest)...);
2159  }
2160 
2162 
2163  template <typename KernelName = detail::auto_name, typename KernelType,
2164  int Dims, typename PropertiesT>
2165  void parallel_for_work_group(range<Dims> NumWorkGroups, PropertiesT Props,
2167  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2168  PropertiesT>(NumWorkGroups, Props,
2169  KernelFunc);
2170  }
2171 
2172  template <typename KernelName = detail::auto_name, typename KernelType,
2173  int Dims, typename PropertiesT>
2175  range<Dims> WorkGroupSize, PropertiesT Props,
2177  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2178  PropertiesT>(
2179  NumWorkGroups, WorkGroupSize, Props, KernelFunc);
2180  }
2181 
2182  // Clean up KERNELFUNC macro.
2183 #undef _KERNELFUNCPARAM
2184 
2185  // Explicit copy operations API
2186 
2194  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2195  access::target AccessTarget,
2196  access::placeholder IsPlaceholder = access::placeholder::false_t>
2198  std::shared_ptr<T_Dst> Dst) {
2199  if (Src.is_placeholder())
2200  checkIfPlaceholderIsBoundToHandler(Src);
2201 
2202  throwIfActionIsCreated();
2203  static_assert(isValidTargetForExplicitOp(AccessTarget),
2204  "Invalid accessor target for the copy method.");
2205  static_assert(isValidModeForSourceAccessor(AccessMode),
2206  "Invalid accessor mode for the copy method.");
2207  // Make sure data shared_ptr points to is not released until we finish
2208  // work with it.
2209  CGData.MSharedPtrStorage.push_back(Dst);
2210  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2211  copy(Src, RawDstPtr);
2212  }
2213 
2221  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2222  access::target AccessTarget,
2223  access::placeholder IsPlaceholder = access::placeholder::false_t>
2224  void
2225  copy(std::shared_ptr<T_Src> Src,
2227  if (Dst.is_placeholder())
2228  checkIfPlaceholderIsBoundToHandler(Dst);
2229 
2230  throwIfActionIsCreated();
2231  static_assert(isValidTargetForExplicitOp(AccessTarget),
2232  "Invalid accessor target for the copy method.");
2233  static_assert(isValidModeForDestinationAccessor(AccessMode),
2234  "Invalid accessor mode for the copy method.");
2235  // Make sure data shared_ptr points to is not released until we finish
2236  // work with it.
2237  CGData.MSharedPtrStorage.push_back(Src);
2238  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2239  copy(RawSrcPtr, Dst);
2240  }
2241 
2249  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2250  access::target AccessTarget,
2251  access::placeholder IsPlaceholder = access::placeholder::false_t>
2253  T_Dst *Dst) {
2254  if (Src.is_placeholder())
2255  checkIfPlaceholderIsBoundToHandler(Src);
2256 
2257  throwIfActionIsCreated();
2258  static_assert(isValidTargetForExplicitOp(AccessTarget),
2259  "Invalid accessor target for the copy method.");
2260  static_assert(isValidModeForSourceAccessor(AccessMode),
2261  "Invalid accessor mode for the copy method.");
2262 #ifndef __SYCL_DEVICE_ONLY__
2263  if (MIsHost) {
2264  // TODO: Temporary implementation for host. Should be handled by memory
2265  // manager.
2266  copyAccToPtrHost(Src, Dst);
2267  return;
2268  }
2269 #endif
2270  setType(detail::CG::CopyAccToPtr);
2271 
2273  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2274 
2275  CGData.MRequirements.push_back(AccImpl.get());
2276  MSrcPtr = static_cast<void *>(AccImpl.get());
2277  MDstPtr = static_cast<void *>(Dst);
2278  // Store copy of accessor to the local storage to make sure it is alive
2279  // until we finish
2280  CGData.MAccStorage.push_back(std::move(AccImpl));
2281  }
2282 
2290  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2291  access::target AccessTarget,
2292  access::placeholder IsPlaceholder = access::placeholder::false_t>
2293  void
2294  copy(const T_Src *Src,
2296  if (Dst.is_placeholder())
2297  checkIfPlaceholderIsBoundToHandler(Dst);
2298 
2299  throwIfActionIsCreated();
2300  static_assert(isValidTargetForExplicitOp(AccessTarget),
2301  "Invalid accessor target for the copy method.");
2302  static_assert(isValidModeForDestinationAccessor(AccessMode),
2303  "Invalid accessor mode for the copy method.");
2304 #ifndef __SYCL_DEVICE_ONLY__
2305  if (MIsHost) {
2306  // TODO: Temporary implementation for host. Should be handled by memory
2307  // manager.
2308  copyPtrToAccHost(Src, Dst);
2309  return;
2310  }
2311 #endif
2312  setType(detail::CG::CopyPtrToAcc);
2313 
2315  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2316 
2317  CGData.MRequirements.push_back(AccImpl.get());
2318  MSrcPtr = const_cast<T_Src *>(Src);
2319  MDstPtr = static_cast<void *>(AccImpl.get());
2320  // Store copy of accessor to the local storage to make sure it is alive
2321  // until we finish
2322  CGData.MAccStorage.push_back(std::move(AccImpl));
2323  }
2324 
2332  template <
2333  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2334  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2335  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2336  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2337  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2338  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2339  IsPlaceholder_Src>
2340  Src,
2341  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2342  IsPlaceholder_Dst>
2343  Dst) {
2344  if (Src.is_placeholder())
2345  checkIfPlaceholderIsBoundToHandler(Src);
2346  if (Dst.is_placeholder())
2347  checkIfPlaceholderIsBoundToHandler(Dst);
2348 
2349  throwIfActionIsCreated();
2350  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2351  "Invalid source accessor target for the copy method.");
2352  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2353  "Invalid destination accessor target for the copy method.");
2354  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2355  "Invalid source accessor mode for the copy method.");
2356  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2357  "Invalid destination accessor mode for the copy method.");
2358  if (Dst.get_size() < Src.get_size())
2359  throw sycl::invalid_object_error(
2360  "The destination accessor size is too small to copy the memory into.",
2361  PI_ERROR_INVALID_OPERATION);
2362 
2363  if (copyAccToAccHelper(Src, Dst))
2364  return;
2365  setType(detail::CG::CopyAccToAcc);
2366 
2367  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2368  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2369 
2370  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2371  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2372 
2373  CGData.MRequirements.push_back(AccImplSrc.get());
2374  CGData.MRequirements.push_back(AccImplDst.get());
2375  MSrcPtr = AccImplSrc.get();
2376  MDstPtr = AccImplDst.get();
2377  // Store copy of accessor to the local storage to make sure it is alive
2378  // until we finish
2379  CGData.MAccStorage.push_back(std::move(AccImplSrc));
2380  CGData.MAccStorage.push_back(std::move(AccImplDst));
2381  }
2382 
2387  template <typename T, int Dims, access::mode AccessMode,
2388  access::target AccessTarget,
2389  access::placeholder IsPlaceholder = access::placeholder::false_t>
2390  void
2392  if (Acc.is_placeholder())
2393  checkIfPlaceholderIsBoundToHandler(Acc);
2394 
2395  throwIfActionIsCreated();
2396  static_assert(isValidTargetForExplicitOp(AccessTarget),
2397  "Invalid accessor target for the update_host method.");
2398  setType(detail::CG::UpdateHost);
2399 
2401  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2402 
2403  MDstPtr = static_cast<void *>(AccImpl.get());
2404  CGData.MRequirements.push_back(AccImpl.get());
2405  CGData.MAccStorage.push_back(std::move(AccImpl));
2406  }
2407 
2416  template <typename T, int Dims, access::mode AccessMode,
2417  access::target AccessTarget,
2418  access::placeholder IsPlaceholder = access::placeholder::false_t,
2419  typename PropertyListT = property_list>
2420  void
2422  Dst,
2423  const T &Pattern) {
2424  assert(!MIsHost && "fill() should no longer be callable on a host device.");
2425 
2426  if (Dst.is_placeholder())
2427  checkIfPlaceholderIsBoundToHandler(Dst);
2428 
2429  throwIfActionIsCreated();
2430  // TODO add check:T must be an integral scalar value or a SYCL vector type
2431  static_assert(isValidTargetForExplicitOp(AccessTarget),
2432  "Invalid accessor target for the fill method.");
2433  if constexpr (isBackendSupportedFillSize(sizeof(T)) &&
2434  (Dims <= 1 || isImageOrImageArray(AccessTarget))) {
2435  setType(detail::CG::Fill);
2436 
2438  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2439 
2440  MDstPtr = static_cast<void *>(AccImpl.get());
2441  CGData.MRequirements.push_back(AccImpl.get());
2442  CGData.MAccStorage.push_back(std::move(AccImpl));
2443 
2444  MPattern.resize(sizeof(T));
2445  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
2446  *PatternPtr = Pattern;
2447  } else if constexpr (Dims == 0) {
2448  // Special case for zero-dim accessors.
2449  parallel_for<
2450  class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2451  range<1>(1), [=](id<1>) { Dst = Pattern; });
2452  } else {
2453  range<Dims> Range = Dst.get_range();
2454  parallel_for<
2455  class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2456  Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
2457  }
2458  }
2459 
2466  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2467  throwIfActionIsCreated();
2468  static_assert(std::is_trivially_copyable<T>::value,
2469  "Pattern must be trivially copyable");
2470  parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2471  T *CastedPtr = static_cast<T *>(Ptr);
2472  CastedPtr[Index] = Pattern;
2473  });
2474  }
2475 
2480  throwIfActionIsCreated();
2481  setType(detail::CG::Barrier);
2482  }
2483 
  /// Deprecated alias of ext_oneapi_barrier().
  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
  void barrier() { ext_oneapi_barrier(); }

  /// Prevents any commands submitted afterward to this queue from executing
  /// until all events in \p WaitList have entered the complete state.
  ///
  /// \param WaitList is a vector of valid SYCL events that need to complete
  /// before the barrier command can be executed.
  void ext_oneapi_barrier(const std::vector<event> &WaitList);

  /// Deprecated alias of the event-list ext_oneapi_barrier() above.
  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
  void barrier(const std::vector<event> &WaitList);

  /// Copies data from one memory region to another; \p Src and \p Dest are
  /// USM pointers.
  ///
  /// \param Dest is a USM pointer to the destination memory.
  /// \param Src is a USM pointer to the source memory.
  /// \param Count is a number of bytes to copy.
  void memcpy(void *Dest, const void *Src, size_t Count);
2518 
2529  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2530  this->memcpy(Dest, Src, Count * sizeof(T));
2531  }
2532 
  /// Fills the memory pointed by a USM pointer with the value specified.
  ///
  /// \param Dest is a USM pointer to the destination memory.
  /// \param Value is a value to be set; interpreted as a byte pattern.
  /// \param Count is a number of bytes to fill.
  void memset(void *Dest, int Value, size_t Count);

  /// Hints the runtime that data should be made available on a device earlier
  /// than Unified Shared Memory would normally require it to be available.
  ///
  /// \param Ptr is a USM pointer to the memory to be prefetched.
  /// \param Count is a number of bytes to be prefetched.
  void prefetch(const void *Ptr, size_t Count);

  /// Provides additional information to the underlying runtime about how
  /// different allocations are used.
  ///
  /// \param Ptr is a USM pointer to the allocation.
  /// \param Length is a number of bytes in the allocation.
  /// \param Advice is a device-defined advice for the specified allocation.
  void mem_advise(const void *Ptr, size_t Length, int Advice);
2558 
2576  template <typename T = unsigned char,
2577  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2578  void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src,
2579  size_t SrcPitch, size_t Width, size_t Height) {
2580  throwIfActionIsCreated();
2581  if (Width > DestPitch)
2582  throw sycl::exception(sycl::make_error_code(errc::invalid),
2583  "Destination pitch must be greater than or equal "
2584  "to the width specified in 'ext_oneapi_memcpy2d'");
2585  if (Width > SrcPitch)
2586  throw sycl::exception(sycl::make_error_code(errc::invalid),
2587  "Source pitch must be greater than or equal "
2588  "to the width specified in 'ext_oneapi_memcpy2d'");
2589 
2590  // Get the type of the pointers.
2591  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2592  usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
2593  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2594  bool SrcIsHost =
2595  SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
2596  bool DestIsHost = DestAllocType == usm::alloc::unknown ||
2597  DestAllocType == usm::alloc::host;
2598 
2599  // Do the following:
2600  // 1. If both are host, use host_task to copy.
2601  // 2. If either pointer is host or the backend supports native memcpy2d, use
2602  // special command.
2603  // 3. Otherwise, launch a kernel for copying.
2604  if (SrcIsHost && DestIsHost) {
2605  commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
2606  Height);
2607  } else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
2608  ext_oneapi_memcpy2d_impl(Dest, DestPitch, Src, SrcPitch, Width, Height);
2609  } else {
2610  commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
2611  Height);
2612  }
2613  }
2614 
2629  template <typename T>
2630  void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest,
2631  size_t DestPitch, size_t Width, size_t Height) {
2632  if (Width > DestPitch)
2633  throw sycl::exception(sycl::make_error_code(errc::invalid),
2634  "Destination pitch must be greater than or equal "
2635  "to the width specified in 'ext_oneapi_copy2d'");
2636  if (Width > SrcPitch)
2637  throw sycl::exception(sycl::make_error_code(errc::invalid),
2638  "Source pitch must be greater than or equal "
2639  "to the width specified in 'ext_oneapi_copy2d'");
2640 
2641  // Get the type of the pointers.
2642  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2643  usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
2644  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2645  bool SrcIsHost =
2646  SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
2647  bool DestIsHost = DestAllocType == usm::alloc::unknown ||
2648  DestAllocType == usm::alloc::host;
2649 
2650  // Do the following:
2651  // 1. If both are host, use host_task to copy.
2652  // 2. If either pointer is host or of the backend supports native memcpy2d,
2653  // use special command.
2654  // 3. Otherwise, launch a kernel for copying.
2655  if (SrcIsHost && DestIsHost) {
2656  commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
2657  Height);
2658  } else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
2659  ext_oneapi_memcpy2d_impl(Dest, DestPitch * sizeof(T), Src,
2660  SrcPitch * sizeof(T), Width * sizeof(T), Height);
2661  } else {
2662  commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
2663  Height);
2664  }
2665  }
2666 
2682  template <typename T = unsigned char,
2683  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2684  void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value,
2685  size_t Width, size_t Height) {
2686  throwIfActionIsCreated();
2687  if (Width > DestPitch)
2688  throw sycl::exception(sycl::make_error_code(errc::invalid),
2689  "Destination pitch must be greater than or equal "
2690  "to the width specified in 'ext_oneapi_memset2d'");
2691  T CharVal = static_cast<T>(Value);
2692 
2693  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2694  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2695 
2696  // If the backends supports 2D fill we use that. Otherwise we use a fallback
2697  // kernel. If the target is on host we will always do the operation on host.
2698  if (DestAllocType == usm::alloc::unknown ||
2699  DestAllocType == usm::alloc::host)
2700  commonUSMFill2DFallbackHostTask(Dest, DestPitch, CharVal, Width, Height);
2701  else if (supportsUSMMemset2D())
2702  ext_oneapi_memset2d_impl(Dest, DestPitch, Value, Width, Height);
2703  else
2704  commonUSMFill2DFallbackKernel(Dest, DestPitch, CharVal, Width, Height);
2705  }
2706 
2719  template <typename T>
2720  void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern,
2721  size_t Width, size_t Height) {
2722  throwIfActionIsCreated();
2723  static_assert(std::is_trivially_copyable<T>::value,
2724  "Pattern must be trivially copyable");
2725  if (Width > DestPitch)
2726  throw sycl::exception(sycl::make_error_code(errc::invalid),
2727  "Destination pitch must be greater than or equal "
2728  "to the width specified in 'ext_oneapi_fill2d'");
2729 
2730  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2731  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2732 
2733  // If the backends supports 2D fill we use that. Otherwise we use a fallback
2734  // kernel. If the target is on host we will always do the operation on host.
2735  if (DestAllocType == usm::alloc::unknown ||
2736  DestAllocType == usm::alloc::host)
2737  commonUSMFill2DFallbackHostTask(Dest, DestPitch, Pattern, Width, Height);
2738  else if (supportsUSMFill2D())
2739  ext_oneapi_fill2d_impl(Dest, DestPitch, &Pattern, sizeof(T), Width,
2740  Height);
2741  else
2742  commonUSMFill2DFallbackKernel(Dest, DestPitch, Pattern, Width, Height);
2743  }
2744 
2754  template <typename T, typename PropertyListT>
2756  const void *Src, size_t NumBytes = sizeof(T),
2757  size_t DestOffset = 0) {
2758  if (sizeof(T) < DestOffset + NumBytes)
2759  throw sycl::exception(make_error_code(errc::invalid),
2760  "Copy to device_global is out of bounds.");
2761 
2762  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
2764 
2765  if (!detail::isDeviceGlobalUsedInKernel(&Dest)) {
2766  // If the corresponding device_global isn't used in any kernels, we fall
2767  // back to doing the memory operation on host-only.
2768  memcpyToHostOnlyDeviceGlobal(&Dest, Src, sizeof(T), IsDeviceImageScoped,
2769  NumBytes, DestOffset);
2770  return;
2771  }
2772 
2773  memcpyToDeviceGlobal(&Dest, Src, IsDeviceImageScoped, NumBytes, DestOffset);
2774  }
2775 
2785  template <typename T, typename PropertyListT>
2786  void
2787  memcpy(void *Dest,
2789  size_t NumBytes = sizeof(T), size_t SrcOffset = 0) {
2790  if (sizeof(T) < SrcOffset + NumBytes)
2791  throw sycl::exception(make_error_code(errc::invalid),
2792  "Copy from device_global is out of bounds.");
2793 
2794  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
2796 
2798  // If the corresponding device_global isn't used in any kernels, we fall
2799  // back to doing the memory operation on host-only.
2800  memcpyFromHostOnlyDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
2801  SrcOffset);
2802  return;
2803  }
2804 
2805  memcpyFromDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
2806  SrcOffset);
2807  }
2808 
2819  template <typename T, typename PropertyListT>
2820  void copy(const std::remove_all_extents_t<T> *Src,
2822  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
2823  size_t StartIndex = 0) {
2824  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
2825  StartIndex * sizeof(std::remove_all_extents_t<T>));
2826  }
2827 
2838  template <typename T, typename PropertyListT>
2839  void
2841  std::remove_all_extents_t<T> *Dest,
2842  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
2843  size_t StartIndex = 0) {
2844  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
2845  StartIndex * sizeof(std::remove_all_extents_t<T>));
2846  }
2847 
private:
  // Pimpl with handler implementation details not exposed in the ABI surface.
  std::shared_ptr<detail::handler_impl> MImpl;
  // The queue the command-group built by this handler will be submitted to.
  std::shared_ptr<detail::queue_impl> MQueue;
  // Storage (requirements, accessor impls, shared_ptrs, events) that keeps
  // objects referenced by the command-group alive until it finishes.
  mutable detail::CG::StorageInitHelper CGData;
  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
  // The list of arguments for the kernel.
  std::vector<detail::ArgDesc> MArgs;
  // Accessors associated with this handler, either created with it as an
  // argument or registered via require() (see
  // checkIfPlaceholderIsBoundToHandler below, which scans this list).
  std::vector<detail::ArgDesc> MAssociatedAccesors;
  // Execution-range descriptor filled by the *_for methods above.
  detail::NDRDescT MNDRDesc;
  std::string MKernelName;
  // Implementation of a sycl::kernel when a pre-compiled kernel is used.
  std::shared_ptr<detail::kernel_impl> MKernel;
  // Type of the command group (kernel, fill, copy, barrier, ...); set by
  // setType() in the methods above.
  detail::CG::CGTYPE MCGType = detail::CG::None;
  // Source pointer/accessor-impl for copy-like commands.
  void *MSrcPtr = nullptr;
  // Destination pointer/accessor-impl for copy/fill/update_host commands.
  void *MDstPtr = nullptr;
  size_t MLength = 0;
  // Byte pattern used when the command type is fill (see fill() above).
  std::vector<char> MPattern;
  // Storage for a lambda or function object (kernel on host).
  std::unique_ptr<detail::HostKernelBase> MHostKernel;
  // Storage for the lambda/function when using a host task.
  std::unique_ptr<detail::HostTask> MHostTask;
  // Storage for a lambda or function when using InteropTasks
  std::unique_ptr<detail::InteropTask> MInteropTask;
  // Events a barrier command waits for.
  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;

  // True when the handler targets the host device (see MIsHost checks in the
  // copy/fill methods above).
  bool MIsHost = false;

  // Code location of the submission, for diagnostics.
  detail::code_location MCodeLoc = {};
  // presumably set once finalize() has run — TODO confirm (finalize is only
  // referenced via the queue_impl friendship below).
  bool MIsFinalized = false;
  event MLastEvent;
2896 
  // NOTE(review): several friend declarations in this region appear truncated
  // by documentation extraction (missing continuation lines); verify this
  // listing against the upstream header before relying on it.
  // Make queue_impl class friend to be able to call finalize method.
  friend class detail::queue_impl;
  // Make accessor class friend to keep the list of associated accessors.
  template <typename DataT, int Dims, access::mode AccMode,
            access::target AccTarget, access::placeholder isPlaceholder,
            typename PropertyListT>
  friend class accessor;

  template <typename DataT, int Dimensions, access::mode AccessMode,
  // Make stream class friend to be able to keep the list of associated streams
  friend class stream;
  friend class detail::stream_impl;
  // Make reduction friends to store buffers and arrays created for it
  // in handler from reduction methods.
  template <typename T, class BinaryOperation, int Dims, size_t Extent,
            bool ExplicitIdentity, typename RedOutVar>

  friend inline void detail::reduction::finalizeHandler(handler &CGH);
  template <class FunctorTy>
  friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);

  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
            typename PropertiesT, typename... RestT>
  friend void detail::reduction_parallel_for(handler &CGH, range<Dims> NDRange,
                                             PropertiesT Properties,
                                             RestT... Rest);

  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
            typename PropertiesT, typename... RestT>
  friend void
      PropertiesT Properties, RestT... Rest);

#ifndef __SYCL_DEVICE_ONLY__
  friend void detail::associateWithHandler(handler &,
                                           access::target);
#endif

  friend class ::MockHandler;
  friend class detail::queue_impl;

  // Make pipe class friend to be able to call ext_intel_read/write_host_pipe
  // method.
  template <class _name, class _dataT, int32_t _min_capacity,
            class _propertiesT, class>

  /// Reads into \p Ptr from the host pipe registered under \p Name.
  ///
  /// \param Name is the registered name of the host pipe.
  /// \param Ptr is the destination buffer; \p Size is its size in bytes.
  /// \param Block selects blocking behavior — TODO confirm exact semantics
  /// against the implementation.
  void ext_intel_read_host_pipe(const std::string &Name, void *Ptr, size_t Size,
                                bool Block = false);

  /// Writes \p Size bytes from \p Ptr to the host pipe registered under
  /// \p Name. \p Block selects blocking behavior — TODO confirm.
  void ext_intel_write_host_pipe(const std::string &Name, void *Ptr,
                                 size_t Size, bool Block = false);
2966 
  // True when range-rounding is disabled (environment/config controlled in the
  // implementation — see the .cpp for the exact source of the setting).
  bool DisableRangeRounding();

  // True when range-rounding tracing output is enabled.
  bool RangeRoundingTrace();

  // Retrieves the tuning parameters used by the range-rounding heuristic.
  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
                                size_t &MinRange);
2973 
2974  template <typename WrapperT, typename TransformedArgType, int Dims,
2975  typename KernelType,
2977  KernelType, TransformedArgType>::value> * = nullptr>
2978  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2979  range<Dims> NumWorkItems) {
2980  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
2981  KernelType>(NumWorkItems,
2982  KernelFunc);
2983  }
2984 
2985  template <typename WrapperT, typename TransformedArgType, int Dims,
2986  typename KernelType,
2987  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
2988  KernelType, TransformedArgType>::value> * = nullptr>
2989  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2990  range<Dims> NumWorkItems) {
2992  NumWorkItems, KernelFunc);
2993  }
2994 
  // Returns the context implementation associated with this handler's queue.
  const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;

  // Checks if 2D memory operations are supported by the underlying platform.
  bool supportsUSMMemcpy2D();
  bool supportsUSMFill2D();
  bool supportsUSMMemset2D();

  // Helper function for getting a loose bound on work-items.
  // NOTE(review): call sites above pass (Height, Width) — the parameter names
  // here appear swapped relative to usage; verify against the definition.
  id<2> computeFallbackKernelBounds(size_t Width, size_t Height);
3004 
3005  // Common function for launching a 2D USM memcpy kernel to avoid redefinitions
3006  // of the kernel from copy and memcpy.
3007  template <typename T>
3008  void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
3009  void *Dest, size_t DestPitch, size_t Width,
3010  size_t Height) {
3011  // Otherwise the data is accessible on the device so we do the operation
3012  // there instead.
3013  // Limit number of work items to be resistant to big copies.
3014  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3015  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3016  parallel_for<class __usmmemcpy2d<T>>(
3017  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3018  T *CastedDest = static_cast<T *>(Dest);
3019  const T *CastedSrc = static_cast<const T *>(Src);
3020  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3021  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3022  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3023  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3024  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3025  CastedSrc[adjustedIndex[0] * SrcPitch + adjustedIndex[1]];
3026  }
3027  }
3028  }
3029  });
3030  }
3031 
3032  // Common function for launching a 2D USM memcpy host-task to avoid
3033  // redefinitions of the kernel from copy and memcpy.
3034  template <typename T>
3035  void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
3036  void *Dest, size_t DestPitch,
3037  size_t Width, size_t Height) {
3038  // If both pointers are host USM or unknown (assumed non-USM) we use a
3039  // host-task to satisfy dependencies.
3040  host_task([=] {
3041  const T *CastedSrc = static_cast<const T *>(Src);
3042  T *CastedDest = static_cast<T *>(Dest);
3043  for (size_t I = 0; I < Height; ++I) {
3044  const T *SrcItBegin = CastedSrc + SrcPitch * I;
3045  T *DestItBegin = CastedDest + DestPitch * I;
3046  std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
3047  }
3048  });
3049  }
3050 
3051  // Common function for launching a 2D USM fill kernel to avoid redefinitions
3052  // of the kernel from memset and fill.
3053  template <typename T>
3054  void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
3055  const T &Pattern, size_t Width,
3056  size_t Height) {
3057  // Otherwise the data is accessible on the device so we do the operation
3058  // there instead.
3059  // Limit number of work items to be resistant to big fill operations.
3060  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3061  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3062  parallel_for<class __usmfill2d<T>>(
3063  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3064  T *CastedDest = static_cast<T *>(Dest);
3065  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3066  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3067  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3068  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3069  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3070  Pattern;
3071  }
3072  }
3073  }
3074  });
3075  }
3076 
3077  // Common function for launching a 2D USM fill kernel or host_task to avoid
3078  // redefinitions of the kernel from memset and fill.
3079  template <typename T>
3080  void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
3081  const T &Pattern, size_t Width,
3082  size_t Height) {
3083  // If the pointer is host USM or unknown (assumed non-USM) we use a
3084  // host-task to satisfy dependencies.
3085  host_task([=] {
3086  T *CastedDest = static_cast<T *>(Dest);
3087  for (size_t I = 0; I < Height; ++I) {
3088  T *ItBegin = CastedDest + DestPitch * I;
3089  std::fill(ItBegin, ItBegin + Width, Pattern);
3090  }
3091  });
3092  }
3093 
  // Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
  // Pitches and width are expressed in bytes (callers scale element counts).
  void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
                                size_t SrcPitch, size_t Width, size_t Height);

  // Untemplated version of ext_oneapi_fill2d using command for native 2D fill.
  // The pattern is passed as an opaque (pointer, size) pair.
  void ext_oneapi_fill2d_impl(void *Dest, size_t DestPitch, const void *Value,
                              size_t ValueSize, size_t Width, size_t Height);

  // Implementation of ext_oneapi_memset2d using command for native 2D memset.
  void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value,
                                size_t Width, size_t Height);

  // Implementation of memcpy to device_global.
  void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src,
                            bool IsDeviceImageScoped, size_t NumBytes,
                            size_t Offset);

  // Implementation of memcpy from device_global.
  void memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
                              bool IsDeviceImageScoped, size_t NumBytes,
                              size_t Offset);

  // Implementation of memcpy to an unregistered device_global.
  // DeviceGlobalTSize is the full size of the device_global's value type
  // (callers pass sizeof(T)).
  void memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
                                    const void *Src, size_t DeviceGlobalTSize,
                                    bool IsDeviceImageScoped, size_t NumBytes,
                                    size_t Offset);

  // Implementation of memcpy from an unregistered device_global.
  void memcpyFromHostOnlyDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
                                      bool IsDeviceImageScoped, size_t NumBytes,
                                      size_t Offset);
3126 
3127  template <typename T, int Dims, access::mode AccessMode,
3128  access::target AccessTarget,
3129  access::placeholder IsPlaceholder = access::placeholder::false_t,
3130  typename PropertyListT = property_list>
3131  void checkIfPlaceholderIsBoundToHandler(
3132  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3133  Acc) {
3134  auto *AccBase = reinterpret_cast<detail::AccessorBaseHost *>(&Acc);
3135  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3136  detail::AccessorImplHost *Req = AccImpl.get();
3137  if (std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
3138  [&](const detail::ArgDesc &AD) {
3139  return AD.MType ==
3140  detail::kernel_param_kind_t::kind_accessor &&
3141  AD.MPtr == Req &&
3142  AD.MSize == static_cast<int>(AccessTarget);
3143  }) == MAssociatedAccesors.end())
3144  throw sycl::exception(make_error_code(errc::kernel_argument),
3145  "placeholder accessor must be bound by calling "
3146  "handler::require() before it can be used.");
3147  }
3148 
  // Set value of the gpu cache configuration (e.g. SLM/data preference) for
  // the kernel via the PI layer.
  void setKernelCacheConfig(RT::PiKernelCacheConfig);
3151 };
3152 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
3153 } // namespace sycl
sycl::_V1::handler::copy
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2197
sycl::_V1::IsPlaceholder
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS IsPlaceholder
Definition: accessor.hpp:3060
sycl::_V1::handler::get_specialization_constant
std::remove_reference_t< decltype(SpecName)>::value_type get_specialization_constant() const
Definition: handler.hpp:1531
sycl::_V1::property_list
Objects of the property_list class are containers for the SYCL properties.
Definition: property_list.hpp:24
sycl::_V1::detail::AccessorImplPtr
std::shared_ptr< AccessorImplHost > AccessorImplPtr
Definition: accessor.hpp:494
sycl::_V1::__SYCL2020_DEPRECATED
signed char __SYCL2020_DEPRECATED
Definition: aliases.hpp:96
sycl::_V1::detail::RoundedRangeKernelWithKH::operator()
void operator()(TransformedArgType Arg, kernel_handler KH) const
Definition: handler.hpp:275
sycl::_V1::detail::associateWithHandler
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
Definition: handler_proxy.cpp:17
sycl::_V1::detail::RoundedRangeKernel::operator()
void operator()(TransformedArgType Arg) const
Definition: handler.hpp:257
__usmmemcpy2d
Definition: handler.hpp:67
sycl::_V1::detail::LocalAccessorImplHost
Definition: accessor_impl.hpp:131
property_list.hpp
sycl::_V1::detail::image_accessor
Definition: accessor.hpp:701
__usmfill
Definition: handler.hpp:65
cg.hpp
sycl::_V1::handler::ext_oneapi_barrier
void ext_oneapi_barrier()
Prevents any commands submitted afterward to this queue from executing until all commands previously ...
Definition: handler.hpp:2479
sycl::_V1::access::mode
mode
Definition: access.hpp:30
PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
Definition: pi.h:663
sycl::_V1::detail::runKernelWithoutArg
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType >::value > runKernelWithoutArg(KernelType KernelName)
Definition: cg_types.hpp:183
sycl::_V1::handler::require
void require(accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder > Acc)
Requires access to the memory object associated with the placeholder accessor.
Definition: handler.hpp:1559
sycl::_V1::get_pointer_type
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
Definition: usm_impl.cpp:577
sycl::_V1::ext::intel::experimental::prefetch
prefetch_impl< _B > prefetch
Definition: fpga_lsu.hpp:45
sycl::_V1::handler::memcpy
void memcpy(void *Dest, const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, size_t NumBytes=sizeof(T), size_t SrcOffset=0)
Copies data from a device_global to USM memory.
Definition: handler.hpp:2787
T
sycl::_V1::detail::reduction_impl_algo
Definition: reduction.hpp:813
sycl::_V1::pipe
ext::intel::pipe< name, dataT, min_capacity > pipe
Definition: pipes.hpp:16
sycl::_V1::nd_item::get_global_id
id< dimensions > get_global_id() const
Definition: nd_item.hpp:44
sycl::_V1::detail::kernel_param_desc_t
Definition: kernel_desc.hpp:48
sycl::_V1::make_error_code
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:94
sycl::_V1::detail::auto_name
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:40
item.hpp
sycl::_V1::handler::set_arg
std::enable_if_t< ShouldEnableSetArg< T >::value, void > set_arg(int ArgIndex, T &&Arg)
Sets argument for OpenCL interoperability kernels.
Definition: handler.hpp:1603
__copyAcc2Acc
Definition: handler.hpp:85
sycl::_V1::detail::GetMergedKernelProperties
Definition: handler.hpp:168
sycl::_V1::handler::parallel_for
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< Dims > Range, RestT &&...Rest)
Definition: handler.hpp:2135
sycl::_V1::detail::kernel_param_kind_t
kernel_param_kind_t
Definition: kernel_desc.hpp:37
stl.hpp
cg_types.hpp
sycl::_V1::detail::AccessorImplHost
Definition: accessor_impl.hpp:42
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::handler::parallel_for_work_group
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2006
sycl::_V1::detail::lambda_arg_type
decltype(argument_helper< F, SuggestedArgType >(0)) lambda_arg_type
Definition: handler.hpp:135
sycl::_V1::detail::pi::PiKernelCacheConfig
::pi_kernel_cache_config PiKernelCacheConfig
Definition: pi.hpp:150
sycl::_V1::detail::InteropTask
Definition: cg_types.hpp:220
sycl::_V1::handler::parallel_for
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2147
PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
Definition: pi.h:665
handler_proxy.hpp
sycl::_V1::ext::oneapi::experimental::properties
Definition: properties.hpp:125
sycl::_V1::detail::LocalAccessorBaseHost
Definition: accessor.hpp:551
sycl::_V1::detail::memcpy
void memcpy(void *Dst, const void *Src, size_t Size)
Definition: memcpy.hpp:16
sycl::_V1::detail::KernelLambdaHasKernelHandlerArgT
Definition: cg_types.hpp:174
sycl::_V1::handler::ext_oneapi_memcpy2d
void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, size_t Width, size_t Height)
Copies data from one 2D memory region to another, both pointed by USM pointers.
Definition: handler.hpp:2578
__copyAcc2Ptr
Definition: handler.hpp:72
context.hpp
_KERNELFUNCPARAM
#define _KERNELFUNCPARAM(a)
Definition: handler.hpp:58
sycl::_V1::detail::runKernelWithArg
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType, ArgType >::value > runKernelWithArg(KernelType KernelName, ArgType Arg)
Definition: cg_types.hpp:197
event.hpp
sycl::_V1::handler::host_task
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:1687
os_util.hpp
properties.hpp
sycl::_V1::Dimensions
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS Dimensions
Definition: accessor.hpp:3059
sycl::_V1::handler::parallel_for
void parallel_for(nd_range< Dims > NDRange, kernel Kernel)
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:1826
sycl::_V1::ext::oneapi::experimental::detail::WorkGroupSize
@ WorkGroupSize
Definition: property.hpp:172
sycl::_V1::handler::ext_oneapi_memset2d
void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height)
Fills the memory pointed by a USM pointer with the value specified.
Definition: handler.hpp:2684
sycl
---- Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::detail::HostKernel
Definition: cg_types.hpp:245
sycl::_V1::handler::parallel_for_work_group
void parallel_for_work_group(range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-...
Definition: handler.hpp:1737
sycl::_V1::detail::RoundedRangeKernelWithKH::RoundedRangeKernelWithKH
RoundedRangeKernelWithKH(range< Dims > NumWorkItems, KernelType KernelFunc)
Definition: handler.hpp:272
max
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
sycl::_V1::event
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:40
sycl::_V1::detail::KernelBundleImplPtr
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
Definition: kernel_bundle.hpp:138
sycl::_V1::handler::parallel_for
void parallel_for(range< 1 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:1785
access.hpp
sycl::_V1::handler::single_task
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:1637
sycl::_V1::handler::parallel_for
void parallel_for(range< 3 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1657
sycl::_V1::handler::set_args
void set_args(Ts &&...Args)
Sets arguments for OpenCL interoperability kernels.
Definition: handler.hpp:1625
sycl::_V1::handler::single_task
void single_task(kernel Kernel)
Invokes a SYCL kernel.
Definition: handler.hpp:1772
sycl::_V1::id
A unique identifier of an item in an index space.
Definition: array.hpp:17
id.hpp
sycl::_V1::handler::parallel_for
void parallel_for(range< 2 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1650
interop_handle.hpp
sycl::_V1::detail::check_fn_signature
Definition: cg_types.hpp:126
sycl::_V1::detail::reduction::finalizeHandler
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1147
sycl::_V1::kernel
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:72
sycl::_V1::detail::write
void write(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize, unsigned WIOffset, const char *Str, unsigned Len, unsigned Padding=0)
Definition: stream.hpp:110
sycl::_V1::handler::parallel_for
void parallel_for(kernel Kernel, range< Dims > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range.
Definition: handler.hpp:1888
sycl::_V1::detail::reduction_parallel_for
void reduction_parallel_for(handler &CGH, nd_range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2675
sycl::_V1::range
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: buffer.hpp:28
nd_range.hpp
sycl::_V1::ext::intel::esimd::barrier
__ESIMD_API void barrier()
Generic work-group barrier.
Definition: memory.hpp:1716
sycl::_V1::detail::GetMergedKernelProperties< KernelType, PropertiesT, std::enable_if_t< ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< KernelType >::value > >::type
ext::oneapi::experimental::detail::merged_properties_t< PropertiesT, get_method_properties > type
Definition: handler.hpp:184
export.hpp
sycl::_V1::access::placeholder
placeholder
Definition: access.hpp:45
sycl::_V1::nd_item
Identifies an instance of the function object executing at each point in an nd_range.
Definition: helpers.hpp:33
sycl::_V1::ext::oneapi::experimental::has_property
static constexpr bool has_property()
Definition: annotated_arg.hpp:162
sycl::_V1::handler::copy
void copy(const std::remove_all_extents_t< T > *Src, ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a USM memory region to a device_global.
Definition: handler.hpp:2820
sycl::_V1::handler::parallel_for
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2094
sycl::_V1::detail::stream_impl
Definition: stream_impl.hpp:25
sycl::_V1::handler::parallel_for
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1643
sycl::_V1::handler::parallel_for_work_group
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2045
kernel.hpp
sycl::_V1::handler::parallel_for_work_group
void parallel_for_work_group(range< Dims > NumWorkGroups, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
}@
Definition: handler.hpp:2165
sycl::_V1::handler::parallel_for
void parallel_for(range< 3 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:1793
sycl::_V1::handler::ext_oneapi_fill2d
void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern, size_t Width, size_t Height)
Fills the memory pointed by a USM pointer with the value specified.
Definition: handler.hpp:2720
sycl::_V1::handler::single_task
void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function.
Definition: handler.hpp:1843
kernel_bundle.hpp
sycl::_V1::item
Identifies an instance of the function object executing at each point in a range.
Definition: helpers.hpp:29
cl.h
sycl::_V1::detail::GetMergedKernelProperties< KernelType, PropertiesT, std::enable_if_t< ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< KernelType >::value > >::get_method_properties
typename ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< KernelType >::properties_t get_method_properties
Definition: handler.hpp:178
sycl::_V1::access::target::host_task
@ host_task
sycl::_V1::ext::oneapi::experimental::operator=
annotated_arg & operator=(annotated_arg &)=default
sycl::_V1::handler::is_same_type
std::is_same< remove_cv_ref_t< U >, remove_cv_ref_t< T > > is_same_type
Definition: handler.hpp:1581
sycl::_V1::handler::__SYCL_DEPRECATED
__SYCL_DEPRECATED("interop_task() is deprecated, use host_task() instead") void interop_task(FuncT Func)
Invokes a lambda on the host.
Definition: handler.hpp:1872
sycl::_V1::handler::ext_oneapi_copy2d
void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest, size_t DestPitch, size_t Width, size_t Height)
Copies data from one 2D memory region to another, both pointed by USM pointers.
Definition: handler.hpp:2630
sycl::_V1::handler::set_arg
void set_arg(int ArgIndex, accessor< DataT, Dims, AccessMode, AccessTarget, IsPlaceholder > Arg)
Definition: handler.hpp:1610
sycl::_V1::handler
Command group handler class.
Definition: handler.hpp:325
sycl::_V1::ext::oneapi::experimental::detail::empty_properties_t
properties< std::tuple<> > empty_properties_t
Definition: properties.hpp:203
sycl::_V1::detail::reduction::withAuxHandler
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1148
sycl::_V1::usm::alloc
alloc
Definition: usm_enums.hpp:14
sycl::_V1::nd_item::get_offset
id< dimensions > get_offset() const
Definition: nd_item.hpp:109
sycl::_V1::kernel_bundle< bundle_state::executable >
sycl::_V1::handler::parallel_for
void parallel_for(kernel Kernel, nd_range< Dims > NDRange, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:1964
sycl::_V1::detail::CG::StorageInitHelper
Definition: cg.hpp:80
reduction_forward.hpp
sycl::_V1::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: uniform.hpp:36
sycl::_V1::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:49
sycl::_V1::handler::__SYCL_DEPRECATED
__SYCL_DEPRECATED("run_on_host_intel() is deprecated, use host_task() instead") void run_on_host_intel(FuncT Func)
Defines and invokes a SYCL kernel on host device.
Definition: handler.hpp:1668
sycl::_V1::handler::single_task
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2075
sycl::_V1::access::target
target
Definition: access.hpp:18
sycl::_V1::read_write
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:76
kernel_execution_properties.hpp
sycl::_V1::handler::copy
void copy(accessor< T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src, IsPlaceholder_Src > Src, accessor< T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst, IsPlaceholder_Dst > Dst)
Copies the content of memory object accessed by Src to the memory object accessed by Dst.
Definition: handler.hpp:2338
sycl::_V1::ext::oneapi::experimental::detail::merged_properties_t
typename merged_properties< LHSPropertiesT, RHSPropertiesT >::type merged_properties_t
Definition: properties.hpp:221
sycl::_V1::handler::set_arg
void set_arg(int ArgIndex, local_accessor< DataT, Dims > Arg)
Definition: handler.hpp:1616
sycl::_V1::handler::parallel_for_work_group
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2174
sycl::_V1::detail::get_kernel_wrapper_name_t
Definition: handler.hpp:140
sycl::_V1::ext::oneapi::experimental::is_property_list
Definition: properties.hpp:189
sycl::_V1::ext::oneapi::experimental::get_property
static constexpr auto get_property()
Definition: annotated_arg.hpp:166
sycl::_V1::detail::queue_impl
Definition: queue_impl.hpp:61
__SYCL_KERNEL_ATTR__
#define __SYCL_KERNEL_ATTR__
Definition: handler.hpp:1218
sycl::_V1::accessor
Definition: accessor.hpp:225
sycl::_V1::detail::LocalAccessorImplPtr
std::shared_ptr< LocalAccessorImplHost > LocalAccessorImplPtr
Definition: accessor.hpp:549
sycl::_V1::detail::__pf_kernel_wrapper
Definition: handler.hpp:138
accessor.hpp
sycl::_V1::handler::fill
void fill(void *Ptr, const T &Pattern, size_t Count)
Fills the specified memory with the specified pattern.
Definition: handler.hpp:2466
sycl::_V1::detail::get_kernel_name_t::name
Name name
Definition: kernel.hpp:46
sycl::_V1::handler::parallel_for
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2114
sycl::_V1::detail::NDRDescT
Definition: cg_types.hpp:41
sycl::_V1::detail::member_ptr_helper
static Arg member_ptr_helper(RetType(Func::*)(Arg) const)
sycl::_V1::detail::RoundedRangeKernel
Definition: handler.hpp:252
sycl::_V1::detail::CG::CGTYPE
CGTYPE
Type of the command group.
Definition: cg.hpp:55
KernelFunc
std::function< void(const sycl::nd_item< NDims > &)> KernelFunc
Definition: pi_esimd_emulator.cpp:192
sycl::_V1::detail::RoundedRangeKernelWithKH
Definition: handler.hpp:270
sycl::_V1::ext::oneapi::experimental::device_image_scope_key
Definition: properties.hpp:20
sycl::_V1::handler::set_specialization_constant
void set_specialization_constant(typename std::remove_reference_t< decltype(SpecName)>::value_type Value)
Definition: handler.hpp:1516
sycl::_V1::ext::intel::experimental::cache_config_key
cache_config cache_config_key
Definition: kernel_execution_properties.hpp:31
kernel_handler.hpp
sycl::_V1::detail::code_location
Definition: common.hpp:66
sycl::_V1::handler::fill
void fill(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT > Dst, const T &Pattern)
Fills memory pointed by accessor with the pattern given.
Definition: handler.hpp:2421
sycl::_V1::handler::parallel_for
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(nd_range< Dims > Range, RestT &&...Rest)
Definition: handler.hpp:2155
sycl::_V1::handler::parallel_for
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2104
sycl::_V1::local_accessor
Definition: multi_ptr.hpp:68
std
Definition: accessor.hpp:3914
sycl::_V1::detail::getDeviceFromHandler
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:15
sycl::_V1::handler::memcpy
void memcpy(ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, const void *Src, size_t NumBytes=sizeof(T), size_t DestOffset=0)
Copies data from a USM memory region to a device_global.
Definition: handler.hpp:2755
device_global.hpp
sycl::_V1::detail::AreAllButLastReductions
Predicate returning true if all template type parameters except the last one are reductions.
Definition: reduction.hpp:43
sycl::_V1::handler::copy
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, T_Dst *Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2252
sampler.hpp
sycl::_V1::handler::copy
void copy(const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, std::remove_all_extents_t< T > *Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a device_global to a USM memory region.
Definition: handler.hpp:2840
sycl::_V1::handler::copy
void copy(const T_Src *Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of memory pointed by Src into the memory object accessed by Dst.
Definition: handler.hpp:2294
sycl::_V1::detail::getDelinearizedId
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:323
sycl::_V1::handler::update_host
void update_host(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder > Acc)
Provides guarantees that the memory object accessed via Acc is updated on the host after command grou...
Definition: handler.hpp:2391
sycl::_V1::handler::copy
void copy(std::shared_ptr< T_Src > Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of memory pointed by Src into the memory object accessed by Dst.
Definition: handler.hpp:2225
sycl::_V1::detail::KernelInfo
Definition: kernel_desc.hpp:77
sycl::_V1::detail::getLinearIndex
size_t getLinearIndex(const T< Dims > &Index, const U< Dims > &Range)
Definition: common.hpp:435
sycl::_V1::ext::oneapi::experimental::detail::properties_t
properties< std::tuple< PropertyValueTs... > > properties_t
Definition: properties.hpp:208
sycl::_V1::handler::parallel_for
void parallel_for(range< 2 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:1789
__fill
Definition: handler.hpp:63
sycl::_V1::ext::intel::experimental::pipe
Definition: pipes.hpp:46
nd_item.hpp
property.hpp
_KERNELFUNCPARAMTYPE
#define _KERNELFUNCPARAMTYPE
Definition: handler.hpp:54
sycl::_V1::ext::intel::experimental::large_data
constexpr cache_config_enum large_data
Definition: kernel_execution_properties.hpp:23
sycl::_V1::ext::intel::experimental::large_slm
constexpr cache_config_enum large_slm
Definition: kernel_execution_properties.hpp:21
sycl::_V1::handler::ShouldEnableSetArg
Definition: handler.hpp:1583
sycl::_V1::detail::isDeviceGlobalUsedInKernel
bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr)
Definition: handler.cpp:35
sycl::_V1::handler::parallel_for_work_group
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-...
Definition: handler.hpp:1758
sycl::_V1::nd_item::get_global_range
range< dimensions > get_global_range() const
Definition: nd_item.hpp:96
properties.hpp
sycl::_V1::handler::parallel_for
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2084
std::cout
__SYCL_EXTERN_STREAM_ATTRS ostream cout
Linked to standard output.
sycl::_V1::handler::remove_cv_ref_t
typename std::remove_cv_t< std::remove_reference_t< T > > remove_cv_ref_t
Definition: handler.hpp:1578
sycl::_V1::AccessMode
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
Definition: accessor.hpp:3059
sycl::_V1::detail::argument_helper
SuggestedArgType argument_helper(...)
sycl::_V1::interop_handle
Definition: interop_handle.hpp:36
__usmfill2d
Definition: handler.hpp:66
sycl::_V1::ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod
Definition: properties.hpp:173
sycl::_V1::ext::oneapi::experimental::device_global
Definition: device_global.hpp:105
sycl::_V1::detail::reduction::strategy
strategy
Definition: reduction_forward.hpp:25
sycl::_V1::detail::RoundedRangeKernel::RoundedRangeKernel
RoundedRangeKernel(range< Dims > NumWorkItems, KernelType KernelFunc)
Definition: handler.hpp:254
sycl::_V1::handler::parallel_for
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< Dims > Range, PropertiesT Properties, RestT &&...Rest)
Reductions.
Definition: handler.hpp:2127
__SYCL2020_DEPRECATED
#define __SYCL2020_DEPRECATED(message)
Definition: defines_elementary.hpp:57
sycl::_V1::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: common.hpp:302
usm_pointer_info.hpp
__copyPtr2Acc
Definition: handler.hpp:77
sycl::_V1::context
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:41
sycl::_V1::detail::checkValueRange
std::enable_if_t< std::is_same_v< T, nd_range< Dims > > > checkValueRange(const T &V)
Definition: handler.hpp:239
sycl::_V1::detail::AccessorBaseHost
Definition: accessor.hpp:496
sycl::_V1::detail::GetMergedKernelProperties::type
PropertiesT type
Definition: handler.hpp:169