1 //==-------- handler.hpp --- SYCL command group handler --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <sycl/access/access.hpp>
12 #include <sycl/accessor.hpp>
13 #include <sycl/context.hpp>
14 #include <sycl/detail/cg_types.hpp>
15 #include <sycl/detail/common.hpp>
17 #include <sycl/detail/export.hpp>
20 #include <sycl/detail/pi.h>
21 #include <sycl/detail/pi.hpp>
23 #include <sycl/detail/string.hpp>
25 #include <sycl/device.hpp>
26 #include <sycl/event.hpp>
27 #include <sycl/exception.hpp>
41 #include <sycl/group.hpp>
42 #include <sycl/id.hpp>
43 #include <sycl/item.hpp>
44 #include <sycl/kernel.hpp>
45 #include <sycl/kernel_bundle.hpp>
47 #include <sycl/kernel_handler.hpp>
48 #include <sycl/nd_item.hpp>
49 #include <sycl/nd_range.hpp>
50 #include <sycl/property_list.hpp>
51 #include <sycl/range.hpp>
52 #include <sycl/sampler.hpp>
53 
54 #include <assert.h>
55 #include <functional>
56 #include <memory>
57 #include <stddef.h>
58 #include <stdint.h>
59 #include <string>
60 #include <tuple>
61 #include <type_traits>
62 #include <utility>
63 #include <vector>
64 
65 // TODO: refactor this header
66 // 41(!!!) includes of SYCL headers + 10 includes of standard headers.
67 // 3300+ lines of code
68 
69 // SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision
70 #if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
71 #define __SYCL_NONCONST_FUNCTOR__
72 #endif
73 
74 // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
75 // or const KernelType &KernelFunc
76 #ifdef __SYCL_NONCONST_FUNCTOR__
77 #define _KERNELFUNCPARAMTYPE KernelType
78 #else
79 #define _KERNELFUNCPARAMTYPE const KernelType &
80 #endif
81 #define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a
82 
83 #if defined(__SYCL_UNNAMED_LAMBDA__)
84 // We can't use nested types (e.g. struct S defined inside main() routine) to
85 // name kernels. At the same time, we have to provide a unique kernel name for
86 // sycl::fill and the only thing we can use to introduce that uniqueness (in
87 // general) is the template parameter T which might be exactly that nested type.
88 // That means we cannot support sycl::fill(void *, T&, size_t) for such types in
89 // general. However, we can do better than that when unnamed lambdas are
90 // enabled, so do it here! See also https://github.com/intel/llvm/issues/469.
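//
// Illustrative sketch (not part of the original header), assuming a device
// with USM allocations available: with unnamed lambdas enabled, even a type
// local to a function can parameterize a fill, because the kernel name
// collapses to auto_name:
//
//   int main() {
//     struct Local { int x; };                      // nested type
//     sycl::queue Q;
//     Local *Ptr = sycl::malloc_device<Local>(1, Q);
//     Q.fill(Ptr, Local{42}, 1).wait();             // named via auto_name
//     sycl::free(Ptr, Q);
//   }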
91 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
92  sycl::access::target AccessTarget,
93  sycl::access::placeholder IsPlaceholder>
94 using __fill = sycl::detail::auto_name;
95 template <typename T> using __usmfill = sycl::detail::auto_name;
96 template <typename T> using __usmfill2d = sycl::detail::auto_name;
97 template <typename T> using __usmmemcpy2d = sycl::detail::auto_name;
98 
99 template <typename T_Src, typename T_Dst, int Dims,
100  sycl::access::mode AccessMode,
101  sycl::access::placeholder IsPlaceholder>
102 using __copyAcc2Ptr = sycl::detail::auto_name;
103 
104 template <typename T_Src, typename T_Dst, int Dims,
105  sycl::access::mode AccessMode,
106  sycl::access::placeholder IsPlaceholder>
107 using __copyPtr2Acc = sycl::detail::auto_name;
108 
109 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
110  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
111  sycl::access::mode AccessMode_Dst,
112  sycl::access::target AccessTarget_Dst,
113  sycl::access::placeholder IsPlaceholder_Src,
114  sycl::access::placeholder IsPlaceholder_Dst>
115 using __copyAcc2Acc = sycl::detail::auto_name;
116 #else
117 // Limited fallback path for when unnamed lambdas aren't available. Cannot
118 // handle nested types.
119 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
120  sycl::access::target AccessTarget,
121  sycl::access::placeholder IsPlaceholder>
122 class __fill;
123 template <typename T> class __usmfill;
124 template <typename T> class __usmfill2d;
125 template <typename T> class __usmmemcpy2d;
126 
127 template <typename T_Src, typename T_Dst, int Dims,
128  sycl::access::mode AccessMode,
129  sycl::access::placeholder IsPlaceholder>
130 class __copyAcc2Ptr;
131 
132 template <typename T_Src, typename T_Dst, int Dims,
133  sycl::access::mode AccessMode,
134  sycl::access::placeholder IsPlaceholder>
135 class __copyPtr2Acc;
136 
137 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
138  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
139  sycl::access::mode AccessMode_Dst,
140  sycl::access::target AccessTarget_Dst,
141  sycl::access::placeholder IsPlaceholder_Src,
142  sycl::access::placeholder IsPlaceholder_Dst>
143 class __copyAcc2Acc;
144 #endif
145 
146 // For unit testing purposes
147 class MockHandler;
148 
149 namespace sycl {
150 inline namespace _V1 {
151 
152 // Forward declaration
153 
154 class handler;
155 template <typename T, int Dimensions, typename AllocatorT, typename Enable>
156 class buffer;
157 
158 namespace ext::intel::experimental {
159 template <class _name, class _dataT, int32_t _min_capacity, class _propertiesT,
160  class>
161 class pipe;
162 }
163 
164 namespace ext::oneapi::experimental {
165 struct image_descriptor;
166 } // namespace ext::oneapi::experimental
167 
168 namespace ext::oneapi::experimental::detail {
169 class graph_impl;
170 } // namespace ext::oneapi::experimental::detail
171 namespace detail {
172 
173 class handler_impl;
174 class kernel_impl;
175 class queue_impl;
176 class stream_impl;
177 class event_impl;
178 template <typename DataT, int Dimensions, access::mode AccessMode,
179  access::target AccessTarget, access::placeholder IsPlaceholder>
180 class image_accessor;
181 class HandlerAccess;
182 class HostTask;
183 
184 using EventImplPtr = std::shared_ptr<event_impl>;
185 
186 template <typename RetType, typename Func, typename Arg>
187 static Arg member_ptr_helper(RetType (Func::*)(Arg) const);
188 
189 // Non-const version of the above template to match functors whose 'operator()'
190 // is declared w/o the 'const' qualifier.
191 template <typename RetType, typename Func, typename Arg>
192 static Arg member_ptr_helper(RetType (Func::*)(Arg));
193 
194 // Version with two arguments to handle the case when kernel_handler is passed
195 // to a lambda
196 template <typename RetType, typename Func, typename Arg1, typename Arg2>
197 static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const);
198 
199 // Non-const version of the above template to match functors whose 'operator()'
200 // is declared w/o the 'const' qualifier.
201 template <typename RetType, typename Func, typename Arg1, typename Arg2>
202 static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2));
203 
204 template <typename F, typename SuggestedArgType>
205 decltype(member_ptr_helper(&F::operator())) argument_helper(int);
206 
207 template <typename F, typename SuggestedArgType>
208 SuggestedArgType argument_helper(...);
209 
210 template <typename F, typename SuggestedArgType>
211 using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
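// For example (illustrative, not in the original source):
//   auto L = [](sycl::item<1> It) { (void)It; };
//   static_assert(std::is_same_v<lambda_arg_type<decltype(L), sycl::item<1>>,
//                                sycl::item<1>>);
// For a generic lambda ([](auto It) { ... }) &F::operator() cannot be formed,
// so overload resolution falls through to argument_helper(...) and the
// SuggestedArgType fallback is used instead.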
212 
213 // Used when parallel_for range is rounded-up.
214 template <typename Name> class __pf_kernel_wrapper;
215 
216 template <typename Type> struct get_kernel_wrapper_name_t {
217  using name = __pf_kernel_wrapper<Type>;
218 };
219 
220 __SYCL_EXPORT device getDeviceFromHandler(handler &);
221 
222 // Checks if a device_global has any registered kernel usage.
223 __SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr);
224 
225 // Extracts a pointer to the value inside a dynamic parameter
226 __SYCL_EXPORT void *getValueFromDynamicParameter(
227  ext::oneapi::experimental::detail::dynamic_parameter_base
228  &DynamicParamBase);
229 
230 #if __SYCL_ID_QUERIES_FIT_IN_INT__
231 template <typename T> struct NotIntMsg;
232 
233 template <int Dims> struct NotIntMsg<range<Dims>> {
234  constexpr static const char *Msg =
235  "Provided range is out of integer limits. Pass "
236  "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
237 };
238 
239 template <int Dims> struct NotIntMsg<id<Dims>> {
240  constexpr static const char *Msg =
241  "Provided offset is out of integer limits. Pass "
242  "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
243 };
244 #endif
245 
246 // Helper for merging properties with ones defined in an optional kernel functor
247 // getter.
248 template <typename KernelType, typename PropertiesT, typename Cond = void>
249 struct GetMergedKernelProperties {
250  using type = PropertiesT;
251 };
252 template <typename KernelType, typename PropertiesT>
253 struct GetMergedKernelProperties<
254  KernelType, PropertiesT,
255  std::enable_if_t<ext::oneapi::experimental::detail::
256  HasKernelPropertiesGetMethod<KernelType>::value>> {
257  using get_method_properties =
258  typename ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
259  KernelType>::properties_t;
260  static_assert(
261  ext::oneapi::experimental::is_property_list<get_method_properties>::value,
262  "get(sycl::ext::oneapi::experimental::properties_tag) member in kernel "
263  "functor class must return a valid property list.");
264  using type = ext::oneapi::experimental::detail::merged_properties_t<
265  PropertiesT, get_method_properties>;
266 };
267 
268 #if __SYCL_ID_QUERIES_FIT_IN_INT__
269 template <typename T, typename ValT>
270 typename std::enable_if_t<std::is_same<ValT, size_t>::value ||
271  std::is_same<ValT, unsigned long long>::value>
272 checkValueRangeImpl(ValT V) {
273  static constexpr size_t Limit =
274  static_cast<size_t>((std::numeric_limits<int>::max)());
275  if (V > Limit)
276  throw sycl::exception(make_error_code(errc::nd_range), NotIntMsg<T>::Msg);
277 }
278 #endif
279 
280 template <int Dims, typename T>
281 typename std::enable_if_t<std::is_same_v<T, range<Dims>> ||
282  std::is_same_v<T, id<Dims>>>
283 checkValueRange(const T &V) {
284 #if __SYCL_ID_QUERIES_FIT_IN_INT__
285  for (size_t Dim = 0; Dim < Dims; ++Dim)
286  checkValueRangeImpl<T>(V[Dim]);
287 
288  {
289  unsigned long long Product = 1;
290  for (size_t Dim = 0; Dim < Dims; ++Dim) {
291  Product *= V[Dim];
292  // check value now to prevent product overflow in the end
293  checkValueRangeImpl<T>(Product);
294  }
295  }
296 #else
297  (void)V;
298 #endif
299 }
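// Worked example (illustrative): for range<2>{100000, 100000} each dimension
// fits in int, but the running product reaches 10^10 > INT_MAX (~2.1 * 10^9),
// so the product check above throws before any downstream int-based id query
// could silently overflow.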
300 
301 template <int Dims>
302 void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
303 #if __SYCL_ID_QUERIES_FIT_IN_INT__
304  checkValueRange<Dims>(R);
305  checkValueRange<Dims>(O);
306 
307  for (size_t Dim = 0; Dim < Dims; ++Dim) {
308  unsigned long long Sum = R[Dim] + O[Dim];
309 
310  checkValueRangeImpl<range<Dims>>(Sum);
311  }
312 #else
313  (void)R;
314  (void)O;
315 #endif
316 }
317 
318 template <int Dims, typename T>
319 typename std::enable_if_t<std::is_same_v<T, nd_range<Dims>>>
320 checkValueRange(const T &V) {
321 #if __SYCL_ID_QUERIES_FIT_IN_INT__
322  checkValueRange<Dims>(V.get_global_range());
323  checkValueRange<Dims>(V.get_local_range());
324  checkValueRange<Dims>(V.get_offset());
325 
326  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
327 #else
328  (void)V;
329 #endif
330 }
331 
332 template <int Dims> class RoundedRangeIDGenerator {
333  id<Dims> Id;
334  id<Dims> InitId;
335  range<Dims> UserRange;
336  range<Dims> RoundedRange;
337  bool Done = false;
338 
339 public:
340  RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
341  const range<Dims> &RoundedRange)
342  : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
343  for (int i = 0; i < Dims; ++i)
344  if (Id[i] >= UserRange[i])
345  Done = true;
346  }
347 
348  explicit operator bool() { return !Done; }
349 
350  void updateId() {
351  for (int i = 0; i < Dims; ++i) {
352  Id[i] += RoundedRange[i];
353  if (Id[i] < UserRange[i])
354  return;
355  Id[i] = InitId[i];
356  }
357  Done = true;
358  }
359 
360  id<Dims> getId() { return Id; }
361 
362  template <typename KernelType> auto getItem() {
363  if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
364  std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
365  return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
366  else {
367  static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
368  std::is_invocable_v<KernelType, item<Dims, false> &,
369  kernel_handler>,
370  "Kernel must be invocable with an item!");
371  return detail::Builder::createItem<Dims, false>(UserRange, getId());
372  }
373  }
374 };
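// Illustrative walk (not in the original source): with UserRange {100} and
// RoundedRange {32}, the work-item that starts at id 3 yields 3, 35, 67 and
// 99, then stops; each work-item of the rounded range thus strides through
// the user range. An initial id past the user range (e.g. 101) starts with
// Done == true and executes nothing.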
375 
376 // TODO: The wrappers can be optimized further so that the body
377 // essentially looks like this:
378 // for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
379 // for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
380 // for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
381 // KernelFunc({x,y,z});
382 template <typename TransformedArgType, int Dims, typename KernelType>
383 class RoundedRangeKernel {
384 public:
385  range<Dims> UserRange;
386  KernelType KernelFunc;
387  void operator()(item<Dims> It) const {
388  auto RoundedRange = It.get_range();
389  for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
390  Gen.updateId()) {
391  auto item = Gen.template getItem<KernelType>();
392  KernelFunc(item);
393  }
394  }
395 };
396 
397 template <typename TransformedArgType, int Dims, typename KernelType>
398 class RoundedRangeKernelWithKH {
399 public:
400  range<Dims> UserRange;
401  KernelType KernelFunc;
402  void operator()(item<Dims> It, kernel_handler KH) const {
403  auto RoundedRange = It.get_range();
404  for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
405  Gen.updateId()) {
406  auto item = Gen.template getItem<KernelType>();
407  KernelFunc(item, KH);
408  }
409  }
410 };
411 
412 using std::enable_if_t;
413 using sycl::detail::queue_impl;
414 
415 // Computes dst = x*y. Returns true if the multiplication overflowed in T;
416 // otherwise returns false.
417 template <typename T>
418 static std::enable_if_t<std::is_unsigned_v<T>, bool>
419 multiply_with_overflow_check(T &dst, T x, T y) {
420  dst = x * y;
421  return (y != 0) && (x > (std::numeric_limits<T>::max)() / y);
422 }
423 
424 template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
425  size_t acc = 1;
426  for (int i = 0; i < Dims; ++i) {
427  bool did_overflow = multiply_with_overflow_check(acc, acc, r[i]);
428  if (did_overflow)
429  return false;
430  }
431  return true;
432 }
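// Example (illustrative, 64-bit size_t assumed): range<3>{1ull << 21,
// 1ull << 21, 1ull << 21} holds 2^63 work-items and passes, while
// range<3>{1ull << 22, 1ull << 21, 1ull << 21} would hold 2^64 work-items,
// so the overflow check above trips and the range must be rejected.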
433 } // namespace detail
434 
468 class __SYCL_EXPORT handler {
469 private:
475  handler(std::shared_ptr<detail::queue_impl> Queue, bool CallerNeedsEvent);
476 
487  handler(std::shared_ptr<detail::queue_impl> Queue,
488  std::shared_ptr<detail::queue_impl> PrimaryQueue,
489  std::shared_ptr<detail::queue_impl> SecondaryQueue,
490  bool CallerNeedsEvent);
491 
498  handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
499 
500  void *storeRawArg(const void *Ptr, size_t Size);
501 
502  void *
503  storeRawArg(const sycl::ext::oneapi::experimental::raw_kernel_arg &RKA) {
504  return storeRawArg(RKA.MArgData, RKA.MArgSize);
505  }
506 
508  template <typename T> void *storePlainArg(T &&Arg) {
509  return storeRawArg(&Arg, sizeof(T));
510  }
511 
512  void setType(detail::CGType Type);
513 
514  detail::CGType getType() const;
515 
516  void throwIfActionIsCreated() {
517  if (detail::CGType::None != getType())
518  throw sycl::exception(make_error_code(errc::runtime),
519  "Attempt to set multiple actions for the "
520  "command group. Command group must consist of "
521  "a single kernel or explicit memory operation.");
522  }
523 
524  constexpr static int AccessTargetMask = 0x7ff;
528  template <typename KernelName, typename KernelType>
529  void throwOnLocalAccessorMisuse() const {
530  using NameT =
531  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
532  using KI = sycl::detail::KernelInfo<NameT>;
533 
534  auto *KernelArgs = &KI::getParamDesc(0);
535 
536  for (unsigned I = 0; I < KI::getNumParams(); ++I) {
537  const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
538  const access::target AccTarget =
539  static_cast<access::target>(KernelArgs[I].info & AccessTargetMask);
540  if ((Kind == detail::kernel_param_kind_t::kind_accessor) &&
541  (AccTarget == target::local))
542  throw sycl::exception(
543  make_error_code(errc::kernel_argument),
544  "A local accessor must not be used in a SYCL kernel function "
545  "that is invoked via single_task or via the simple form of "
546  "parallel_for that takes a range parameter.");
547  }
548  }
549 
552  void
553  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
554  const detail::kernel_param_desc_t *KernelArgs,
555  bool IsESIMD);
556 
558  void extractArgsAndReqs();
559 
560  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
561  const int Size, const size_t Index, size_t &IndexShift,
562  bool IsKernelCreatedFromSource, bool IsESIMD);
563 
565  detail::string getKernelName();
566 
567  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
568  // TODO: It is unclear whether a kernel and a lambda/functor must be equal
569  // for parallel_for with sycl::kernel and a lambda/functor together.
570  // For now, if they are equal we extract arguments from the lambda/functor
571  // for the kernel. Otherwise set_arg(s) must be used to resolve the order
572  // and values of the kernel arguments.
573  assert(MKernel && "MKernel is not initialized");
574  const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
575  detail::string KernelName = getKernelName();
576  return KernelName == LambdaName;
577  }
578 
581  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
582 
589  event finalize();
590 
599  event finalize(bool CallerNeedsEvent);
600 
606  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
607  MStreamStorage.push_back(Stream);
608  }
609 
615  void addReduction(const std::shared_ptr<const void> &ReduObj);
616 
622  template <typename T, int Dimensions, typename AllocatorT, typename Enable>
623  void
624  addReduction(const std::shared_ptr<buffer<T, Dimensions, AllocatorT, Enable>>
625  &ReduBuf) {
627  addReduction(std::shared_ptr<const void>(ReduBuf));
628  }
629 
630  ~handler() = default;
631 
632 #ifdef __SYCL_DEVICE_ONLY__
633  // In device compilation, accessor isn't inherited from the host base
634  // classes, so it can't be detected through them. Since we don't expect
635  // this to ever be called in device execution, just use a blind void *.
636  void associateWithHandler(void *AccBase, access::target AccTarget);
637  void associateWithHandler(void *AccBase, image_target AccTarget);
638 #else
639  void associateWithHandlerCommon(detail::AccessorImplPtr AccImpl,
640  int AccTarget);
641  void associateWithHandler(detail::AccessorBaseHost *AccBase,
642  access::target AccTarget);
643  void associateWithHandler(detail::UnsampledImageAccessorBaseHost *AccBase,
644  image_target AccTarget);
645  void associateWithHandler(detail::SampledImageAccessorBaseHost *AccBase,
646  image_target AccTarget);
647 #endif
648 
649  // Recursively calls itself until arguments pack is fully processed.
650  // The version for a regular (standard-layout) argument.
651  template <typename T, typename... Ts>
652  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
653  set_arg(ArgIndex, std::move(Arg));
654  setArgsHelper(++ArgIndex, std::move(Args)...);
655  }
656 
657  void setArgsHelper(int) {}
658 
659  void setLocalAccessorArgHelper(int ArgIndex,
660  detail::LocalAccessorBaseHost &LocalAccBase) {
661  detail::LocalAccessorImplPtr LocalAccImpl =
662  detail::getSyclObjImpl(LocalAccBase);
663  detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
664  MLocalAccStorage.push_back(std::move(LocalAccImpl));
665  addArg(detail::kernel_param_kind_t::kind_accessor, Req,
666  static_cast<int>(access::target::local), ArgIndex);
667  }
668 
669  // setArgHelper for local accessor argument (legacy accessor interface)
670  template <typename DataT, int Dims, access::mode AccessMode,
671  access::placeholder IsPlaceholder>
672  void setArgHelper(int ArgIndex,
673  accessor<DataT, Dims, AccessMode, access::target::local,
674  IsPlaceholder> &&Arg) {
675  (void)ArgIndex;
676  (void)Arg;
677 #ifndef __SYCL_DEVICE_ONLY__
678  setLocalAccessorArgHelper(ArgIndex, Arg);
679 #endif
680  }
681 
682  // setArgHelper for local accessor argument (up to date accessor interface)
683  template <typename DataT, int Dims>
684  void setArgHelper(int ArgIndex, local_accessor<DataT, Dims> &&Arg) {
685  (void)ArgIndex;
686  (void)Arg;
687 #ifndef __SYCL_DEVICE_ONLY__
688  setLocalAccessorArgHelper(ArgIndex, Arg);
689 #endif
690  }
691 
692  // setArgHelper for non local accessor argument.
693  template <typename DataT, int Dims, access::mode AccessMode,
694  access::target AccessTarget, access::placeholder IsPlaceholder>
695  typename std::enable_if_t<AccessTarget != access::target::local, void>
696  setArgHelper(
697  int ArgIndex,
698  accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
699  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
700  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
701  detail::AccessorImplHost *Req = AccImpl.get();
702  addAccessorReq(std::move(AccImpl));
703  // Add accessor to the list of arguments.
704  addArg(detail::kernel_param_kind_t::kind_accessor, Req,
705  static_cast<int>(AccessTarget), ArgIndex);
706  }
707 
708  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
709  void *StoredArg = storePlainArg(Arg);
710 
711  if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
712  addArg(detail::kernel_param_kind_t::kind_pointer, StoredArg, sizeof(T),
713  ArgIndex);
714  } else {
715  addArg(detail::kernel_param_kind_t::kind_std_layout, StoredArg, sizeof(T),
716  ArgIndex);
717  }
718  }
719 
720  void setArgHelper(int ArgIndex, sampler &&Arg) {
721  void *StoredArg = storePlainArg(Arg);
722  addArg(detail::kernel_param_kind_t::kind_sampler, StoredArg,
723  sizeof(sampler), ArgIndex);
724  }
725 
726  // setArgHelper for graph dynamic_parameters
727  template <typename T>
728  void
729  setArgHelper(int ArgIndex,
730  ext::oneapi::experimental::dynamic_parameter<T> &DynamicParam) {
731  // Extract and copy arg so we can move it into setArgHelper
732  T ArgValue =
733  *static_cast<T *>(detail::getValueFromDynamicParameter(DynamicParam));
734  // Set the arg in the handler as normal
735  setArgHelper(ArgIndex, std::move(ArgValue));
736  // Register the dynamic parameter with the handler for later association
737  // with the node being added
738  registerDynamicParameter(DynamicParam, ArgIndex);
739  }
740 
741  // setArgHelper for the raw_kernel_arg extension type.
742  void setArgHelper(int ArgIndex,
743  sycl::ext::oneapi::experimental::raw_kernel_arg &&Arg) {
744  auto StoredArg = storeRawArg(Arg);
745  addArg(detail::kernel_param_kind_t::kind_std_layout, StoredArg,
746  Arg.MArgSize, ArgIndex);
747  }
748 
753  void registerDynamicParameter(
754  ext::oneapi::experimental::detail::dynamic_parameter_base
755  &DynamicParamBase,
756  int ArgIndex);
757 
758  /* The kernel passed to StoreLambda can take an id, an item or an nd_item
759  * as its argument. Since the esimd plugin directly invokes the kernel
760  * (doesn't use piKernelSetArg), the kernel argument type must be known to
761  * the plugin. However, passing the kernel argument type to the plugin
762  * requires changing the ABI in the HostKernel class. To overcome this
763  * problem, the helpers below wrap the "original" kernel with a functor
764  * that always takes an nd_item as argument. A functor is used instead of
765  * a lambda because extractArgsAndReqsFromLambda needs access to the
766  * "original" kernel and keeps references to its internal data, i.e. the
767  * kernel passed as argument cannot be local in scope. The functor itself
768  * is again encapsulated in a std::function since the functor's type is
769  * unknown to the plugin. */
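 /* Illustrative effect (not in the original source): a user kernel such as
  *   [](sycl::id<1> I) { ... }
  * is wrapped by ResetHostKernel below into a functor whose
  * operator()(const nd_item<1> &Arg) invokes the user kernel with
  * Arg.get_global_id(), so the plugin only ever sees the
  * std::function<void(const sycl::nd_item<1> &)> signature. */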
770 
771  // For 'id, item w/wo offset, nd_item' kernel arguments
772  template <class KernelType, class NormalizedKernelType, int Dims>
773  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
774  NormalizedKernelType NormalizedKernel(KernelFunc);
775  auto NormalizedKernelFunc =
776  std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
777  auto HostKernelPtr = new detail::HostKernel<decltype(NormalizedKernelFunc),
778  sycl::nd_item<Dims>, Dims>(
779  std::move(NormalizedKernelFunc));
780  MHostKernel.reset(HostKernelPtr);
781  return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
782  ->MKernelFunc;
783  }
784 
785  // For 'sycl::id<Dims>' kernel argument
786  template <class KernelType, typename ArgT, int Dims>
787  std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>, KernelType *>
788  ResetHostKernel(const KernelType &KernelFunc) {
789  struct NormalizedKernelType {
790  KernelType MKernelFunc;
791  NormalizedKernelType(const KernelType &KernelFunc)
792  : MKernelFunc(KernelFunc) {}
793  void operator()(const nd_item<Dims> &Arg) {
794  detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
795  }
796  };
797  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
798  KernelFunc);
799  }
800 
801  // For 'sycl::nd_item<Dims>' kernel argument
802  template <class KernelType, typename ArgT, int Dims>
803  std::enable_if_t<std::is_same_v<ArgT, sycl::nd_item<Dims>>, KernelType *>
804  ResetHostKernel(const KernelType &KernelFunc) {
805  struct NormalizedKernelType {
806  KernelType MKernelFunc;
807  NormalizedKernelType(const KernelType &KernelFunc)
808  : MKernelFunc(KernelFunc) {}
809  void operator()(const nd_item<Dims> &Arg) {
810  detail::runKernelWithArg(MKernelFunc, Arg);
811  }
812  };
813  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
814  KernelFunc);
815  }
816 
817  // For 'sycl::item<Dims, without_offset>' kernel argument
818  template <class KernelType, typename ArgT, int Dims>
819  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, false>>, KernelType *>
820  ResetHostKernel(const KernelType &KernelFunc) {
821  struct NormalizedKernelType {
822  KernelType MKernelFunc;
823  NormalizedKernelType(const KernelType &KernelFunc)
824  : MKernelFunc(KernelFunc) {}
825  void operator()(const nd_item<Dims> &Arg) {
826  sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
827  Arg.get_global_range(), Arg.get_global_id());
828  detail::runKernelWithArg(MKernelFunc, Item);
829  }
830  };
831  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
832  KernelFunc);
833  }
834 
835  // For 'sycl::item<Dims, with_offset>' kernel argument
836  template <class KernelType, typename ArgT, int Dims>
837  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, true>>, KernelType *>
838  ResetHostKernel(const KernelType &KernelFunc) {
839  struct NormalizedKernelType {
840  KernelType MKernelFunc;
841  NormalizedKernelType(const KernelType &KernelFunc)
842  : MKernelFunc(KernelFunc) {}
843  void operator()(const nd_item<Dims> &Arg) {
844  sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
845  Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
846  detail::runKernelWithArg(MKernelFunc, Item);
847  }
848  };
849  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
850  KernelFunc);
851  }
852 
853  // For 'void' kernel argument (single_task)
854  template <class KernelType, typename ArgT, int Dims>
855  typename std::enable_if_t<std::is_same_v<ArgT, void>, KernelType *>
856  ResetHostKernel(const KernelType &KernelFunc) {
857  struct NormalizedKernelType {
858  KernelType MKernelFunc;
859  NormalizedKernelType(const KernelType &KernelFunc)
860  : MKernelFunc(KernelFunc) {}
861  void operator()(const nd_item<Dims> &Arg) {
862  (void)Arg;
863  detail::runKernelWithoutArg(MKernelFunc);
864  }
865  };
866  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
867  KernelFunc);
868  }
869 
870  // For 'sycl::group<Dims>' kernel argument
871  // 'wrapper'-based approach using 'NormalizedKernelType' struct is not used
872  // for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is not
873  // supported in ESIMD.
874  template <class KernelType, typename ArgT, int Dims>
875  std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>, KernelType *>
876  ResetHostKernel(const KernelType &KernelFunc) {
877  MHostKernel.reset(
878  new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
879  return (KernelType *)(MHostKernel->getPtr());
880  }
881 
889  void verifyUsedKernelBundle(const std::string &KernelName) {
890  verifyUsedKernelBundleInternal(detail::string_view{KernelName});
891  }
892  void verifyUsedKernelBundleInternal(detail::string_view KernelName);
893 
900  template <typename KernelName, typename KernelType, int Dims,
901  typename LambdaArgType>
902  void StoreLambda(KernelType KernelFunc) {
903  using KI = detail::KernelInfo<KernelName>;
904  constexpr bool IsCallableWithKernelHandler =
905  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
906  LambdaArgType>::value;
907 
908  KernelType *KernelPtr =
909  ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);
910 
911  constexpr bool KernelHasName =
912  KI::getName() != nullptr && KI::getName()[0] != '\0';
913 
914  // Some host compilers may have different captures from Clang. Currently
915  // there is no stable way of handling this when extracting the captures, so
916  // a static assert is made to fail for incompatible kernel lambdas.
917  static_assert(
918  !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
919  "Unexpected kernel lambda size. This can be caused by an "
920  "external host compiler producing a lambda with an "
921  "unexpected layout. This is a limitation of the compiler."
922  "In many cases the difference is related to capturing constexpr "
923  "variables. In such cases removing constexpr specifier aligns the "
924  "captures between the host compiler and the device compiler."
925  "\n"
926  "In case of MSVC, passing "
927  "-fsycl-host-compiler-options='/std:c++latest' "
928  "might also help.");
929 
930  // Empty name indicates that the compilation happens without integration
931  // header, so don't perform things that require it.
932  if (KernelHasName) {
933  // TODO support ESIMD in no-integration-header case too.
934  clearArgs();
935  extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
936  KI::getNumParams(), &KI::getParamDesc(0),
937  KI::isESIMD());
938  MKernelName = KI::getName();
939  } else {
940  // In case w/o the integration header it is necessary to process
941  // accessors from the list(which are associated with this handler) as
942  // arguments. We must copy the associated accessors as they are checked
943  // later during finalize.
944  setArgsToAssociatedAccessors();
945  }
946 
947  // If the kernel lambda is callable with a kernel_handler argument, manifest
948  // the associated kernel handler.
949  if (IsCallableWithKernelHandler) {
950  getOrInsertHandlerKernelBundle(/*Insert=*/true);
951  }
952  }
953 
954  void verifyDeviceHasProgressGuarantee(
955  sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee,
956  sycl::ext::oneapi::experimental::execution_scope threadScope,
957  sycl::ext::oneapi::experimental::execution_scope coordinationScope);
958 
959  template <typename Properties>
960  void checkAndSetClusterRange(const Properties &Props) {
961  namespace syclex = sycl::ext::oneapi::experimental;
962  constexpr std::size_t ClusterDim =
963  syclex::detail::getClusterDim<Properties>();
964  if constexpr (ClusterDim > 0) {
965  auto ClusterSize = Props
966  .template get_property<
967  syclex::cuda::cluster_size_key<ClusterDim>>()
968  .get_cluster_size();
969  setKernelClusterLaunch(padRange(ClusterSize), ClusterDim);
970  }
971  }
972 
976  template <
977  typename KernelName,
978  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
979  void processProperties(PropertiesT Props) {
980  using KI = detail::KernelInfo<KernelName>;
981  static_assert(
982  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
983  "Template type is not a property list.");
984  static_assert(
985  !PropertiesT::template has_property<
986  sycl::ext::intel::experimental::fp_control_key>() ||
987  (PropertiesT::template has_property<
988  sycl::ext::intel::experimental::fp_control_key>() &&
989  KI::isESIMD()),
990  "Floating point control property is supported for ESIMD kernels only.");
991  static_assert(
992  !PropertiesT::template has_property<
993  sycl::ext::oneapi::experimental::indirectly_callable_key>(),
994  "indirectly_callable property cannot be applied to SYCL kernels");
995  if constexpr (PropertiesT::template has_property<
996  sycl::ext::intel::experimental::cache_config_key>()) {
997  auto Config = Props.template get_property<
998  sycl::ext::intel::experimental::cache_config_key>();
999  if (Config == sycl::ext::intel::experimental::large_slm) {
1000  setKernelCacheConfig(StableKernelCacheConfig::LargeSLM);
1001  } else if (Config == sycl::ext::intel::experimental::large_data) {
1002  setKernelCacheConfig(StableKernelCacheConfig::LargeData);
1003  }
1004  } else {
1005  std::ignore = Props;
1006  }
1007 
1008  constexpr bool UsesRootSync = PropertiesT::template has_property<
1009  sycl::ext::oneapi::experimental::use_root_sync_key>();
1010  setKernelIsCooperative(UsesRootSync);
1011  if constexpr (PropertiesT::template has_property<
1012  sycl::ext::oneapi::experimental::
1013  work_group_progress_key>()) {
1014  auto prop = Props.template get_property<
1015  sycl::ext::oneapi::experimental::work_group_progress_key>();
1016  verifyDeviceHasProgressGuarantee(
1017  prop.guarantee,
1018  sycl::ext::oneapi::experimental::execution_scope::work_group,
1019  prop.coordinationScope);
1020  }
1021  if constexpr (PropertiesT::template has_property<
1022  sycl::ext::oneapi::experimental::
1023  sub_group_progress_key>()) {
1024  auto prop = Props.template get_property<
1025  sycl::ext::oneapi::experimental::sub_group_progress_key>();
1026  verifyDeviceHasProgressGuarantee(
1027  prop.guarantee,
1028  sycl::ext::oneapi::experimental::execution_scope::sub_group,
1029  prop.coordinationScope);
1030  }
1031  if constexpr (PropertiesT::template has_property<
1032  sycl::ext::oneapi::experimental::
1033  work_item_progress_key>()) {
1034  auto prop = Props.template get_property<
1035  sycl::ext::oneapi::experimental::work_item_progress_key>();
1036  verifyDeviceHasProgressGuarantee(
1037  prop.guarantee,
1038  sycl::ext::oneapi::experimental::execution_scope::work_item,
1039  prop.coordinationScope);
1040  }
1041 
1042  checkAndSetClusterRange(Props);
1043  }
1044 
1049  template <int Dims_Src, int Dims_Dst>
1050  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
1051  const range<Dims_Dst> Dst) {
1052  if (Dims_Src > Dims_Dst)
1053  return false;
1054  for (size_t I = 0; I < Dims_Src; ++I)
1055  if (Src[I] > Dst[I])
1056  return false;
1057  return true;
1058  }
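 // Example (illustrative): copying from a range<2>{4, 8} accessor into a
 // range<2>{4, 16} accessor passes both checks (compatible dimensionality,
 // every source extent <= the matching destination extent), so the
 // rectangular-region copy can be used; a range<2>{4, 8} -> range<1>{64}
 // copy fails the dimensionality check and must be linearized instead.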
1059 
1065  template <typename TSrc, int DimSrc, access::mode ModeSrc,
1066  access::target TargetSrc, typename TDst, int DimDst,
1067  access::mode ModeDst, access::target TargetDst,
1068  access::placeholder IsPHSrc, access::placeholder IsPHDst>
1069  std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
1070  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
1071  accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
1072  if (IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
1073  return false;
1074 
1075  range<1> LinearizedRange(Src.size());
1076  parallel_for<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
1077  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
1078  LinearizedRange, [=](id<1> Id) {
1079  size_t Index = Id[0];
1080  id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
1081  id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
1082  Dst[DstId] = Src[SrcId];
1083  });
1084  return true;
1085  }
1086 
1094  template <typename TSrc, int DimSrc, access::mode ModeSrc,
1095  access::target TargetSrc, typename TDst, int DimDst,
1096  access::mode ModeDst, access::target TargetDst,
1097  access::placeholder IsPHSrc, access::placeholder IsPHDst>
1098  std::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
1099  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc>,
1100  accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst>) {
1101  return false;
1102  }
1103 
1104  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
1105  return AccessTarget == access::target::device ||
1106  AccessTarget == access::target::constant_buffer;
1107  }
1108 
1109  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
1110  return AccessTarget == access::target::image ||
1111  AccessTarget == access::target::image_array;
1112  }
1113 
1114  constexpr static bool
1115  isValidTargetForExplicitOp(access::target AccessTarget) {
1116  return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
1117  }
1118 
1119  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
1120  return AccessMode == access::mode::read ||
1121  AccessMode == access::mode::read_write;
1122  }
1123 
1124  constexpr static bool
1125  isValidModeForDestinationAccessor(access::mode AccessMode) {
1126  return AccessMode == access::mode::write ||
1127  AccessMode == access::mode::read_write ||
1128  AccessMode == access::mode::discard_write ||
1129  AccessMode == access::mode::discard_read_write;
1130  }
1131 
1132  // PI APIs only support select fill sizes: 1, 2, 4, 8, 16, 32, 64, 128
1133  constexpr static bool isBackendSupportedFillSize(size_t Size) {
1134  return Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16 ||
1135  Size == 32 || Size == 64 || Size == 128;
1136  }
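 // E.g. a 12-byte pattern is not in the supported set {1, 2, 4, 8, 16, 32,
 // 64, 128}, so (illustratively) a fill with such a pattern cannot use the
 // native backend fill entry point and is expected to fall back to a kernel
 // such as __usmfill above.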
1137 
1138  bool eventNeeded() const;
1139 
1140  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
1141  using type = std::conditional_t<
1142  std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
1143  std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
1144  item<Dims>, LambdaArgType>>;
1145  };
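 // For example (illustrative): a functor taking 'const sycl::item<1> &' is
 // reachable via item<1>, so type = item<1>; an argument type convertible
 // from sycl::nd_item<1> maps to nd_item<1>; any other LambdaArgType is
 // left unchanged and transported as-is.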
1146 
1147  std::optional<std::array<size_t, 3>> getMaxWorkGroups();
1148  // We need to use this version to support gcc 7.5.0. Remove when minimal
1149  // supported gcc version is bumped.
1150  std::tuple<std::array<size_t, 3>, bool> getMaxWorkGroups_v2();
1151 
1152  template <int Dims>
1153  std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange) {
1154  range<Dims> RoundedRange = UserRange;
1155  // Disable the rounding-up optimizations under these conditions:
1156  // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
1157  // 2. The kernel is provided via an interoperability method (this uses a
1158  // different code path).
1159  // 3. The range is already a multiple of the rounding factor.
1160  //
1161  // Cases 2 and 3 could be supported with extra effort.
1162  // As an optimization for the common case it is an
1163  // implementation choice to not support those scenarios.
1164  // Note that "this_item" is a free function, i.e. not tied to any
1165  // specific id or item. When concurrent parallel_fors are executing
1166  // on a device it is difficult to tell which parallel_for the call is
1167  // being made from. One could replicate portions of the
1168  // call-graph to make this_item calls kernel-specific but this is
1169  // not considered worthwhile.
1170 
1171  // Perform range rounding if rounding-up is enabled.
1172  if (this->DisableRangeRounding())
1173  return {range<Dims>{}, false};
1174 
1175  // Range should be a multiple of this for reasonable performance.
1176  size_t MinFactorX = 16;
1177  // Range should be a multiple of this for improved performance.
1178  size_t GoodFactor = 32;
1179  // Range should be at least this to make rounding worthwhile.
1180  size_t MinRangeX = 1024;
1181 
1182  // Check if rounding parameters have been set through environment:
1183  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1184  this->GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);
1185 
1186  // In SYCL, each dimension of a global range size is specified by
1187  // a size_t, which can be up to 64 bits. All backends should be
1188  // able to accept a kernel launch with a 32-bit global range size
1189  // (i.e. do not throw an error). The OpenCL CPU backend will
1190  // accept every 64-bit global range, but the GPU backends will not
1191  // generally accept every 64-bit global range. So, when we get a
1192  // non-32-bit global range, we wrap the old kernel in a new kernel
1193  // that has each work-item perform multiple invocations of the old
1194  // kernel in a 32-bit global range.
1195  id<Dims> MaxNWGs = [&] {
1196  auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2();
1197  if (!HasMaxWGs) {
1198  id<Dims> Default;
1199  for (int i = 0; i < Dims; ++i)
1200  Default[i] = (std::numeric_limits<int32_t>::max)();
1201  return Default;
1202  }
1203 
1204  id<Dims> IdResult;
1205  size_t Limit = (std::numeric_limits<int>::max)();
1206  for (int i = 0; i < Dims; ++i)
1207  IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
1208  return IdResult;
1209  }();
1210  auto M = (std::numeric_limits<uint32_t>::max)();
1211  range<Dims> MaxRange;
1212  for (int i = 0; i < Dims; ++i) {
1213  auto DesiredSize = MaxNWGs[i] * GoodFactor;
1214  MaxRange[i] =
1215  DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
1216  }
1217 
1218  bool DidAdjust = false;
1219  auto Adjust = [&](int Dim, size_t Value) {
1220  if (this->RangeRoundingTrace())
1221  std::cout << "parallel_for range adjusted at dim " << Dim << " from "
1222  << RoundedRange[Dim] << " to " << Value << std::endl;
1223  RoundedRange[Dim] = Value;
1224  DidAdjust = true;
1225  };
1226 
1227 #ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
1228  size_t GoodExpFactor = 1;
1229  switch (Dims) {
1230  case 1:
1231  GoodExpFactor = 32; // Make global range multiple of {32}
1232  break;
1233  case 2:
1234  GoodExpFactor = 16; // Make global range multiple of {16, 16}
1235  break;
1236  case 3:
1237  GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
1238  break;
1239  }
1240 
1241  // Check if rounding parameters have been set through environment:
1242  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1243  this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);
1244 
1245  for (auto i = 0; i < Dims; ++i)
1246  if (UserRange[i] % GoodExpFactor) {
1247  Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
1248  }
1249 #else
1250  // Perform range rounding if there are sufficient work-items to
1251  // need rounding and the user-specified range is not a multiple of
1252  // a "good" value.
1253  if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
1254  // It is sufficient to round up just the first dimension.
1255  // Multiplying the rounded-up value of the first dimension
1256  // by the values of the remaining dimensions (if any)
1257  // will yield a rounded-up value for the total range.
1258  Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
1259  }
1260 #endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
1261 #ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1262  // If we are forcing range rounding kernels to be used, we always want the
1263  // rounded range kernel to be generated, even if rounding isn't needed
1264  DidAdjust = true;
1265 #endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1266 
1267  for (int i = 0; i < Dims; ++i)
1268  if (RoundedRange[i] > MaxRange[i])
1269  Adjust(i, MaxRange[i]);
1270 
1271  if (!DidAdjust)
1272  return {range<Dims>{}, false};
1273  return {RoundedRange, true};
1274  }
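 // Worked example (illustrative): with the default GoodFactor of 32, a 1-D
 // UserRange of {1030} (>= MinRangeX and not a multiple of MinFactorX) is
 // rounded up to ((1030 + 31) / 32) * 32 = 1056; the RoundedRangeKernel
 // wrapper then executes only the original 1030 items, masking off the 26
 // extra work-items.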
1275 
1287  template <
1288  typename KernelName, typename KernelType, int Dims,
1289  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1290  void parallel_for_lambda_impl(range<Dims> UserRange, PropertiesT Props,
1291  KernelType KernelFunc) {
1292  throwIfActionIsCreated();
1293  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1294  if (!range_size_fits_in_size_t(UserRange))
1295  throw sycl::exception(make_error_code(errc::runtime),
1296  "The total number of work-items in "
1297  "a range must fit within size_t");
1298 
1299  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1300 
1301  // If 1D kernel argument is an integral type, convert it to sycl::item<1>
1302  // If user type is convertible from sycl::item/sycl::nd_item, use
1303  // sycl::item/sycl::nd_item to transport item information
1304  using TransformedArgType = std::conditional_t<
1305  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
1306  typename TransformUserItemType<Dims, LambdaArgType>::type>;
1307 
1308  static_assert(!std::is_same_v<TransformedArgType, sycl::nd_item<Dims>>,
1309  "Kernel argument cannot have a sycl::nd_item type in "
1310  "sycl::parallel_for with sycl::range");
1311 
1312  static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
1313  std::is_convertible_v<item<Dims, false>, LambdaArgType>,
1314  "sycl::parallel_for(sycl::range) kernel must have the "
1315  "first argument of sycl::item type, or of a type which is "
1316  "implicitly convertible from sycl::item");
1317 
1318  using RefLambdaArgType = std::add_lvalue_reference_t<LambdaArgType>;
1319  static_assert(
1320  (std::is_invocable_v<KernelType, RefLambdaArgType> ||
1321  std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
1322  "SYCL kernel lambda/functor has an unexpected signature, it should be "
1323  "invocable with sycl::item and optionally sycl::kernel_handler");
1324 
1325  // TODO: Properties may change the kernel function, so in order to avoid
1326  // conflicts they should be included in the name.
1327  using NameT =
1328  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1329 
1330  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1331 
1332  // Range rounding can be disabled by the user.
1333  // Range rounding is not done on the host device.
1334  // Range rounding is supported only for newer SYCL standards.
1335 #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \
1336  !defined(DPCPP_HOST_DEVICE_OPENMP) && \
1337  !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
1338  auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange);
1339  if (HasRoundedRange) {
1340  using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
1341  auto Wrapper =
1342  getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
1343  KernelFunc, UserRange);
1344 
1345  using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
1346  decltype(Wrapper), NameWT>;
1347 
1348  kernel_parallel_for_wrapper<KName, TransformedArgType, decltype(Wrapper),
1349  PropertiesT>(Wrapper);
1350 #ifndef __SYCL_DEVICE_ONLY__
1351  // We are executing over the rounded range, but there are still
1352  // items/ids constructed in the range-rounded kernel that use
1353  // items/ids in the user range, which means that
1354  // __SYCL_ASSUME_INT can still be violated. So check the bounds
1355  // of the user range, instead of the rounded range.
1356  detail::checkValueRange<Dims>(UserRange);
1357  setNDRangeDescriptor(RoundedRange);
1358  StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
1359  std::move(Wrapper));
1360  setType(detail::CGType::Kernel);
1361  setNDRangeUsed(false);
1362 #endif
1363  } else
1364 #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
1365  // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
1366  // SYCL_LANGUAGE_VERSION >= 202001
1367  {
1368  (void)UserRange;
1369  (void)Props;
1370 #ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1371  // If parallel_for range rounding is forced, then only the range-rounded
1372  // kernel is generated.
1373  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1374  PropertiesT>(KernelFunc);
1375 #ifndef __SYCL_DEVICE_ONLY__
1376  processProperties<NameT, PropertiesT>(Props);
1377  detail::checkValueRange<Dims>(UserRange);
1378  setNDRangeDescriptor(std::move(UserRange));
1379  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1380  std::move(KernelFunc));
1381  setType(detail::CGType::Kernel);
1382  setNDRangeUsed(false);
1383 #endif
1384 #else
1385  (void)KernelFunc;
1386 #endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1387  }
1388  }
1389 
1403  template <typename KernelName, typename KernelType, int Dims,
1404  typename PropertiesT>
1405  void parallel_for_impl(nd_range<Dims> ExecutionRange, PropertiesT Props,
1406  _KERNELFUNCPARAM(KernelFunc)) {
1407  throwIfActionIsCreated();
1408  // TODO: Properties may change the kernel function, so in order to avoid
1409  // conflicts they should be included in the name.
1410  using NameT =
1411  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1412  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1413  using LambdaArgType =
1414  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1415  static_assert(
1416  std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
1417  "Kernel argument of a sycl::parallel_for with sycl::nd_range "
1418  "must be either sycl::nd_item or be convertible from sycl::nd_item");
1419  using TransformedArgType = sycl::nd_item<Dims>;
1420 
1421  (void)ExecutionRange;
1422  (void)Props;
1423  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1424  PropertiesT>(KernelFunc);
1425 #ifndef __SYCL_DEVICE_ONLY__
1426  detail::checkValueRange<Dims>(ExecutionRange);
1427  setNDRangeDescriptor(std::move(ExecutionRange));
1428  processProperties<NameT, PropertiesT>(Props);
1429  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1430  std::move(KernelFunc));
1431  setType(detail::CGType::Kernel);
1432  setNDRangeUsed(true);
1433 #endif
1434  }
1435 
1443  template <int Dims>
1444  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
1445  throwIfActionIsCreated();
1446  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1447  detail::checkValueRange<Dims>(NumWorkItems);
1448  setNDRangeDescriptor(std::move(NumWorkItems));
1449  setType(detail::CGType::Kernel);
1450  setNDRangeUsed(false);
1451  extractArgsAndReqs();
1452  MKernelName = getKernelName();
1453  }
1454 
1465  template <
1466  typename KernelName, typename KernelType, int Dims,
1467  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1468  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1469  PropertiesT Props,
1470  _KERNELFUNCPARAM(KernelFunc)) {
1471  throwIfActionIsCreated();
1472  // TODO: Properties may change the kernel function, so in order to avoid
1473  // conflicts they should be included in the name.
1474  using NameT =
1475  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1476  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1477  using LambdaArgType =
1478  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1479  (void)NumWorkGroups;
1480  (void)Props;
1481  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1482  PropertiesT>(KernelFunc);
1483 #ifndef __SYCL_DEVICE_ONLY__
1484  processProperties<NameT, PropertiesT>(Props);
1485  detail::checkValueRange<Dims>(NumWorkGroups);
1486  setNDRangeDescriptor(NumWorkGroups, /*SetNumWorkGroups=*/true);
1487  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1488  setType(detail::CGType::Kernel);
1489  setNDRangeUsed(false);
1490 #endif // __SYCL_DEVICE_ONLY__
1491  }
1492 
1505  template <
1506  typename KernelName, typename KernelType, int Dims,
1507  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1508  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1509  range<Dims> WorkGroupSize,
1510  PropertiesT Props,
1511  _KERNELFUNCPARAM(KernelFunc)) {
1512  throwIfActionIsCreated();
1513  // TODO: Properties may change the kernel function, so in order to avoid
1514  // conflicts they should be included in the name.
1515  using NameT =
1516  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1517  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1518  using LambdaArgType =
1519  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1520  (void)NumWorkGroups;
1521  (void)WorkGroupSize;
1522  (void)Props;
1523  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1524  PropertiesT>(KernelFunc);
1525 #ifndef __SYCL_DEVICE_ONLY__
1526  processProperties<NameT, PropertiesT>(Props);
1527  nd_range<Dims> ExecRange =
1528  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1529  detail::checkValueRange<Dims>(ExecRange);
1530  setNDRangeDescriptor(std::move(ExecRange));
1531  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1532  setType(detail::CGType::Kernel);
1533 #endif // __SYCL_DEVICE_ONLY__
1534  }
1535 
1536 #ifdef SYCL_LANGUAGE_VERSION
1537 #define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
1538 #else
1539 #define __SYCL_KERNEL_ATTR__
1540 #endif
1541 
1542  // NOTE: the name of this function - "kernel_single_task" - is used by the
1543  // Front End to determine kernel invocation kind.
1544  template <typename KernelName, typename KernelType, typename... Props>
1545 #ifdef __SYCL_DEVICE_ONLY__
1546  [[__sycl_detail__::add_ir_attributes_function(
1547  "sycl-single-task",
1548  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1549  nullptr,
1550  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1551 #endif
1552  __SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
1553 #ifdef __SYCL_DEVICE_ONLY__
1554  KernelFunc();
1555 #else
1556  (void)KernelFunc;
1557 #endif
1558  }
1559 
1560  // NOTE: the name of this function - "kernel_single_task" - is used by the
1561  // Front End to determine kernel invocation kind.
1562  template <typename KernelName, typename KernelType, typename... Props>
1563 #ifdef __SYCL_DEVICE_ONLY__
1564  [[__sycl_detail__::add_ir_attributes_function(
1565  "sycl-single-task",
1566  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1567  nullptr,
1568  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1569 #endif
1570  __SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc),
1571  kernel_handler KH) {
1572 #ifdef __SYCL_DEVICE_ONLY__
1573  KernelFunc(KH);
1574 #else
1575  (void)KernelFunc;
1576  (void)KH;
1577 #endif
1578  }
1579 
1580  // NOTE: the names of these functions - "kernel_parallel_for" - are used by the
1581  // Front End to determine kernel invocation kind.
1582  template <typename KernelName, typename ElementType, typename KernelType,
1583  typename... Props>
1584 #ifdef __SYCL_DEVICE_ONLY__
1585  [[__sycl_detail__::add_ir_attributes_function(
1586  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1587  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1588 #endif
1589  __SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
1590 #ifdef __SYCL_DEVICE_ONLY__
1591  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1592 #else
1593  (void)KernelFunc;
1594 #endif
1595  }
1596 
1597  // NOTE: the names of these functions - "kernel_parallel_for" - are used by the
1598  // Front End to determine kernel invocation kind.
1599  template <typename KernelName, typename ElementType, typename KernelType,
1600  typename... Props>
1601 #ifdef __SYCL_DEVICE_ONLY__
1602  [[__sycl_detail__::add_ir_attributes_function(
1603  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1604  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1605 #endif
1606  __SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc),
1607  kernel_handler KH) {
1608 #ifdef __SYCL_DEVICE_ONLY__
1609  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1610 #else
1611  (void)KernelFunc;
1612  (void)KH;
1613 #endif
1614  }
1615 
1616  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1617  // used by the Front End to determine kernel invocation kind.
1618  template <typename KernelName, typename ElementType, typename KernelType,
1619  typename... Props>
1620 #ifdef __SYCL_DEVICE_ONLY__
1621  [[__sycl_detail__::add_ir_attributes_function(
1622  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1623  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1624 #endif
1625  __SYCL_KERNEL_ATTR__ void
1626  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc)) {
1627 #ifdef __SYCL_DEVICE_ONLY__
1628  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1629 #else
1630  (void)KernelFunc;
1631 #endif
1632  }
1633 
1634  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1635  // used by the Front End to determine kernel invocation kind.
1636  template <typename KernelName, typename ElementType, typename KernelType,
1637  typename... Props>
1638 #ifdef __SYCL_DEVICE_ONLY__
1639  [[__sycl_detail__::add_ir_attributes_function(
1640  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1641  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1642 #endif
1643  __SYCL_KERNEL_ATTR__ void
1644  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc),
1645  kernel_handler KH) {
1646 #ifdef __SYCL_DEVICE_ONLY__
1647  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1648 #else
1649  (void)KernelFunc;
1650  (void)KH;
1651 #endif
1652  }
1653 
1654  template <typename... Props> struct KernelPropertiesUnpackerImpl {
1655  // Just pass extra Props... as template parameters to the underlying
1656  // Caller->* member functions. Don't have reflection so try to use
1657  // templates as much as possible to reduce the amount of boilerplate code
1658  // needed. All the type checks are expected to be done at the Caller's
1659  // methods side.
1660 
1661  template <typename... TypesToForward, typename... ArgsTy>
1662  static void kernel_single_task_unpack(handler *h, ArgsTy... Args) {
1663  h->kernel_single_task<TypesToForward..., Props...>(Args...);
1664  }
1665 
1666  template <typename... TypesToForward, typename... ArgsTy>
1667  static void kernel_parallel_for_unpack(handler *h, ArgsTy... Args) {
1668  h->kernel_parallel_for<TypesToForward..., Props...>(Args...);
1669  }
1670 
1671  template <typename... TypesToForward, typename... ArgsTy>
1672  static void kernel_parallel_for_work_group_unpack(handler *h,
1673  ArgsTy... Args) {
1674  h->kernel_parallel_for_work_group<TypesToForward..., Props...>(Args...);
1675  }
1676  };
1677 
1678  template <typename PropertiesT>
1679  struct KernelPropertiesUnpacker : public KernelPropertiesUnpackerImpl<> {
1680  // This should always fail outside the specialization below but must be
1681  // dependent to avoid failing even if not instantiated.
1682  static_assert(
1683  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
1684  "Template type is not a property list.");
1685  };
1686 
1687  template <typename... Props>
1688  struct KernelPropertiesUnpacker<
1689  ext::oneapi::experimental::detail::properties_t<Props...>>
1690  : public KernelPropertiesUnpackerImpl<Props...> {};
1691 
1692  // Helper function to
1693  //
1694  // * Make use of the KernelPropertiesUnpacker above
1695  // * Decide if we need an extra kernel_handler parameter
1696  //
1697  // The interface uses a \p Lambda callback to propagate that information back
1698  // to the caller as we need the caller to communicate:
1699  //
1700  // * Name of the method to call
1701  // * Provide explicit template type parameters for the call
1702  //
1703  // Couldn't think of a better way to achieve both.
1704  template <typename KernelName, typename KernelType, typename PropertiesT,
1705  bool HasKernelHandlerArg, typename FuncTy>
1706  void unpack(_KERNELFUNCPARAM(KernelFunc), FuncTy Lambda) {
1707 #ifdef __SYCL_DEVICE_ONLY__
1708  detail::CheckDeviceCopyable<KernelType>();
1709 #endif // __SYCL_DEVICE_ONLY__
1710  using MergedPropertiesT =
1711  typename detail::GetMergedKernelProperties<KernelType,
1712  PropertiesT>::type;
1713  using Unpacker = KernelPropertiesUnpacker<MergedPropertiesT>;
1714 #ifndef __SYCL_DEVICE_ONLY__
1715  // If there are properties provided by get method then process them.
1716  if constexpr (ext::oneapi::experimental::detail::
1717  HasKernelPropertiesGetMethod<
1718  _KERNELFUNCPARAMTYPE>::value) {
1719  processProperties<KernelName>(
1720  KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
1721  }
1722 #endif
1723  if constexpr (HasKernelHandlerArg) {
1724  kernel_handler KH;
1725  Lambda(Unpacker{}, this, KernelFunc, KH);
1726  } else {
1727  Lambda(Unpacker{}, this, KernelFunc);
1728  }
1729  }
1730 
1731  // NOTE: to support kernel_handler argument in kernel lambdas, only
1732  // kernel_***_wrapper functions must be called in this code
1733 
1734  template <
1735  typename KernelName, typename KernelType,
1736  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1737  void kernel_single_task_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1738  unpack<KernelName, KernelType, PropertiesT,
1739  detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>(
1740  KernelFunc, [&](auto Unpacker, auto... args) {
1741  Unpacker.template kernel_single_task_unpack<KernelName, KernelType>(
1742  args...);
1743  });
1744  }
1745 
1746  template <
1747  typename KernelName, typename ElementType, typename KernelType,
1748  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1749  void kernel_parallel_for_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1750  unpack<KernelName, KernelType, PropertiesT,
1751  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1752  ElementType>::value>(
1753  KernelFunc, [&](auto Unpacker, auto... args) {
1754  Unpacker.template kernel_parallel_for_unpack<KernelName, ElementType,
1755  KernelType>(args...);
1756  });
1757  }
1758 
1759  template <
1760  typename KernelName, typename ElementType, typename KernelType,
1761  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1762  void kernel_parallel_for_work_group_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1763  unpack<KernelName, KernelType, PropertiesT,
1764  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1765  ElementType>::value>(
1766  KernelFunc, [&](auto Unpacker, auto... args) {
1767  Unpacker.template kernel_parallel_for_work_group_unpack<
1768  KernelName, ElementType, KernelType>(args...);
1769  });
1770  }
1771 
1779  template <
1780  typename KernelName, typename KernelType,
1781  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1782  void single_task_lambda_impl(PropertiesT Props,
1783  _KERNELFUNCPARAM(KernelFunc)) {
1784  (void)Props;
1785  throwIfActionIsCreated();
1786  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1787  // TODO: Properties may change the kernel function, so in order to avoid
1788  // conflicts they should be included in the name.
1789  using NameT =
1790  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1791  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1792  kernel_single_task_wrapper<NameT, KernelType, PropertiesT>(KernelFunc);
1793 #ifndef __SYCL_DEVICE_ONLY__
1794  // No need to check if range is out of INT_MAX limits as it's compile-time
1795  // known constant.
1796  setNDRangeDescriptor(range<1>{1});
1797  processProperties<NameT, PropertiesT>(Props);
1798  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1799  setType(detail::CGType::Kernel);
1800 #endif
1801  }
1802 
1803  void setStateExplicitKernelBundle();
1804  void setStateSpecConstSet();
1805  bool isStateExplicitKernelBundle() const;
1806 
1807  std::shared_ptr<detail::kernel_bundle_impl>
1808  getOrInsertHandlerKernelBundle(bool Insert) const;
1809 
1810  void setHandlerKernelBundle(kernel Kernel);
1811 
1812  void setHandlerKernelBundle(
1813  const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);
1814 
1815  void SetHostTask(std::function<void()> &&Func);
1816  void SetHostTask(std::function<void(interop_handle)> &&Func);
1817 
1818  template <typename FuncT>
1819  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1820  void()>::value ||
1821  detail::check_fn_signature<std::remove_reference_t<FuncT>,
1822  void(interop_handle)>::value>
1823  host_task_impl(FuncT &&Func) {
1824  throwIfActionIsCreated();
1825 
1826  // Need to copy these rather than move so that we can check associated
1827  // accessors during finalize
1828  setArgsToAssociatedAccessors();
1829 
1830  SetHostTask(std::move(Func));
1831  }
1832 
1833  template <typename FuncT>
1834  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1835  void(interop_handle)>::value>
1836  ext_codeplay_enqueue_native_command_impl(FuncT &&Func) {
1837  throwIfActionIsCreated();
1838 
1839  // Need to copy these rather than move so that we can check associated
1840  // accessors during finalize
1841  setArgsToAssociatedAccessors();
1842 
1843  SetHostTask(std::move(Func));
1844  setType(detail::CGType::EnqueueNativeCommand);
1845  }
1846 
1850  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
1851  getCommandGraph() const;
1852 
1859  void setUserFacingNodeType(ext::oneapi::experimental::node_type Type);
1860 
1861 public:
1862  handler(const handler &) = delete;
1863  handler(handler &&) = delete;
1864  handler &operator=(const handler &) = delete;
1865  handler &operator=(handler &&) = delete;
1866 
1867  template <auto &SpecName>
1868  void set_specialization_constant(
1869  typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {
1870 
1871  setStateSpecConstSet();
1872 
1873  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1874  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1875 
1876  detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1877  KernelBundleImplPtr)
1878  .set_specialization_constant<SpecName>(Value);
1879  }
1880 
1881  template <auto &SpecName>
1882  typename std::remove_reference_t<decltype(SpecName)>::value_type
1883  get_specialization_constant() const {
1884 
1885  if (isStateExplicitKernelBundle())
1886  throw sycl::exception(make_error_code(errc::invalid),
1887  "Specialization constants cannot be read after "
1888  "explicitly setting the used kernel bundle");
1889 
1890  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1891  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1892 
1893  return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1894  KernelBundleImplPtr)
1895  .get_specialization_constant<SpecName>();
1896  }
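  // Illustrative usage sketch (editorial, not part of the original header):
  // setting and then reading a specialization constant from a command group.
  // `SpecConst` (a namespace-scope sycl::specialization_id<int>) and `Queue`
  // are assumptions made for the example.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.set_specialization_constant<SpecConst>(7);
  //     CGH.single_task([=](sycl::kernel_handler KH) {
  //       int V = KH.get_specialization_constant<SpecConst>();
  //       (void)V; // use the specialized value in the kernel body
  //     });
  //   });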
1897 
1898  void
1899  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
1900 
1909  template <typename DataT, int Dims, access::mode AccMode,
1910  access::target AccTarget, access::placeholder isPlaceholder>
1911  void require(accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder> Acc) {
1912  if (Acc.is_placeholder())
1913  associateWithHandler(&Acc, AccTarget);
1914  }
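  // Illustrative usage sketch (editorial): binding a placeholder accessor
  // with require() before its first use. `Buf` and `Queue` are assumptions.
  //
  //   sycl::accessor<int, 1, sycl::access::mode::read_write,
  //                  sycl::access::target::device,
  //                  sycl::access::placeholder::true_t>
  //       PAcc(Buf);
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.require(PAcc); // bind the placeholder to this command group
  //     CGH.single_task([=] { PAcc[0] += 1; });
  //   });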
1915 
1925  template <typename DataT, int Dims, access::mode AccMode,
1926  access::target AccTarget, access::placeholder isPlaceholder>
1927  void require(ext::oneapi::experimental::dynamic_parameter<
1928  accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder>>
1929  dynamicParamAcc) {
1930  using AccT = accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder>;
1931  AccT Acc = *static_cast<AccT *>(
1932  detail::getValueFromDynamicParameter(dynamicParamAcc));
1933  if (Acc.is_placeholder())
1934  associateWithHandler(&Acc, AccTarget);
1935  }
1936 
1940  void depends_on(event Event);
1941 
1945  void depends_on(const std::vector<event> &Events);
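  // Illustrative usage sketch (editorial): explicitly ordering two command
  // groups with depends_on(). `Queue`, `Ptr`, and `Size` are assumptions.
  //
  //   sycl::event E = Queue.memset(Ptr, 0, Size);
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.depends_on(E); // run only after the memset has completed
  //     CGH.single_task([=] { /* read or write Ptr here */ });
  //   });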
1946 
1947  template <typename T>
1948  using remove_cv_ref_t = typename std::remove_cv_t<std::remove_reference_t<T>>;
1949 
1950  template <typename U, typename T>
1951  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;
1952 
1953  template <typename T> struct ShouldEnableSetArg {
1954  static constexpr bool value =
1955  std::is_trivially_copyable_v<std::remove_reference_t<T>>
1956 #if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
1957  && std::is_standard_layout<std::remove_reference_t<T>>::value
1958 #endif
1959  || is_same_type<sampler, T>::value // Sampler
1960  || (!is_same_type<cl_mem, T>::value &&
1961  std::is_pointer_v<remove_cv_ref_t<T>>) // USM
1962  || is_same_type<cl_mem, T>::value; // Interop
1963  };
1964 
1971  template <typename T>
1972  typename std::enable_if_t<ShouldEnableSetArg<T>::value, void>
1973  set_arg(int ArgIndex, T &&Arg) {
1974  setArgHelper(ArgIndex, std::move(Arg));
1975  }
1976 
1977  template <typename DataT, int Dims, access::mode AccessMode,
1978  access::target AccessTarget, access::placeholder IsPlaceholder>
1979  void
1980  set_arg(int ArgIndex,
1981  accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
1982  setArgHelper(ArgIndex, std::move(Arg));
1983  }
1984 
1985  template <typename DataT, int Dims>
1986  void set_arg(int ArgIndex, local_accessor<DataT, Dims> Arg) {
1987  setArgHelper(ArgIndex, std::move(Arg));
1988  }
1989 
1990  // set_arg for graph dynamic_parameters
1991  template <typename T>
1992  void set_arg(int argIndex,
1993  ext::oneapi::experimental::dynamic_parameter<T> &dynamicParam) {
1994  setArgHelper(argIndex, dynamicParam);
1995  }
1996 
1997  // set_arg for the raw_kernel_arg extension type.
1998  void set_arg(int argIndex, ext::oneapi::experimental::raw_kernel_arg &&Arg) {
1999  setArgHelper(argIndex, std::move(Arg));
2000  }
2001 
2007  template <typename... Ts> void set_args(Ts &&...Args) {
2008  setArgsHelper(0, std::move(Args)...);
2009  }
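  // Illustrative usage sketch (editorial): passing all arguments of an
  // interoperability kernel in declaration order. `Queue`, `Kernel` (a
  // sycl::kernel), `Acc`, and `UsmPtr` are assumptions.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.set_args(Acc, 42, UsmPtr); // equivalent to set_arg(0..2, ...)
  //     CGH.parallel_for(sycl::range<1>{64}, Kernel);
  //   });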
2010 
2018  template <typename KernelName = detail::auto_name, typename KernelType>
2019  void single_task(_KERNELFUNCPARAM(KernelFunc)) {
2020  single_task_lambda_impl<KernelName>(
2021  ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2022  }
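  // Illustrative usage sketch (editorial): the canonical single_task
  // submission; `Queue` and `Result` (a USM pointer) are assumptions.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.single_task([=] { *Result = 42; });
  //   });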
2023 
2024  template <typename KernelName = detail::auto_name, typename KernelType>
2025  void parallel_for(range<1> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2026  parallel_for_lambda_impl<KernelName>(
2027  NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2028  std::move(KernelFunc));
2029  }
2030 
2031  template <typename KernelName = detail::auto_name, typename KernelType>
2032  void parallel_for(range<2> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2033  parallel_for_lambda_impl<KernelName>(
2034  NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2035  std::move(KernelFunc));
2036  }
2037 
2038  template <typename KernelName = detail::auto_name, typename KernelType>
2039  void parallel_for(range<3> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2040  parallel_for_lambda_impl<KernelName>(
2041  NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2042  std::move(KernelFunc));
2043  }
2044 
2046  template <typename FuncT>
2047  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
2048  void()>::value ||
2049  detail::check_fn_signature<std::remove_reference_t<FuncT>,
2050  void(interop_handle)>::value>
2051  host_task(FuncT &&Func) {
2052  host_task_impl(Func);
2053  }
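  // Illustrative usage sketches (editorial): both host_task signatures;
  // `Queue` is an assumption.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.host_task([=] { /* plain host work, scheduled by the runtime */ });
  //   });
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.host_task([=](sycl::interop_handle IH) {
  //       // query native backend objects through IH here
  //     });
  //   });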
2054 
2056  template <typename FuncT>
2057  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
2058  void(interop_handle)>::value>
2059  ext_codeplay_enqueue_native_command(FuncT &&Func) {
2060  throwIfGraphAssociated<
2061  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
2062  sycl_ext_codeplay_enqueue_native_command>();
2063  ext_codeplay_enqueue_native_command_impl(Func);
2064  }
2065 
2079  template <typename KernelName = detail::auto_name, typename KernelType,
2080  int Dims>
2081  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
2082  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2083  _KERNELFUNCPARAM(KernelFunc)) {
2084  throwIfActionIsCreated();
2085  using NameT =
2086  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2087  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2088  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2089  using TransformedArgType = std::conditional_t<
2090  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
2091  typename TransformUserItemType<Dims, LambdaArgType>::type>;
2092  (void)NumWorkItems;
2093  (void)WorkItemOffset;
2094  kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
2095 #ifndef __SYCL_DEVICE_ONLY__
2096  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2097  setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset));
2098  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
2099  std::move(KernelFunc));
2100  setType(detail::CGType::Kernel);
2101  setNDRangeUsed(false);
2102 #endif
2103  }
2104 
2115  template <typename KernelName = detail::auto_name, typename KernelType,
2116  int Dims>
2117  void parallel_for_work_group(range<Dims> NumWorkGroups,
2118  _KERNELFUNCPARAM(KernelFunc)) {
2119  parallel_for_work_group_lambda_impl<KernelName>(
2120  NumWorkGroups, ext::oneapi::experimental::empty_properties_t{},
2121  KernelFunc);
2122  }
2123 
2136  template <typename KernelName = detail::auto_name, typename KernelType,
2137  int Dims>
2138  void parallel_for_work_group(range<Dims> NumWorkGroups,
2139  range<Dims> WorkGroupSize,
2140  _KERNELFUNCPARAM(KernelFunc)) {
2141  parallel_for_work_group_lambda_impl<KernelName>(
2142  NumWorkGroups, WorkGroupSize,
2143  ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2144  }
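  // Illustrative usage sketch (editorial): hierarchical parallelism with an
  // explicit work-group size; `Queue` is an assumption.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group(
  //         sycl::range<1>{8}, sycl::range<1>{64}, [=](sycl::group<1> G) {
  //           // executed once per work-group
  //           G.parallel_for_work_item([&](sycl::h_item<1> Item) {
  //             // executed once per work-item in the group
  //           });
  //         });
  //   });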
2145 
2152  void single_task(kernel Kernel) {
2153  throwIfActionIsCreated();
2154  // Ignore any set kernel bundles and use the one associated with the kernel
2155  setHandlerKernelBundle(Kernel);
2156  // No need to check if range is out of INT_MAX limits as it's compile-time
2157  // known constant
2158  setNDRangeDescriptor(range<1>{1});
2159  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2160  setType(detail::CGType::Kernel);
2161  extractArgsAndReqs();
2162  MKernelName = getKernelName();
2163  }
2164 
2165  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
2166  parallel_for_impl(NumWorkItems, Kernel);
2167  }
2168 
2169  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
2170  parallel_for_impl(NumWorkItems, Kernel);
2171  }
2172 
2173  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
2174  parallel_for_impl(NumWorkItems, Kernel);
2175  }
2176 
2185  template <int Dims>
2186  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2187  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2188  kernel Kernel) {
2189  throwIfActionIsCreated();
2190  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2191  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2192  setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset));
2193  setType(detail::CGType::Kernel);
2194  setNDRangeUsed(false);
2195  extractArgsAndReqs();
2196  MKernelName = getKernelName();
2197  }
2198 
2207  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
2208  throwIfActionIsCreated();
2209  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2210  detail::checkValueRange<Dims>(NDRange);
2211  setNDRangeDescriptor(std::move(NDRange));
2212  setType(detail::CGType::Kernel);
2213  setNDRangeUsed(true);
2214  extractArgsAndReqs();
2215  MKernelName = getKernelName();
2216  }
2217 
2224  template <typename KernelName = detail::auto_name, typename KernelType>
2225  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
2226  throwIfActionIsCreated();
2227  // Ignore any set kernel bundles and use the one associated with the kernel
2228  setHandlerKernelBundle(Kernel);
2229  using NameT =
2230  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2231  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2232  (void)Kernel;
2233  kernel_single_task<NameT>(KernelFunc);
2234 #ifndef __SYCL_DEVICE_ONLY__
2235  // No need to check if range is out of INT_MAX limits as it's compile-time
2236  // known constant
2237  setNDRangeDescriptor(range<1>{1});
2238  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2239  setType(detail::CGType::Kernel);
2240  if (!lambdaAndKernelHaveEqualName<NameT>()) {
2241  extractArgsAndReqs();
2242  MKernelName = getKernelName();
2243  } else
2244  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
2245 #else
2246  detail::CheckDeviceCopyable<KernelType>();
2247 #endif
2248  }
2249 
2257  template <typename KernelName = detail::auto_name, typename KernelType,
2258  int Dims>
2259  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2260  _KERNELFUNCPARAM(KernelFunc)) {
2261  throwIfActionIsCreated();
2262  // Ignore any set kernel bundles and use the one associated with the kernel
2263  setHandlerKernelBundle(Kernel);
2264  using NameT =
2265  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2266  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2267  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2268  (void)Kernel;
2269  (void)NumWorkItems;
2270  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2271 #ifndef __SYCL_DEVICE_ONLY__
2272  detail::checkValueRange<Dims>(NumWorkItems);
2273  setNDRangeDescriptor(std::move(NumWorkItems));
2274  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2275  setType(detail::CGType::Kernel);
2276  setNDRangeUsed(false);
2277  if (!lambdaAndKernelHaveEqualName<NameT>()) {
2278  extractArgsAndReqs();
2279  MKernelName = getKernelName();
2280  } else
2281  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2282  std::move(KernelFunc));
2283 #endif
2284  }
2285 
2295  template <typename KernelName = detail::auto_name, typename KernelType,
2296  int Dims>
2297  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2298  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2299  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2300  throwIfActionIsCreated();
2301  // Ignore any set kernel bundles and use the one associated with the kernel
2302  setHandlerKernelBundle(Kernel);
2303  using NameT =
2304  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2305  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2306  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2307  (void)Kernel;
2308  (void)NumWorkItems;
2309  (void)WorkItemOffset;
2310  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2311 #ifndef __SYCL_DEVICE_ONLY__
2312  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2313  setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset));
2314  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2315  setType(detail::CGType::Kernel);
2316  setNDRangeUsed(false);
2317  if (!lambdaAndKernelHaveEqualName<NameT>()) {
2318  extractArgsAndReqs();
2319  MKernelName = getKernelName();
2320  } else
2321  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2322  std::move(KernelFunc));
2323 #endif
2324  }
2325 
2335  template <typename KernelName = detail::auto_name, typename KernelType,
2336  int Dims>
2337  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2338  _KERNELFUNCPARAM(KernelFunc)) {
2339  throwIfActionIsCreated();
2340  // Ignore any set kernel bundles and use the one associated with the kernel
2341  setHandlerKernelBundle(Kernel);
2342  using NameT =
2343  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2344  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2345  using LambdaArgType =
2346  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2347  (void)Kernel;
2348  (void)NDRange;
2349  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2350 #ifndef __SYCL_DEVICE_ONLY__
2351  detail::checkValueRange<Dims>(NDRange);
2352  setNDRangeDescriptor(std::move(NDRange));
2353  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2354  setType(detail::CGType::Kernel);
2355  setNDRangeUsed(true);
2356  if (!lambdaAndKernelHaveEqualName<NameT>()) {
2357  extractArgsAndReqs();
2358  MKernelName = getKernelName();
2359  } else
2360  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2361  std::move(KernelFunc));
2362 #endif
2363  }
2364 
2378  template <typename KernelName = detail::auto_name, typename KernelType,
2379  int Dims>
2380  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2381  _KERNELFUNCPARAM(KernelFunc)) {
2382  throwIfActionIsCreated();
2383  // Ignore any set kernel bundles and use the one associated with the kernel
2384  setHandlerKernelBundle(Kernel);
2385  using NameT =
2386  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2387  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2388  using LambdaArgType =
2389  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2390  (void)Kernel;
2391  (void)NumWorkGroups;
2392  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2393 #ifndef __SYCL_DEVICE_ONLY__
2394  detail::checkValueRange<Dims>(NumWorkGroups);
2395  setNDRangeDescriptor(NumWorkGroups, /*SetNumWorkGroups=*/true);
2396  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2397  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2398  setType(detail::CGType::Kernel);
2399 #endif // __SYCL_DEVICE_ONLY__
2400  }
2401 
2417  template <typename KernelName = detail::auto_name, typename KernelType,
2418  int Dims>
2419  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2421  _KERNELFUNCPARAM(KernelFunc)) {
2422  throwIfActionIsCreated();
2423  // Ignore any set kernel bundles and use the one associated with the kernel
2424  setHandlerKernelBundle(Kernel);
2425  using NameT =
2426  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2427  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2428  using LambdaArgType =
2429  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2430  (void)Kernel;
2431  (void)NumWorkGroups;
2432  (void)WorkGroupSize;
2433  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2434 #ifndef __SYCL_DEVICE_ONLY__
2435  nd_range<Dims> ExecRange =
2436  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2437  detail::checkValueRange<Dims>(ExecRange);
2438  setNDRangeDescriptor(std::move(ExecRange));
2439  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2440  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2441  setType(detail::CGType::Kernel);
2442 #endif // __SYCL_DEVICE_ONLY__
2443  }
2444 
2445  template <typename KernelName = detail::auto_name, typename KernelType,
2446  typename PropertiesT>
2447  std::enable_if_t<
2448  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2449  single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) {
2450  single_task_lambda_impl<KernelName, KernelType, PropertiesT>(Props,
2451  KernelFunc);
2452  }
2453 
2454  template <typename KernelName = detail::auto_name, typename KernelType,
2455  typename PropertiesT>
2456  std::enable_if_t<
2457  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2458  parallel_for(range<1> NumWorkItems, PropertiesT Props,
2459  _KERNELFUNCPARAM(KernelFunc)) {
2460  parallel_for_lambda_impl<KernelName, KernelType, 1, PropertiesT>(
2461  NumWorkItems, Props, std::move(KernelFunc));
2462  }
2463 
2464  template <typename KernelName = detail::auto_name, typename KernelType,
2465  typename PropertiesT>
2466  std::enable_if_t<
2467  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2468  parallel_for(range<2> NumWorkItems, PropertiesT Props,
2469  _KERNELFUNCPARAM(KernelFunc)) {
2470  parallel_for_lambda_impl<KernelName, KernelType, 2, PropertiesT>(
2471  NumWorkItems, Props, std::move(KernelFunc));
2472  }
2473 
2474  template <typename KernelName = detail::auto_name, typename KernelType,
2475  typename PropertiesT>
2476  std::enable_if_t<
2477  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2478  parallel_for(range<3> NumWorkItems, PropertiesT Props,
2479  _KERNELFUNCPARAM(KernelFunc)) {
2480  parallel_for_lambda_impl<KernelName, KernelType, 3, PropertiesT>(
2481  NumWorkItems, Props, std::move(KernelFunc));
2482  }
2483 
2484  template <typename KernelName = detail::auto_name, typename KernelType,
2485  typename PropertiesT, int Dims>
2486  std::enable_if_t<
2487  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2488  parallel_for(nd_range<Dims> Range, PropertiesT Properties,
2489  _KERNELFUNCPARAM(KernelFunc)) {
2490  parallel_for_impl<KernelName>(Range, Properties, std::move(KernelFunc));
2491  }
2492 
2494 
2495  template <typename KernelName = detail::auto_name, typename PropertiesT,
2496  typename... RestT>
2497  std::enable_if_t<
2498  (sizeof...(RestT) > 1) &&
2499  detail::AreAllButLastReductions<RestT...>::value &&
2500  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2501  parallel_for(range<1> Range, PropertiesT Properties, RestT &&...Rest) {
2502  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2503  UnsupportedGraphFeatures::sycl_reductions>();
2504  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2505  std::forward<RestT>(Rest)...);
2506  }
2507 
2508  template <typename KernelName = detail::auto_name, typename PropertiesT,
2509  typename... RestT>
2510  std::enable_if_t<
2511  (sizeof...(RestT) > 1) &&
2512  detail::AreAllButLastReductions<RestT...>::value &&
2513  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2514  parallel_for(range<2> Range, PropertiesT Properties, RestT &&...Rest) {
2515  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2516  UnsupportedGraphFeatures::sycl_reductions>();
2517  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2518  std::forward<RestT>(Rest)...);
2519  }
2520 
2521  template <typename KernelName = detail::auto_name, typename PropertiesT,
2522  typename... RestT>
2523  std::enable_if_t<
2524  (sizeof...(RestT) > 1) &&
2525  detail::AreAllButLastReductions<RestT...>::value &&
2526  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2527  parallel_for(range<3> Range, PropertiesT Properties, RestT &&...Rest) {
2528  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2529  UnsupportedGraphFeatures::sycl_reductions>();
2530  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2531  std::forward<RestT>(Rest)...);
2532  }
2533 
2534  template <typename KernelName = detail::auto_name, typename... RestT>
2535  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2536  parallel_for(range<1> Range, RestT &&...Rest) {
2537  parallel_for<KernelName>(Range,
2538  ext::oneapi::experimental::empty_properties_t{},
2539  std::forward<RestT>(Rest)...);
2540  }
2541 
2542  template <typename KernelName = detail::auto_name, typename... RestT>
2543  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2544  parallel_for(range<2> Range, RestT &&...Rest) {
2545  parallel_for<KernelName>(Range,
2546  ext::oneapi::experimental::empty_properties_t{},
2547  std::forward<RestT>(Rest)...);
2548  }
2549 
2550  template <typename KernelName = detail::auto_name, typename... RestT>
2551  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2552  parallel_for(range<3> Range, RestT &&...Rest) {
2553  parallel_for<KernelName>(Range,
2554  ext::oneapi::experimental::empty_properties_t{},
2555  std::forward<RestT>(Rest)...);
2556  }
2557 
2558  template <typename KernelName = detail::auto_name, int Dims,
2559  typename PropertiesT, typename... RestT>
2560  std::enable_if_t<
2561  (sizeof...(RestT) > 1) &&
2562  detail::AreAllButLastReductions<RestT...>::value &&
2563  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2564  parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
2565  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2566  UnsupportedGraphFeatures::sycl_reductions>();
2567  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2568  std::forward<RestT>(Rest)...);
2569  }
2570 
2571  template <typename KernelName = detail::auto_name, int Dims,
2572  typename... RestT>
2573  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2574  parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
2575  parallel_for<KernelName>(Range,
2576  ext::oneapi::experimental::empty_properties_t{},
2577  std::forward<RestT>(Rest)...);
2578  }
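  // Illustrative usage sketch (editorial): the reduction form accepted by the
  // parallel_for overloads above; `Queue`, `Data`, `SumPtr`, and `N` are
  // assumptions.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     auto Red = sycl::reduction(SumPtr, sycl::plus<int>());
  //     CGH.parallel_for(sycl::range<1>{N}, Red,
  //                      [=](sycl::id<1> I, auto &Sum) { Sum += Data[I]; });
  //   });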
2579 
2581 
2582  template <typename KernelName = detail::auto_name, typename KernelType,
2583  int Dims, typename PropertiesT>
2584  void parallel_for_work_group(range<Dims> NumWorkGroups, PropertiesT Props,
2585  _KERNELFUNCPARAM(KernelFunc)) {
2586  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2587  PropertiesT>(NumWorkGroups, Props,
2588  KernelFunc);
2589  }
2590 
2591  template <typename KernelName = detail::auto_name, typename KernelType,
2592  int Dims, typename PropertiesT>
2593  void parallel_for_work_group(range<Dims> NumWorkGroups,
2594  range<Dims> WorkGroupSize, PropertiesT Props,
2595  _KERNELFUNCPARAM(KernelFunc)) {
2596  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2597  PropertiesT>(
2598  NumWorkGroups, WorkGroupSize, Props, KernelFunc);
2599  }
2600 
2601  // Clean up KERNELFUNC macro.
2602 #undef _KERNELFUNCPARAM
2603 
2604  // Explicit copy operations API
2605 
2613  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2614  access::target AccessTarget,
2615  access::placeholder IsPlaceholder = access::placeholder::false_t>
2616  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2617  std::shared_ptr<T_Dst> Dst) {
2618  if (Src.is_placeholder())
2619  checkIfPlaceholderIsBoundToHandler(Src);
2620 
2621  throwIfActionIsCreated();
2622  static_assert(isValidTargetForExplicitOp(AccessTarget),
2623  "Invalid accessor target for the copy method.");
2624  static_assert(isValidModeForSourceAccessor(AccessMode),
2625  "Invalid accessor mode for the copy method.");
2626  // Make sure the data the shared_ptr points to is not released until we
2627  // finish working with it.
2628  addLifetimeSharedPtrStorage(Dst);
2629  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2630  copy(Src, RawDstPtr);
2631  }
2632 
2640  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2641  access::target AccessTarget,
2642  access::placeholder IsPlaceholder = access::placeholder::false_t>
2643  void
2644  copy(std::shared_ptr<T_Src> Src,
2645  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2646  if (Dst.is_placeholder())
2647  checkIfPlaceholderIsBoundToHandler(Dst);
2648 
2649  throwIfActionIsCreated();
2650  static_assert(isValidTargetForExplicitOp(AccessTarget),
2651  "Invalid accessor target for the copy method.");
2652  static_assert(isValidModeForDestinationAccessor(AccessMode),
2653  "Invalid accessor mode for the copy method.");
2654  // TODO: Add static_assert with is_device_copyable when vec is
2655  // device-copyable.
2656  // Make sure the data the shared_ptr points to is not released until we
2657  // finish working with it.
2658  addLifetimeSharedPtrStorage(Src);
2659  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2660  copy(RawSrcPtr, Dst);
2661  }
2662 
2670  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2671  access::target AccessTarget,
2672  access::placeholder IsPlaceholder = access::placeholder::false_t>
2673  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2674  T_Dst *Dst) {
2675  if (Src.is_placeholder())
2676  checkIfPlaceholderIsBoundToHandler(Src);
2677 
2678  throwIfActionIsCreated();
2679  static_assert(isValidTargetForExplicitOp(AccessTarget),
2680  "Invalid accessor target for the copy method.");
2681  static_assert(isValidModeForSourceAccessor(AccessMode),
2682  "Invalid accessor mode for the copy method.");
2684 
2685  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2686  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2687 
2688  MSrcPtr = static_cast<void *>(AccImpl.get());
2689  MDstPtr = static_cast<void *>(Dst);
2690  // Store copy of accessor to the local storage to make sure it is alive
2691  // until we finish
2692  addAccessorReq(std::move(AccImpl));
2693  }
2694 
2702  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2703  access::target AccessTarget,
2704  access::placeholder IsPlaceholder = access::placeholder::false_t>
2705  void
2706  copy(const T_Src *Src,
2707  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2708  if (Dst.is_placeholder())
2709  checkIfPlaceholderIsBoundToHandler(Dst);
2710 
2711  throwIfActionIsCreated();
2712  static_assert(isValidTargetForExplicitOp(AccessTarget),
2713  "Invalid accessor target for the copy method.");
2714  static_assert(isValidModeForDestinationAccessor(AccessMode),
2715  "Invalid accessor mode for the copy method.");
2716  // TODO: Add static_assert with is_device_copyable when vec is
2717  // device-copyable.
2718 
2720 
2721  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2722  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2723 
2724  MSrcPtr = const_cast<T_Src *>(Src);
2725  MDstPtr = static_cast<void *>(AccImpl.get());
2726  // Store copy of accessor to the local storage to make sure it is alive
2727  // until we finish
2728  addAccessorReq(std::move(AccImpl));
2729  }
2730 
2738  template <
2739  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2740  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2741  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2742  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2743  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2744  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2745  IsPlaceholder_Src>
2746  Src,
2747  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2748  IsPlaceholder_Dst>
2749  Dst) {
2750  if (Src.is_placeholder())
2751  checkIfPlaceholderIsBoundToHandler(Src);
2752  if (Dst.is_placeholder())
2753  checkIfPlaceholderIsBoundToHandler(Dst);
2754 
2755  throwIfActionIsCreated();
2756  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2757  "Invalid source accessor target for the copy method.");
2758  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2759  "Invalid destination accessor target for the copy method.");
2760  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2761  "Invalid source accessor mode for the copy method.");
2762  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2763  "Invalid destination accessor mode for the copy method.");
2764  if (Dst.get_size() < Src.get_size())
2765  throw sycl::exception(make_error_code(errc::invalid),
2766  "The destination accessor size is too small to "
2767  "copy the memory into.");
2768 
2769  if (copyAccToAccHelper(Src, Dst))
2770  return;
2772 
2773  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2774  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2775 
2776  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2777  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2778 
2779  MSrcPtr = AccImplSrc.get();
2780  MDstPtr = AccImplDst.get();
2781  // Store copy of accessor to the local storage to make sure it is alive
2782  // until we finish
2783  addAccessorReq(std::move(AccImplSrc));
2784  addAccessorReq(std::move(AccImplDst));
2785  }
2786 
2791  template <typename T, int Dims, access::mode AccessMode,
2792  access::target AccessTarget,
2793  access::placeholder IsPlaceholder = access::placeholder::false_t>
2794  void
2795  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2796  if (Acc.is_placeholder())
2797  checkIfPlaceholderIsBoundToHandler(Acc);
2798 
2799  throwIfActionIsCreated();
2800  static_assert(isValidTargetForExplicitOp(AccessTarget),
2801  "Invalid accessor target for the update_host method.");
2802  setType(detail::CGType::UpdateHost);
2803 
2804  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2805  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2806 
2807  MDstPtr = static_cast<void *>(AccImpl.get());
2808  addAccessorReq(std::move(AccImpl));
2809  }
2810 
2811 public:
2820  template <typename T, int Dims, access::mode AccessMode,
2821  access::target AccessTarget,
2822  access::placeholder IsPlaceholder = access::placeholder::false_t,
2823  typename PropertyListT = property_list>
2824  void
2825  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2826  Dst,
2827  const T &Pattern) {
2828  if (Dst.is_placeholder())
2829  checkIfPlaceholderIsBoundToHandler(Dst);
2830 
2831  throwIfActionIsCreated();
2832  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2833  // TODO: add a check that T is an integral scalar value or a SYCL vector type
2834  static_assert(isValidTargetForExplicitOp(AccessTarget),
2835  "Invalid accessor target for the fill method.");
2836  // CG::Fill will result in piEnqueueFillBuffer/Image, which requires that the
2837  // memory data is contiguous. Thus we check range and offset when Dims > 1.
2838  // Images don't allow ranged accessors and are fine.
2839  if constexpr (isBackendSupportedFillSize(sizeof(T)) &&
2840  ((Dims <= 1) || isImageOrImageArray(AccessTarget))) {
2841  StageFillCG(Dst, Pattern);
2842  } else if constexpr (Dims == 0) {
2843  // Special case for zero-dim accessors.
2844  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2845  range<1>(1), [=](id<1>) { Dst = Pattern; });
2846  } else {
2847  // Dim > 1
2848  bool OffsetUsable = (Dst.get_offset() == sycl::id<Dims>{});
2849  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2850  bool RangesUsable =
2851  (AccBase->getAccessRange() == AccBase->getMemoryRange());
2852  if (OffsetUsable && RangesUsable &&
2853  isBackendSupportedFillSize(sizeof(T))) {
2854  StageFillCG(Dst, Pattern);
2855  } else {
2856  range<Dims> Range = Dst.get_range();
2857  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2858  Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
2859  }
2860  }
2861  }
2862 
2869  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2870  throwIfActionIsCreated();
2871  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2872  static_assert(is_device_copyable<T>::value,
2873  "Pattern must be device copyable");
2874  if (getDeviceBackend() == backend::ext_oneapi_level_zero) {
2875  parallel_for<__usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2876  T *CastedPtr = static_cast<T *>(Ptr);
2877  CastedPtr[Index] = Pattern;
2878  });
2879  } else {
2880  this->fill_impl(Ptr, &Pattern, sizeof(T), Count);
2881  }
2882  }
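  // Illustrative usage sketch (editorial): filling a USM allocation; `Queue`,
  // `Ptr`, and `Count` are assumptions.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.fill(Ptr, 42, Count); // writes Count copies of the int pattern
  //   });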
2883 
2887  void ext_oneapi_barrier() {
2888  throwIfActionIsCreated();
2889  setType(detail::CGType::Barrier);
2890  }
2891 
2898  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2899 
2910  void memcpy(void *Dest, const void *Src, size_t Count);
2911 
2922  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2923  this->memcpy(Dest, Src, Count * sizeof(T));
2924  }
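  // Illustrative usage sketch (editorial): memcpy counts bytes while copy
  // counts elements. `Queue`, `Dst`, `Src` (float pointers), and `N` are
  // assumptions.
  //
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.memcpy(Dst, Src, N * sizeof(float)); // byte count
  //   });
  //   Queue.submit([&](sycl::handler &CGH) {
  //     CGH.copy(Src, Dst, N); // element count; same transfer as above
  //   });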
2925 
2933  void memset(void *Dest, int Value, size_t Count);
2934 
2941  void prefetch(const void *Ptr, size_t Count);
2942 
2949  void mem_advise(const void *Ptr, size_t Length, int Advice);
2950 
2967  template <typename T = unsigned char,
2968  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2969  void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src,
2970  size_t SrcPitch, size_t Width, size_t Height);
2971 
2985  template <typename T>
2986  void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest,
2987  size_t DestPitch, size_t Width, size_t Height);
2988 
3004  template <typename T = unsigned char,
3005  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
3006  void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value,
3007  size_t Width, size_t Height);
3008 
3021  template <typename T>
3022  void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern,
3023  size_t Width, size_t Height);
3024 
3033  template <typename T, typename PropertyListT>
3034  void memcpy(ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3035  const void *Src, size_t NumBytes = sizeof(T),
3036  size_t DestOffset = 0) {
3037  throwIfGraphAssociated<
3038  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3039  sycl_ext_oneapi_device_global>();
3040  if (sizeof(T) < DestOffset + NumBytes)
3041  throw sycl::exception(make_error_code(errc::invalid),
3042  "Copy to device_global is out of bounds.");
3043 
3044  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3045  ext::oneapi::experimental::device_image_scope_key>();
3046 
3047  if (!detail::isDeviceGlobalUsedInKernel(&Dest)) {
3048  // If the corresponding device_global isn't used in any kernels, we fall
3049  // back to doing the memory operation on host-only.
3050  memcpyToHostOnlyDeviceGlobal(&Dest, Src, sizeof(T), IsDeviceImageScoped,
3051  NumBytes, DestOffset);
3052  return;
3053  }
3054 
3055  memcpyToDeviceGlobal(&Dest, Src, IsDeviceImageScoped, NumBytes, DestOffset);
3056  }
3057 
3066  template <typename T, typename PropertyListT>
3067  void
3068  memcpy(void *Dest,
3069  const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3070  size_t NumBytes = sizeof(T), size_t SrcOffset = 0) {
3071  throwIfGraphAssociated<
3072  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3073  sycl_ext_oneapi_device_global>();
3074  if (sizeof(T) < SrcOffset + NumBytes)
3075  throw sycl::exception(make_error_code(errc::invalid),
3076  "Copy from device_global is out of bounds.");
3077 
3078  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3079  ext::oneapi::experimental::device_image_scope_key>();
3080 
3081  if (!detail::isDeviceGlobalUsedInKernel(&Src)) {
3082  // If the corresponding device_global isn't used in any kernels, we fall
3083  // back to doing the memory operation on host-only.
3084  memcpyFromHostOnlyDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3085  SrcOffset);
3086  return;
3087  }
3088 
3089  memcpyFromDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3090  SrcOffset);
3091  }
3092 
3102  template <typename T, typename PropertyListT>
3103  void copy(const std::remove_all_extents_t<T> *Src,
3104  ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3105  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3106  size_t StartIndex = 0) {
3107  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3108  StartIndex * sizeof(std::remove_all_extents_t<T>));
3109  }
3110 
3121  template <typename T, typename PropertyListT>
3122  void
3123  copy(const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3124  std::remove_all_extents_t<T> *Dest,
3125  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3126  size_t StartIndex = 0) {
3127  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3128  StartIndex * sizeof(std::remove_all_extents_t<T>));
3129  }
3133  void ext_oneapi_graph(ext::oneapi::experimental::command_graph<
3134  ext::oneapi::experimental::graph_state::executable>
3135  Graph);
3136 
3145  void ext_oneapi_copy(
3146  const void *Src, ext::oneapi::experimental::image_mem_handle Dest,
3147  const ext::oneapi::experimental::image_descriptor &DestImgDesc);
3148 
3169  void ext_oneapi_copy(
3170  const void *Src, sycl::range<3> SrcOffset, sycl::range<3> SrcExtent,
3171  ext::oneapi::experimental::image_mem_handle Dest,
3172  sycl::range<3> DestOffset,
3173  const ext::oneapi::experimental::image_descriptor &DestImgDesc,
3174  sycl::range<3> CopyExtent);
3175 
3185  void ext_oneapi_copy(
3186  const ext::oneapi::experimental::image_mem_handle Src, void *Dest,
3187  const ext::oneapi::experimental::image_descriptor &SrcImgDesc);
3188 
3210  void
3211  ext_oneapi_copy(const ext::oneapi::experimental::image_mem_handle Src,
3212  sycl::range<3> SrcOffset,
3213  const ext::oneapi::experimental::image_descriptor &SrcImgDesc,
3214  void *Dest, sycl::range<3> DestOffset,
3215  sycl::range<3> DestExtent, sycl::range<3> CopyExtent);
3216 
3227  void ext_oneapi_copy(
3228  const void *Src, void *Dest,
3229  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3230  size_t DeviceRowPitch);
3231 
3239  void
3240  ext_oneapi_copy(const ext::oneapi::experimental::image_mem_handle Src,
3241  ext::oneapi::experimental::image_mem_handle Dest,
3242  const ext::oneapi::experimental::image_descriptor &ImageDesc);
3243 
3266  void ext_oneapi_copy(
3267  const void *Src, sycl::range<3> SrcOffset, void *Dest,
3268  sycl::range<3> DestOffset,
3269  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3270  size_t DeviceRowPitch, sycl::range<3> HostExtent,
3271  sycl::range<3> CopyExtent);
3272 
3274  // semaphore to the queue.
3279  void ext_oneapi_wait_external_semaphore(
3280  sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle);
3281 
3283  // semaphore to the queue.
3290  void ext_oneapi_wait_external_semaphore(
3291  sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle,
3292  uint64_t WaitValue);
3293 
3300  void ext_oneapi_signal_external_semaphore(
3301  sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle);
3302 
3312  void ext_oneapi_signal_external_semaphore(
3313  sycl::ext::oneapi::experimental::external_semaphore SemaphoreHandle,
3314  uint64_t SignalValue);
3315 
3316 private:
3317  std::shared_ptr<detail::handler_impl> impl;
3318  std::shared_ptr<detail::queue_impl> MQueue;
3319 
3320  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
3321  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
3322  detail::string MKernelName;
3324  std::shared_ptr<detail::kernel_impl> MKernel;
3326  void *MSrcPtr = nullptr;
3328  void *MDstPtr = nullptr;
3330  size_t MLength = 0;
3332  std::vector<unsigned char> MPattern;
3334  std::unique_ptr<detail::HostKernelBase> MHostKernel;
3335 
3336  detail::code_location MCodeLoc = {};
3337  bool MIsFinalized = false;
3338  event MLastEvent;
3339 
3340  // Make queue_impl class friend to be able to call finalize method.
3341  friend class detail::queue_impl;
3342  // Make accessor class friend to keep the list of associated accessors.
3343  template <typename DataT, int Dims, access::mode AccMode,
3344  access::target AccTarget, access::placeholder IsPlaceholder,
3345  typename PropertyListT>
3346  friend class accessor;
3347  friend device detail::getDeviceFromHandler(handler &);
3348 
3349  template <typename DataT, int Dimensions, access::mode AccessMode,
3350  access::target AccessTarget, access::placeholder IsPlaceholder>
3351  friend class detail::image_accessor;
3352  // Make stream class friend to be able to keep the list of associated streams
3353  friend class stream;
3354  friend class detail::stream_impl;
3355  // Make reduction friends to store buffers and arrays created for it
3356  // in handler from reduction methods.
3357  template <typename T, class BinaryOperation, int Dims, size_t Extent,
3358  bool ExplicitIdentity, typename RedOutVar>
3359  friend class detail::reduction_impl_algo;
3360 
3362  template <class FunctorTy>
3363  friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);
3364 
3365  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3366  typename PropertiesT, typename... RestT>
3367  friend void detail::reduction_parallel_for(handler &CGH, range<Dims> Range,
3368  PropertiesT Properties,
3369  RestT... Rest);
3370 
3371  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3372  typename PropertiesT, typename... RestT>
3373  friend void
3374  detail::reduction_parallel_for(handler &CGH, nd_range<Dims> Range,
3375  PropertiesT Properties, RestT... Rest);
3376 
3377 #ifndef __SYCL_DEVICE_ONLY__
3378  friend void detail::associateWithHandler(handler &,
3379  detail::AccessorBaseHost *,
3380  access::target);
3381  friend void detail::associateWithHandler(
3382  handler &, detail::UnsampledImageAccessorBaseHost *, image_target);
3383  friend void detail::associateWithHandler(
3384  handler &, detail::SampledImageAccessorBaseHost *, image_target);
3385 #endif
3386 
3387  friend class ::MockHandler;
3388  friend class detail::queue_impl;
3389 
3390  // Make pipe class friend to be able to call ext_intel_read/write_host_pipe
3391  // method.
3392  template <class _name, class _dataT, int32_t _min_capacity,
3393  class _propertiesT, class>
3394  friend class ext::intel::experimental::pipe;
3395 
3396  template <class Obj>
3397  friend const decltype(Obj::impl) &
3398  sycl::detail::getSyclObjImpl(const Obj &SyclObject);
3399 
3406  void ext_intel_read_host_pipe(const std::string &Name, void *Ptr, size_t Size,
3407  bool Block = false) {
3408  ext_intel_read_host_pipe(detail::string_view(Name), Ptr, Size, Block);
3409  }
3410  void ext_intel_read_host_pipe(detail::string_view Name, void *Ptr,
3411  size_t Size, bool Block = false);
3412 
3419  void ext_intel_write_host_pipe(const std::string &Name, void *Ptr,
3420  size_t Size, bool Block = false) {
3421  ext_intel_write_host_pipe(detail::string_view(Name), Ptr, Size, Block);
3422  }
3423  void ext_intel_write_host_pipe(detail::string_view Name, void *Ptr,
3424  size_t Size, bool Block = false);
3427 
3428  bool DisableRangeRounding();
3429 
3430  bool RangeRoundingTrace();
3431 
3432  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
3433  size_t &MinRange);
3434 
3435  template <typename WrapperT, typename TransformedArgType, int Dims,
3436  typename KernelType,
3437  std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
3438  KernelType, TransformedArgType>::value> * = nullptr>
3439  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3440  range<Dims> UserRange) {
3441  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
3442  KernelType>{UserRange, KernelFunc};
3443  }
3444 
3445  template <typename WrapperT, typename TransformedArgType, int Dims,
3446  typename KernelType,
3447  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
3448  KernelType, TransformedArgType>::value> * = nullptr>
3449  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3450  range<Dims> UserRange) {
3451  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
3452  UserRange, KernelFunc};
3453  }
3454 
3455  const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;
3456 
3457  // Checks if 2D memory operations are supported by the underlying platform.
3458  bool supportsUSMMemcpy2D();
3459  bool supportsUSMFill2D();
3460  bool supportsUSMMemset2D();
3461 
3462  // Helper function for getting a loose bound on work-items.
3463  id<2> computeFallbackKernelBounds(size_t Width, size_t Height);
3464 
3465  // Function to get information about the backend for which the code is
3466  // compiled.
3467  backend getDeviceBackend() const;
3468 
3469  // Common function for launching a 2D USM memcpy kernel to avoid redefinitions
3470  // of the kernel from copy and memcpy.
3471  template <typename T>
3472  void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
3473  void *Dest, size_t DestPitch, size_t Width,
3474  size_t Height) {
3475  // Otherwise the data is accessible on the device so we do the operation
3476  // there instead.
3477  // Limit number of work items to be resistant to big copies.
3478  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3479  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3480  parallel_for<__usmmemcpy2d<T>>(
3481  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3482  T *CastedDest = static_cast<T *>(Dest);
3483  const T *CastedSrc = static_cast<const T *>(Src);
3484  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3485  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3486  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3487  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3488  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3489  CastedSrc[adjustedIndex[0] * SrcPitch + adjustedIndex[1]];
3490  }
3491  }
3492  }
3493  });
3494  }
3495 
3496  // Common function for launching a 2D USM memcpy host-task to avoid
3497  // redefinitions of the kernel from copy and memcpy.
3498  template <typename T>
3499  void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
3500  void *Dest, size_t DestPitch,
3501  size_t Width, size_t Height) {
3502  // If both pointers are host USM or unknown (assumed non-USM) we use a
3503  // host-task to satisfy dependencies.
3504  host_task([=] {
3505  const T *CastedSrc = static_cast<const T *>(Src);
3506  T *CastedDest = static_cast<T *>(Dest);
3507  for (size_t I = 0; I < Height; ++I) {
3508  const T *SrcItBegin = CastedSrc + SrcPitch * I;
3509  T *DestItBegin = CastedDest + DestPitch * I;
3510  std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
3511  }
3512  });
3513  }
3514 
3515  // StageFillCG(): supporting function for fill().
3516  template <typename T, int Dims, access::mode AccessMode,
3517  access::target AccessTarget,
3519  typename PropertyListT = property_list>
3520  void StageFillCG(
3521  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3522  Dst,
3523  const T &Pattern) {
3524  setType(detail::CGType::Fill);
3525  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
3526  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3527 
3528  MDstPtr = static_cast<void *>(AccImpl.get());
3529  addAccessorReq(std::move(AccImpl));
3530 
3531  MPattern.resize(sizeof(T));
3532  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
3533  *PatternPtr = Pattern;
3534  }
3535 
3536  // Common function for launching a 2D USM fill kernel to avoid redefinitions
3537  // of the kernel from memset and fill.
3538  template <typename T>
3539  void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
3540  const T &Pattern, size_t Width,
3541  size_t Height) {
3542  // Otherwise the data is accessible on the device so we do the operation
3543  // there instead.
3544  // Limit number of work items to be resistant to big fill operations.
3545  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3546  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3547  parallel_for<__usmfill2d<T>>(
3548  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3549  T *CastedDest = static_cast<T *>(Dest);
3550  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3551  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3552  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3553  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3554  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3555  Pattern;
3556  }
3557  }
3558  }
3559  });
3560  }
3561 
3562  // Common function for launching a 2D USM fill kernel or host_task to avoid
3563  // redefinitions of the kernel from memset and fill.
3564  template <typename T>
3565  void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
3566  const T &Pattern, size_t Width,
3567  size_t Height) {
3568  // If the pointer is host USM or unknown (assumed non-USM) we use a
3569  // host-task to satisfy dependencies.
3570  host_task([=] {
3571  T *CastedDest = static_cast<T *>(Dest);
3572  for (size_t I = 0; I < Height; ++I) {
3573  T *ItBegin = CastedDest + DestPitch * I;
3574  std::fill(ItBegin, ItBegin + Width, Pattern);
3575  }
3576  });
3577  }
3578 
3579  // Implementation of USM fill using command for native fill.
3580  void fill_impl(void *Dest, const void *Value, size_t ValueSize, size_t Count);
3581 
3582  // Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
3583  void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
3584  size_t SrcPitch, size_t Width, size_t Height);
3585 
3586  // Untemplated version of ext_oneapi_fill2d using command for native 2D fill.
3587  void ext_oneapi_fill2d_impl(void *Dest, size_t DestPitch, const void *Value,
3588  size_t ValueSize, size_t Width, size_t Height);
3589 
3590  // Implementation of ext_oneapi_memset2d using command for native 2D memset.
3591  void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value,
3592  size_t Width, size_t Height);
3593 
3594  // Implementation of memcpy to device_global.
3595  void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src,
3596  bool IsDeviceImageScoped, size_t NumBytes,
3597  size_t Offset);
3598 
3599  // Implementation of memcpy from device_global.
3600  void memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3601  bool IsDeviceImageScoped, size_t NumBytes,
3602  size_t Offset);
3603 
3604  // Implementation of memcpy to an unregistered device_global.
3605  void memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
3606  const void *Src, size_t DeviceGlobalTSize,
3607  bool IsDeviceImageScoped, size_t NumBytes,
3608  size_t Offset);
3609 
3610  // Implementation of memcpy from an unregistered device_global.
3611  void memcpyFromHostOnlyDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3612  bool IsDeviceImageScoped, size_t NumBytes,
3613  size_t Offset);
3614 
3615  template <typename T, int Dims, access::mode AccessMode,
3616  access::target AccessTarget,
3618  typename PropertyListT = property_list>
3619  void checkIfPlaceholderIsBoundToHandler(
3620  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3621  Acc) {
3622  auto *AccBase = reinterpret_cast<detail::AccessorBaseHost *>(&Acc);
3623  detail::AccessorImplHost *Req = detail::getSyclObjImpl(*AccBase).get();
3624  if (HasAssociatedAccessor(Req, AccessTarget))
3625  throw sycl::exception(make_error_code(errc::kernel_argument),
3626  "placeholder accessor must be bound by calling "
3627  "handler::require() before it can be used.");
3628  }
3629 
3630  // Changing values in this will break ABI/API.
3631  enum class StableKernelCacheConfig : int32_t {
3632  Default = 0,
3633  LargeSLM = 1,
3634  LargeData = 2
3635  };
3636 
3637  // Set value of the gpu cache configuration for the kernel.
3638  void setKernelCacheConfig(StableKernelCacheConfig);
3639  // Set the value of the kernel-is-cooperative flag.
3640  void setKernelIsCooperative(bool);
3641 
3642  // Set the CUDA thread-block-cluster launch flag and the launch bounds.
3643  void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims);
3644 
3645  template <
3646  ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
3647  void throwIfGraphAssociated() const {
3648 
3649  if (getCommandGraph()) {
3650  std::string FeatureString =
3651  ext::oneapi::experimental::detail::UnsupportedFeatureToString(
3652  FeatureT);
3653  throw sycl::exception(sycl::make_error_code(errc::invalid),
3654  "The " + FeatureString +
3655  " feature is not yet available "
3656  "for use with the SYCL Graph extension.");
3657  }
3658  }
3659 
3660  // Set that an ND Range was used during a call to parallel_for
3661  void setNDRangeUsed(bool Value);
3662 
3663  inline void internalProfilingTagImpl() {
3664  throwIfActionIsCreated();
3665  setType(detail::CGType::ProfilingTag);
3666  }
3667 
3668  void addAccessorReq(detail::AccessorImplPtr Accessor);
3669 
3670  void addLifetimeSharedPtrStorage(std::shared_ptr<const void> SPtr);
3671 
3672  void addArg(detail::kernel_param_kind_t ArgKind, void *Req, int AccessTarget,
3673  int ArgIndex);
3674  void clearArgs();
3675  void setArgsToAssociatedAccessors();
3676 
3677  bool HasAssociatedAccessor(detail::AccessorImplHost *Req,
3678  access::target AccessTarget) const;
3679 
3680  template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) {
3681  if constexpr (Dims == 3) {
3682  return Range;
3683  } else {
3684  sycl::range<3> Res{0, 0, 0};
3685  for (int I = 0; I < Dims; ++I)
3686  Res[I] = Range[I];
3687  return Res;
3688  }
3689  }
3690 
3691  template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) {
3692  if constexpr (Dims == 3) {
3693  return Id;
3694  } else {
3695  sycl::id<3> Res{0, 0, 0};
3696  for (int I = 0; I < Dims; ++I)
3697  Res[I] = Id[I];
3698  return Res;
3699  }
3700  }
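  // Editorial note with a worked example: padRange/padId widen a 1- or
  // 2-dimensional shape to the 3-dimensional form the runtime stores,
  // zero-filling the unused dimensions; e.g. padRange(range<2>{8, 4})
  // yields range<3>{8, 4, 0}.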
3701 
3702  template <int Dims>
3703  void setNDRangeDescriptor(sycl::range<Dims> N,
3704  bool SetNumWorkGroups = false) {
3705  return setNDRangeDescriptorPadded(padRange(N), SetNumWorkGroups, Dims);
3706  }
3707  template <int Dims>
3708  void setNDRangeDescriptor(sycl::range<Dims> NumWorkItems,
3709  sycl::id<Dims> Offset) {
3710  return setNDRangeDescriptorPadded(padRange(NumWorkItems), padId(Offset),
3711  Dims);
3712  }
3713  template <int Dims>
3714  void setNDRangeDescriptor(sycl::nd_range<Dims> ExecutionRange) {
3715  return setNDRangeDescriptorPadded(
3716  padRange(ExecutionRange.get_global_range()),
3717  padRange(ExecutionRange.get_local_range()),
3718  padId(ExecutionRange.get_offset()), Dims);
3719  }
3720 
3721  void setNDRangeDescriptorPadded(sycl::range<3> N, bool SetNumWorkGroups,
3722  int Dims);
3723  void setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
3724  sycl::id<3> Offset, int Dims);
3725  void setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
3726  sycl::range<3> LocalSize, sycl::id<3> Offset,
3727  int Dims);
3728 
3730 
3731 protected:
3733  void depends_on(const detail::EventImplPtr &Event);
3735  void depends_on(const std::vector<detail::EventImplPtr> &Events);
3736 };
3737 
3738 namespace detail {
3739 class HandlerAccess {
3740 public:
3741  static void internalProfilingTagImpl(handler &Handler) {
3742  Handler.internalProfilingTagImpl();
3743  }
3744 };
3745 } // namespace detail
3746 
3747 } // namespace _V1
3748 } // namespace sycl
Definition: handler.hpp:2584
void require(ext::oneapi::experimental::dynamic_parameter< accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder >> dynamicParamAcc)
Requires access to the memory object associated with the placeholder accessor contained in a dynamic_...
Definition: handler.hpp:1927
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2449
void single_task(kernel Kernel)
Invokes a SYCL kernel.
Definition: handler.hpp:2152
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(nd_range< Dims > Range, RestT &&...Rest)
Definition: handler.hpp:2574
void copy(const T_Src *Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of memory pointed by Src into the memory object accessed by Dst.
Definition: handler.hpp:2706
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2478
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > ext_codeplay_enqueue_native_command(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func immediately.
Definition: handler.hpp:2059
void copy(std::shared_ptr< T_Src > Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of memory pointed by Src into the memory object accessed by Dst.
Definition: handler.hpp:2644
handler(const handler &)=delete
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, T_Dst *Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2673
typename std::remove_cv_t< std::remove_reference_t< T > > remove_cv_ref_t
Definition: handler.hpp:1948
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2488
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 1 > Range, RestT &&...Rest)
Definition: handler.hpp:2536
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 2 > Range, RestT &&...Rest)
Definition: handler.hpp:2544
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 3 > Range, RestT &&...Rest)
Definition: handler.hpp:2552
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2380
void ext_oneapi_barrier()
Prevents any commands submitted afterward to this queue from executing until all commands previously ...
Definition: handler.hpp:2887
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2458
void memcpy(void *Dest, const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, size_t NumBytes=sizeof(T), size_t SrcOffset=0)
Copies data from a device_global to USM memory.
Definition: handler.hpp:3068
void parallel_for_work_group(range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-...
Definition: handler.hpp:2117
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:2051
void set_arg(int argIndex, ext::oneapi::experimental::raw_kernel_arg &&Arg)
Definition: handler.hpp:1998
void set_specialization_constant(typename std::remove_reference_t< decltype(SpecName)>::value_type Value)
Definition: handler.hpp:1868
void set_args(Ts &&...Args)
Sets arguments for OpenCL interoperability kernels.
Definition: handler.hpp:2007
void memcpy(ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, const void *Src, size_t NumBytes=sizeof(T), size_t DestOffset=0)
Copies data from a USM memory region to a device_global.
Definition: handler.hpp:3034
void parallel_for(range< 2 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2032
void set_arg(int ArgIndex, accessor< DataT, Dims, AccessMode, AccessTarget, IsPlaceholder > Arg)
Definition: handler.hpp:1980
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > Range, PropertiesT Properties, RestT &&...Rest)
Reductions.
Definition: handler.hpp:2501
std::remove_reference_t< decltype(SpecName)>::value_type get_specialization_constant() const
Definition: handler.hpp:1883
void set_arg(int ArgIndex, local_accessor< DataT, Dims > Arg)
Definition: handler.hpp:1986
A unique identifier of an item in an index space.
Definition: id.hpp:36
Identifies an instance of the function object executing at each point in a range.
Definition: item.hpp:37
id< Dimensions > get_id() const
Definition: item.hpp:55
range< Dimensions > get_range() const
Definition: item.hpp:69
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:71
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
id< Dimensions > get_global_id() const
Definition: nd_item.hpp:52
id< Dimensions > get_offset() const
Definition: nd_item.hpp:187
range< Dimensions > get_global_range() const
Definition: nd_item.hpp:158
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
range< Dimensions > get_global_range() const
Definition: nd_range.hpp:43
range< Dimensions > get_local_range() const
Definition: nd_range.hpp:45
id< Dimensions > get_offset() const
Definition: nd_range.hpp:50
Objects of the property_list class are containers for the SYCL properties.
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
#define _KERNELFUNCPARAMTYPE
Definition: handler.hpp:77
#define __SYCL_KERNEL_ATTR__
Definition: handler.hpp:1539
#define _KERNELFUNCPARAM(a)
Definition: handler.hpp:81
__SYCL_EXTERN_STREAM_ATTRS ostream cout
Linked to standard output.
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1171
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1170
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:31
void * getValueFromDynamicParameter(ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase)
Definition: handler.cpp:77
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:17
decltype(member_ptr_helper(&F::operator())) argument_helper(int)
Definition: handler.hpp:205
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:313
static Arg member_ptr_helper(RetType(Func::*)(Arg) const)
bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr)
Definition: handler.cpp:42
static std::enable_if_t< std::is_unsigned_v< T >, bool > multiply_with_overflow_check(T &dst, T x, T y)
Definition: handler.hpp:419
std::enable_if_t< std::is_same_v< T, range< Dims > >||std::is_same_v< T, id< Dims > > > checkValueRange(const T &V)
Definition: handler.hpp:283
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType, ArgType >::value > runKernelWithArg(KernelType KernelName, ArgType Arg)
Definition: cg_types.hpp:141
std::shared_ptr< event_impl > EventImplPtr
Definition: handler.hpp:184
void markBufferAsInternal(const std::shared_ptr< buffer_impl > &BufImpl)
Definition: helpers.cpp:33
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType >::value > runKernelWithoutArg(KernelType KernelName)
Definition: cg_types.hpp:127
std::shared_ptr< LocalAccessorImplHost > LocalAccessorImplPtr
Definition: accessor.hpp:569
CGType
Type of the command group.
Definition: cg_types.hpp:41
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
bool range_size_fits_in_size_t(const range< Dims > &r)
Definition: handler.hpp:424
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2717
decltype(argument_helper< F, SuggestedArgType >(0)) lambda_arg_type
Definition: handler.hpp:211
std::shared_ptr< AccessorImplHost > AccessorImplPtr
Definition: accessor.hpp:513
const char * UnsupportedFeatureToString(UnsupportedGraphFeatures Feature)
Definition: graph.hpp:62
typename merged_properties< LHSPropertiesT, RHSPropertiesT >::type merged_properties_t
Definition: properties.hpp:267
properties< std::tuple< PropertyValueTs... > > properties_t
Definition: properties.hpp:254
void mem_advise(handler &CGH, void *Ptr, size_t NumBytes, int Advice)
executable
In executable state, the graph is ready to execute.
void copy(handler &CGH, const T *Src, T *Dest, size_t Count)
static constexpr bool has_property()
static constexpr auto get_property()
properties< std::tuple<> > empty_properties_t
Definition: properties.hpp:234
void fill(sycl::handler &CGH, T *Ptr, const T &Pattern, size_t Count)
void parallel_for(handler &CGH, range< Dimensions > Range, const KernelType &KernelObj, ReductionsT &&...Reductions)
image_target
Definition: access.hpp:74
signed char __SYCL2020_DEPRECATED
Definition: aliases.hpp:94
ext::intel::pipe< name, dataT, min_capacity > pipe
Definition: pipes.hpp:18
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:64
const void value_type
Definition: multi_ptr.hpp:457
Definition: access.hpp:18
C++ wrapper of extern "C" PI interfaces.
Predicate returning true if all template type parameters except the last one are reductions.
static constexpr const char * getName()
Definition: kernel_desc.hpp:84
A struct to describe the properties of an image.
is_device_copyable is a user specializable class template to indicate that a type T is device copyabl...