DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.hpp
1 //==-------- handler.hpp --- SYCL command group handler --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <sycl/access/access.hpp>
12 #include <sycl/accessor.hpp>
13 #include <sycl/context.hpp>
14 #include <sycl/detail/cg.hpp>
15 #include <sycl/detail/cg_types.hpp>
16 #include <sycl/detail/common.hpp>
18 #include <sycl/detail/export.hpp>
21 #include <sycl/detail/pi.h>
22 #include <sycl/detail/pi.hpp>
24 #include <sycl/detail/string.hpp>
26 #include <sycl/device.hpp>
27 #include <sycl/event.hpp>
28 #include <sycl/exception.hpp>
40 #include <sycl/group.hpp>
41 #include <sycl/id.hpp>
42 #include <sycl/interop_handle.hpp>
43 #include <sycl/item.hpp>
44 #include <sycl/kernel.hpp>
45 #include <sycl/kernel_bundle.hpp>
47 #include <sycl/kernel_handler.hpp>
48 #include <sycl/nd_item.hpp>
49 #include <sycl/nd_range.hpp>
50 #include <sycl/property_list.hpp>
51 #include <sycl/range.hpp>
52 #include <sycl/sampler.hpp>
53 
54 #include <assert.h>
55 #include <functional>
56 #include <memory>
57 #include <stddef.h>
58 #include <stdint.h>
59 #include <string>
60 #include <tuple>
61 #include <type_traits>
62 #include <utility>
63 #include <vector>
64 
65 // TODO: refactor this header
66 // 41(!!!) includes of SYCL headers + 10 includes of standard headers.
67 // 3300+ lines of code
68 
69 // SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision
70 #if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
71 #define __SYCL_NONCONST_FUNCTOR__
72 #endif
73 
74 // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
75 // or const KernelType &KernelFunc
76 #ifdef __SYCL_NONCONST_FUNCTOR__
77 #define _KERNELFUNCPARAMTYPE KernelType
78 #else
79 #define _KERNELFUNCPARAMTYPE const KernelType &
80 #endif
81 #define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a
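// For example (illustrative): in pre-2020 mode,
//   _KERNELFUNCPARAM(KernelFunc)  ->  KernelType KernelFunc        (by value)
// and otherwise
//   _KERNELFUNCPARAM(KernelFunc)  ->  const KernelType &KernelFunc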
82 
83 #if defined(__SYCL_UNNAMED_LAMBDA__)
84 // We can't use nested types (e.g. struct S defined inside main() routine) to
85 // name kernels. At the same time, we have to provide a unique kernel name for
86 // sycl::fill and the only thing we can use to introduce that uniqueness (in
87 // general) is the template parameter T which might be exactly that nested type.
88 // That means we cannot support sycl::fill(void *, T&, size_t) for such types in
89 // general. However, we can do better than that when unnamed lambdas are
90 // enabled, so do it here! See also https://github.com/intel/llvm/issues/469.
91 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
92  sycl::access::target AccessTarget,
93  sycl::access::placeholder IsPlaceholder>
94 using __fill = sycl::detail::auto_name;
95 template <typename T> using __usmfill = sycl::detail::auto_name;
96 template <typename T> using __usmfill2d = sycl::detail::auto_name;
97 template <typename T> using __usmmemcpy2d = sycl::detail::auto_name;
98 
99 template <typename T_Src, typename T_Dst, int Dims,
100  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
101  sycl::access::placeholder IsPlaceholder>
102 using __copyAcc2Ptr = sycl::detail::auto_name;
103 
104 template <typename T_Src, typename T_Dst, int Dims,
105  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
106  sycl::access::placeholder IsPlaceholder>
107 using __copyPtr2Acc = sycl::detail::auto_name;
108 
109 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
110  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
111  sycl::access::mode AccessMode_Dst,
112  sycl::access::target AccessTarget_Dst,
113  sycl::access::placeholder IsPlaceholder_Src,
114  sycl::access::placeholder IsPlaceholder_Dst>
115 using __copyAcc2Acc = sycl::detail::auto_name;
116 #else
117 // Limited fallback path for when unnamed lambdas aren't available. Cannot
118 // handle nested types.
119 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
120  sycl::access::target AccessTarget,
121  sycl::access::placeholder IsPlaceholder>
122 class __fill;
123 template <typename T> class __usmfill;
124 template <typename T> class __usmfill2d;
125 template <typename T> class __usmmemcpy2d;
126 
127 template <typename T_Src, typename T_Dst, int Dims,
128  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
129  sycl::access::placeholder IsPlaceholder>
130 class __copyAcc2Ptr;
131 
132 template <typename T_Src, typename T_Dst, int Dims,
133  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
134  sycl::access::placeholder IsPlaceholder>
135 class __copyPtr2Acc;
136 
137 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
138  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
139  sycl::access::mode AccessMode_Dst,
140  sycl::access::target AccessTarget_Dst,
141  sycl::access::placeholder IsPlaceholder_Src,
142  sycl::access::placeholder IsPlaceholder_Dst>
143 class __copyAcc2Acc;
144 #endif
145 
146 // For unit testing purposes
147 class MockHandler;
148 
149 namespace sycl {
150 inline namespace _V1 {
151 
152 // Forward declaration
153 
154 class handler;
155 template <typename T, int Dimensions, typename AllocatorT, typename Enable>
156 class buffer;
157 
158 namespace ext::intel::experimental {
159 template <class _name, class _dataT, int32_t _min_capacity, class _propertiesT,
160  class>
161 class pipe;
162 }
163 
164 namespace ext::oneapi::experimental::detail {
165 class graph_impl;
166 } // namespace ext::oneapi::experimental::detail
167 namespace detail {
168 
169 class handler_impl;
170 class kernel_impl;
171 class queue_impl;
172 class stream_impl;
173 template <typename DataT, int Dimensions, access::mode AccessMode,
174  access::target AccessTarget, access::placeholder IsPlaceholder>
175 class image_accessor;
176 template <typename RetType, typename Func, typename Arg>
177 static Arg member_ptr_helper(RetType (Func::*)(Arg) const);
178 
179 // Non-const version of the above template to match functors whose 'operator()'
180 // is declared w/o the 'const' qualifier.
181 template <typename RetType, typename Func, typename Arg>
182 static Arg member_ptr_helper(RetType (Func::*)(Arg));
183 
184 // Version with two arguments to handle the case when kernel_handler is passed
185 // to a lambda
186 template <typename RetType, typename Func, typename Arg1, typename Arg2>
187 static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const);
188 
189 // Non-const version of the above template to match functors whose 'operator()'
190 // is declared w/o the 'const' qualifier.
191 template <typename RetType, typename Func, typename Arg1, typename Arg2>
192 static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2));
193 
194 template <typename F, typename SuggestedArgType>
195 decltype(member_ptr_helper(&F::operator())) argument_helper(int);
196 
197 template <typename F, typename SuggestedArgType>
198 SuggestedArgType argument_helper(...);
199 
200 template <typename F, typename SuggestedArgType>
201 using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
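// Example (illustrative sketch of the deduction above): for
//   auto L = [](sycl::item<1> It) { /* ... */ };
// member_ptr_helper matches `void (decltype(L)::*)(sycl::item<1>) const`,
// so lambda_arg_type<decltype(L), sycl::item<1>> is sycl::item<1>. For a
// generic lambda, taking &F::operator() fails to resolve, the variadic
// argument_helper(...) overload is selected, and SuggestedArgType is used.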
202 
203 // Used when parallel_for range is rounded-up.
204 template <typename Name> class __pf_kernel_wrapper;
205 
206 template <typename Type> struct get_kernel_wrapper_name_t {
207  using name = __pf_kernel_wrapper<Type>;
208 };
209 
210 __SYCL_EXPORT device getDeviceFromHandler(handler &);
211 
212 // Checks if a device_global has any registered kernel usage.
213 __SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr);
214 
215 // Extracts a pointer to the value inside a dynamic parameter
216 __SYCL_EXPORT void *getValueFromDynamicParameter(
217  ext::oneapi::experimental::detail::dynamic_parameter_base
218  &DynamicParamBase);
219 
220 #if __SYCL_ID_QUERIES_FIT_IN_INT__
221 template <typename T> struct NotIntMsg;
222 
223 template <int Dims> struct NotIntMsg<range<Dims>> {
224  constexpr static const char *Msg =
225  "Provided range is out of integer limits. Pass "
226  "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
227 };
228 
229 template <int Dims> struct NotIntMsg<id<Dims>> {
230  constexpr static const char *Msg =
231  "Provided offset is out of integer limits. Pass "
232  "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
233 };
234 #endif
235 
236 // Helper for merging properties with ones defined in an optional kernel functor
237 // getter.
238 template <typename KernelType, typename PropertiesT, typename Cond = void>
239 struct GetMergedKernelProperties {
240  using type = PropertiesT;
241 };
242 template <typename KernelType, typename PropertiesT>
243 struct GetMergedKernelProperties<
244  KernelType, PropertiesT,
245  std::enable_if_t<ext::oneapi::experimental::detail::
246  HasKernelPropertiesGetMethod<KernelType>::value>> {
247  using get_method_properties =
248  typename ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
249  KernelType>::properties_t;
250  static_assert(
251  ext::oneapi::experimental::is_property_list<get_method_properties>::value,
252  "get(sycl::ext::oneapi::experimental::properties_tag) member in kernel "
253  "functor class must return a valid property list.");
254  using type = ext::oneapi::experimental::detail::merged_properties_t<
255  PropertiesT, get_method_properties>;
256 };
257 
258 #if __SYCL_ID_QUERIES_FIT_IN_INT__
259 template <typename T, typename ValT>
260 typename std::enable_if_t<std::is_same<ValT, size_t>::value ||
261  std::is_same<ValT, unsigned long long>::value>
262 checkValueRangeImpl(ValT V) {
263  static constexpr size_t Limit =
264  static_cast<size_t>((std::numeric_limits<int>::max)());
265  if (V > Limit)
266  throw sycl::exception(make_error_code(errc::nd_range), NotIntMsg<T>::Msg);
267 }
268 #endif
269 
270 template <int Dims, typename T>
271 typename std::enable_if_t<std::is_same_v<T, range<Dims>> ||
272  std::is_same_v<T, id<Dims>>>
273 checkValueRange(const T &V) {
274 #if __SYCL_ID_QUERIES_FIT_IN_INT__
275  for (size_t Dim = 0; Dim < Dims; ++Dim)
276  checkValueRangeImpl<T>(V[Dim]);
277 
278  {
279  unsigned long long Product = 1;
280  for (size_t Dim = 0; Dim < Dims; ++Dim) {
281  Product *= V[Dim];
282  // check value now to prevent product overflow in the end
283  checkValueRangeImpl<T>(Product);
284  }
285  }
286 #else
287  (void)V;
288 #endif
289 }
290 
291 template <int Dims>
292 void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
293 #if __SYCL_ID_QUERIES_FIT_IN_INT__
294  checkValueRange<Dims>(R);
295  checkValueRange<Dims>(O);
296 
297  for (size_t Dim = 0; Dim < Dims; ++Dim) {
298  unsigned long long Sum = R[Dim] + O[Dim];
299 
300  checkValueRangeImpl<range<Dims>>(Sum);
301  }
302 #else
303  (void)R;
304  (void)O;
305 #endif
306 }
307 
308 template <int Dims, typename T>
309 typename std::enable_if_t<std::is_same_v<T, nd_range<Dims>>>
310 checkValueRange(const T &V) {
311 #if __SYCL_ID_QUERIES_FIT_IN_INT__
312  checkValueRange<Dims>(V.get_global_range());
313  checkValueRange<Dims>(V.get_local_range());
314  checkValueRange<Dims>(V.get_offset());
315 
316  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
317 #else
318  (void)V;
319 #endif
320 }
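// Example (illustrative): with __SYCL_ID_QUERIES_FIT_IN_INT__ enabled,
//   checkValueRange<2>(range<2>{65536, 65536});
// throws: each dimension fits in int, but the running Product (2^32)
// exceeds INT_MAX and trips checkValueRangeImpl.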
321 
322 template <int Dims> class RoundedRangeIDGenerator {
323  id<Dims> Id;
324  id<Dims> InitId;
325  range<Dims> UserRange;
326  range<Dims> RoundedRange;
327  bool Done = false;
328 
329 public:
330  RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
331  const range<Dims> &RoundedRange)
332  : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
333  for (int i = 0; i < Dims; ++i)
334  if (Id[i] >= UserRange[i])
335  Done = true;
336  }
337 
338  explicit operator bool() { return !Done; }
339 
340  void updateId() {
341  for (int i = 0; i < Dims; ++i) {
342  Id[i] += RoundedRange[i];
343  if (Id[i] < UserRange[i])
344  return;
345  Id[i] = InitId[i];
346  }
347  Done = true;
348  }
349 
350  id<Dims> getId() { return Id; }
351 
352  template <typename KernelType> auto getItem() {
353  if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
354  std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
355  return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
356  else {
357  static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
358  std::is_invocable_v<KernelType, item<Dims, false> &,
359  kernel_handler>,
360  "Kernel must be invocable with an item!");
361  return detail::Builder::createItem<Dims, false>(UserRange, getId());
362  }
363  }
364 };
365 
366 // TODO: The wrappers can be optimized further so that the body
367 // essentially looks like this:
368 // for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
369 // for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
370 // for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
371 // KernelFunc({x,y,z});
372 template <typename TransformedArgType, int Dims, typename KernelType>
373 class RoundedRangeKernel {
374 public:
375  range<Dims> UserRange;
376  KernelType KernelFunc;
377  void operator()(item<Dims> It) const {
378  auto RoundedRange = It.get_range();
379  for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
380  Gen.updateId()) {
381  auto item = Gen.template getItem<KernelType>();
382  KernelFunc(item);
383  }
384  }
385 };
386 
387 template <typename TransformedArgType, int Dims, typename KernelType>
388 class RoundedRangeKernelWithKH {
389 public:
390  range<Dims> UserRange;
391  KernelType KernelFunc;
392  void operator()(item<Dims> It, kernel_handler KH) const {
393  auto RoundedRange = It.get_range();
394  for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
395  Gen.updateId()) {
396  auto item = Gen.template getItem<KernelType>();
397  KernelFunc(item, KH);
398  }
399  }
400 };
401 
402 using std::enable_if_t;
403 using sycl::detail::queue_impl;
404 
405 // Computes dst = x * y and returns true if the multiplication
406 // overflowed in T; the product is stored in dst either way.
407 template <typename T>
408 static std::enable_if_t<std::is_unsigned_v<T>, bool>
409 multiply_with_overflow_check(T &dst, T x, T y) {
410  dst = x * y;
411  return (y != 0) && (x > (std::numeric_limits<T>::max)() / y);
412 }
413 
414 template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
415  size_t acc = 1;
416  for (int i = 0; i < Dims; ++i) {
417  bool did_overflow = multiply_with_overflow_check(acc, acc, r[i]);
418  if (did_overflow)
419  return false;
420  }
421  return true;
422 }
423 } // namespace detail
424 
458 class __SYCL_EXPORT handler {
459 private:
464  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);
465 
475  handler(std::shared_ptr<detail::queue_impl> Queue,
476  std::shared_ptr<detail::queue_impl> PrimaryQueue,
477  std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);
478 
485  handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
486 
488  template <typename T, typename F = typename std::remove_const_t<
489  typename std::remove_reference_t<T>>>
490  F *storePlainArg(T &&Arg) {
491  CGData.MArgsStorage.emplace_back(sizeof(T));
492  auto Storage = reinterpret_cast<F *>(CGData.MArgsStorage.back().data());
493  *Storage = Arg;
494  return Storage;
495  }
496 
497  void setType(detail::CG::CGTYPE Type) { MCGType = Type; }
498 
499  detail::CG::CGTYPE getType() { return MCGType; }
500 
501  void throwIfActionIsCreated() {
502  if (detail::CG::None != getType())
503  throw sycl::exception(make_error_code(errc::runtime),
504  "Attempt to set multiple actions for the "
505  "command group. Command group must consist of "
506  "a single kernel or explicit memory operation.");
507  }
508 
509  constexpr static int AccessTargetMask = 0x7ff;
513  template <typename KernelName, typename KernelType>
514  void throwOnLocalAccessorMisuse() const {
515  using NameT =
516  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
517  using KI = sycl::detail::KernelInfo<NameT>;
518 
519  auto *KernelArgs = &KI::getParamDesc(0);
520 
521  for (unsigned I = 0; I < KI::getNumParams(); ++I) {
522  const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
523  const access::target AccTarget =
524  static_cast<access::target>(KernelArgs[I].info & AccessTargetMask);
525  if ((Kind == detail::kernel_param_kind_t::kind_accessor) &&
526  (AccTarget == target::local))
527  throw sycl::exception(
528  make_error_code(errc::kernel_argument),
529  "A local accessor must not be used in a SYCL kernel function "
530  "that is invoked via single_task or via the simple form of "
531  "parallel_for that takes a range parameter.");
532  }
533  }
534 
537  void
538  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
539  const detail::kernel_param_desc_t *KernelArgs,
540  bool IsESIMD);
541 
543  void extractArgsAndReqs();
544 
545  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
546  const int Size, const size_t Index, size_t &IndexShift,
547  bool IsKernelCreatedFromSource, bool IsESIMD);
548 
550  detail::string getKernelName();
551 
552  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
553  // TODO: It is unclear whether the kernel name and the lambda/functor name
554  // must be equal for parallel_for with a sycl::kernel and a lambda/functor
555  // together. For now, if they are equal we extract arguments from the
556  // lambda/functor for the kernel; otherwise set_arg(s) must be used to
557  // resolve the order and values of arguments for the kernel.
558  assert(MKernel && "MKernel is not initialized");
559  const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
560  detail::string KernelName = getKernelName();
561  return KernelName == LambdaName;
562  }
563 
566  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
567 
574  event finalize();
575 
581  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
582  MStreamStorage.push_back(Stream);
583  }
584 
590  void addReduction(const std::shared_ptr<const void> &ReduObj);
591 
597  template <typename T, int Dimensions, typename AllocatorT, typename Enable>
598  void
599  addReduction(const std::shared_ptr<buffer<T, Dimensions, AllocatorT, Enable>>
600  &ReduBuf) {
602  addReduction(std::shared_ptr<const void>(ReduBuf));
603  }
604 
605  ~handler() = default;
606 
607  // TODO: Private and unused. Remove when ABI break is allowed.
608  bool is_host() { return MIsHost; }
609 
610 #ifdef __SYCL_DEVICE_ONLY__
611  // In device compilation, accessor is not inherited from the host base
612  // classes, so we cannot detect it that way. Since we don't expect this to
613  // ever be called in device execution, just use an opaque void *.
614  void associateWithHandler(void *AccBase, access::target AccTarget);
615  void associateWithHandler(void *AccBase, image_target AccTarget);
616 #else
617  void associateWithHandlerCommon(detail::AccessorImplPtr AccImpl,
618  int AccTarget);
619  void associateWithHandler(detail::AccessorBaseHost *AccBase,
620  access::target AccTarget);
621  void associateWithHandler(detail::UnsampledImageAccessorBaseHost *AccBase,
622  image_target AccTarget);
623  void associateWithHandler(detail::SampledImageAccessorBaseHost *AccBase,
624  image_target AccTarget);
625 #endif
626 
627  // Recursively calls itself until arguments pack is fully processed.
628  // The version for a regular (standard-layout) argument.
629  template <typename T, typename... Ts>
630  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
631  set_arg(ArgIndex, std::move(Arg));
632  setArgsHelper(++ArgIndex, std::move(Args)...);
633  }
634 
635  void setArgsHelper(int) {}
636 
637  void setLocalAccessorArgHelper(int ArgIndex,
638  detail::LocalAccessorBaseHost &LocalAccBase) {
639  detail::LocalAccessorImplPtr LocalAccImpl =
640  detail::getSyclObjImpl(LocalAccBase);
641  detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
642  MLocalAccStorage.push_back(std::move(LocalAccImpl));
643  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
644  static_cast<int>(access::target::local), ArgIndex);
645  }
646 
647  // setArgHelper for local accessor argument (legacy accessor interface)
648  template <typename DataT, int Dims, access::mode AccessMode,
649  access::placeholder IsPlaceholder>
650  void setArgHelper(int ArgIndex,
651  accessor<DataT, Dims, AccessMode, access::target::local,
652  IsPlaceholder> &&Arg) {
653  (void)ArgIndex;
654  (void)Arg;
655 #ifndef __SYCL_DEVICE_ONLY__
656  setLocalAccessorArgHelper(ArgIndex, Arg);
657 #endif
658  }
659 
660  // setArgHelper for local accessor argument (up-to-date accessor interface)
661  template <typename DataT, int Dims>
662  void setArgHelper(int ArgIndex, local_accessor<DataT, Dims> &&Arg) {
663  (void)ArgIndex;
664  (void)Arg;
665 #ifndef __SYCL_DEVICE_ONLY__
666  setLocalAccessorArgHelper(ArgIndex, Arg);
667 #endif
668  }
669 
670  // setArgHelper for non local accessor argument.
671  template <typename DataT, int Dims, access::mode AccessMode,
672  access::target AccessTarget, access::placeholder IsPlaceholder>
673  typename std::enable_if_t<AccessTarget != access::target::local, void>
674  setArgHelper(
675  int ArgIndex,
676  accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
677  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
678  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
679  detail::AccessorImplHost *Req = AccImpl.get();
680  // Add accessor to the list of requirements.
681  CGData.MRequirements.push_back(Req);
682  // Store copy of the accessor.
683  CGData.MAccStorage.push_back(std::move(AccImpl));
684  // Add accessor to the list of arguments.
685  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
686  static_cast<int>(AccessTarget), ArgIndex);
687  }
688 
689  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
690  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
691 
692  if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
693  MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
694  sizeof(T), ArgIndex);
695  } else {
696  MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
697  StoredArg, sizeof(T), ArgIndex);
698  }
699  }
700 
701  void setArgHelper(int ArgIndex, sampler &&Arg) {
702  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
703  MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
704  sizeof(sampler), ArgIndex);
705  }
706 
707  // setArgHelper for graph dynamic_parameters
708  template <typename T>
709  void
710  setArgHelper(int ArgIndex,
711  ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParam) {
712  // Extract and copy arg so we can move it into setArgHelper
713  T ArgValue =
714  *static_cast<T *>(detail::getValueFromDynamicParameter(DynamicParam));
715  // Set the arg in the handler as normal
716  setArgHelper(ArgIndex, std::move(ArgValue));
717  // Register the dynamic parameter with the handler for later association
718  // with the node being added
719  registerDynamicParameter(DynamicParam, ArgIndex);
720  }
721 
726  void registerDynamicParameter(
727  ext::oneapi::experimental::detail::dynamic_parameter_base
728  &DynamicParamBase,
729  int ArgIndex);
730 
731  // TODO: Unused. Remove when ABI break is allowed.
732  void verifyKernelInvoc(const kernel &Kernel) {
733  std::ignore = Kernel;
734  return;
735  }
736 
737  /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
738  * its argument. Since esimd plugin directly invokes the kernel (doesn’t use
739  * piKernelSetArg), the kernel argument type must be known to the plugin.
740  * However, passing kernel argument type to the plugin requires changing ABI
741  * in HostKernel class. To overcome this problem, helpers below wrap the
742  * “original” kernel with a functor that always takes an nd_item as argument.
743  * A functor is used instead of a lambda because extractArgsAndReqsFromLambda
744  * needs access to the “original” kernel and keeps references to its internal
745  * data, i.e. the kernel passed as argument cannot be local in scope. The
746  * functor itself is again encapsulated in a std::function since functor’s
747  * type is unknown to the plugin.
748  */
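// Example (illustrative sketch): a user kernel `[](sycl::id<1> I) {...}`
// is wrapped by ResetHostKernel below in a functor equivalent to
//
//   struct Normalized {
//     KernelType MKernelFunc;
//     void operator()(const sycl::nd_item<1> &Arg) {
//       detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
//     }
//   };
//
// so the plugin only ever invokes a `void(const nd_item<Dims> &)` shape.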
749 
750  // For 'id, item w/wo offset, nd_item' kernel arguments
751  template <class KernelType, class NormalizedKernelType, int Dims>
752  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
753  NormalizedKernelType NormalizedKernel(KernelFunc);
754  auto NormalizedKernelFunc =
755  std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
756  auto HostKernelPtr = new detail::HostKernel<decltype(NormalizedKernelFunc),
757  sycl::nd_item<Dims>, Dims>(
758  std::move(NormalizedKernelFunc));
759  MHostKernel.reset(HostKernelPtr);
760  return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
761  ->MKernelFunc;
762  }
763 
764  // For 'sycl::id<Dims>' kernel argument
765  template <class KernelType, typename ArgT, int Dims>
766  std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>, KernelType *>
767  ResetHostKernel(const KernelType &KernelFunc) {
768  struct NormalizedKernelType {
769  KernelType MKernelFunc;
770  NormalizedKernelType(const KernelType &KernelFunc)
771  : MKernelFunc(KernelFunc) {}
772  void operator()(const nd_item<Dims> &Arg) {
773  detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
774  }
775  };
776  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
777  KernelFunc);
778  }
779 
780  // For 'sycl::nd_item<Dims>' kernel argument
781  template <class KernelType, typename ArgT, int Dims>
782  std::enable_if_t<std::is_same_v<ArgT, sycl::nd_item<Dims>>, KernelType *>
783  ResetHostKernel(const KernelType &KernelFunc) {
784  struct NormalizedKernelType {
785  KernelType MKernelFunc;
786  NormalizedKernelType(const KernelType &KernelFunc)
787  : MKernelFunc(KernelFunc) {}
788  void operator()(const nd_item<Dims> &Arg) {
789  detail::runKernelWithArg(MKernelFunc, Arg);
790  }
791  };
792  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
793  KernelFunc);
794  }
795 
796  // For 'sycl::item<Dims, without_offset>' kernel argument
797  template <class KernelType, typename ArgT, int Dims>
798  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, false>>, KernelType *>
799  ResetHostKernel(const KernelType &KernelFunc) {
800  struct NormalizedKernelType {
801  KernelType MKernelFunc;
802  NormalizedKernelType(const KernelType &KernelFunc)
803  : MKernelFunc(KernelFunc) {}
804  void operator()(const nd_item<Dims> &Arg) {
805  sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
806  Arg.get_global_range(), Arg.get_global_id());
807  detail::runKernelWithArg(MKernelFunc, Item);
808  }
809  };
810  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
811  KernelFunc);
812  }
813 
814  // For 'sycl::item<Dims, with_offset>' kernel argument
815  template <class KernelType, typename ArgT, int Dims>
816  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, true>>, KernelType *>
817  ResetHostKernel(const KernelType &KernelFunc) {
818  struct NormalizedKernelType {
819  KernelType MKernelFunc;
820  NormalizedKernelType(const KernelType &KernelFunc)
821  : MKernelFunc(KernelFunc) {}
822  void operator()(const nd_item<Dims> &Arg) {
823  sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
824  Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
825  detail::runKernelWithArg(MKernelFunc, Item);
826  }
827  };
828  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
829  KernelFunc);
830  }
831 
832  // For 'void' kernel argument (single_task)
833  template <class KernelType, typename ArgT, int Dims>
834  typename std::enable_if_t<std::is_same_v<ArgT, void>, KernelType *>
835  ResetHostKernel(const KernelType &KernelFunc) {
836  struct NormalizedKernelType {
837  KernelType MKernelFunc;
838  NormalizedKernelType(const KernelType &KernelFunc)
839  : MKernelFunc(KernelFunc) {}
840  void operator()(const nd_item<Dims> &Arg) {
841  (void)Arg;
842  detail::runKernelWithoutArg(MKernelFunc);
843  }
844  };
845  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
846  KernelFunc);
847  }
848 
849  // For 'sycl::group<Dims>' kernel argument
850  // 'wrapper'-based approach using 'NormalizedKernelType' struct is not used
851  // for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is not
852  // supported in ESIMD.
853  template <class KernelType, typename ArgT, int Dims>
854  std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>, KernelType *>
855  ResetHostKernel(const KernelType &KernelFunc) {
856  MHostKernel.reset(
857  new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
858  return (KernelType *)(MHostKernel->getPtr());
859  }
860 
868  void verifyUsedKernelBundle(const std::string &KernelName) {
869  verifyUsedKernelBundleInternal(detail::string_view{KernelName});
870  }
871  void verifyUsedKernelBundleInternal(detail::string_view KernelName);
872 
879  template <typename KernelName, typename KernelType, int Dims,
880  typename LambdaArgType>
881  void StoreLambda(KernelType KernelFunc) {
882  using KI = detail::KernelInfo<KernelName>;
883  constexpr bool IsCallableWithKernelHandler =
884  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
885  LambdaArgType>::value;
886 
887  if (IsCallableWithKernelHandler && MIsHost) {
888  throw sycl::feature_not_supported(
889  "kernel_handler is not yet supported by host device.",
890  PI_ERROR_INVALID_OPERATION);
891  }
892 
893  KernelType *KernelPtr =
894  ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);
895 
896  constexpr bool KernelHasName =
897  KI::getName() != nullptr && KI::getName()[0] != '\0';
898 
899  // Some host compilers may have different captures from Clang. Currently
900  // there is no stable way of handling this when extracting the captures, so
901  // a static assert is made to fail for incompatible kernel lambdas.
902  static_assert(
903  !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
904  "Unexpected kernel lambda size. This can be caused by an "
905  "external host compiler producing a lambda with an "
906  "unexpected layout. This is a limitation of the compiler."
907  "In many cases the difference is related to capturing constexpr "
908  "variables. In such cases removing constexpr specifier aligns the "
909  "captures between the host compiler and the device compiler."
910  "\n"
911  "In case of MSVC, passing "
912  "-fsycl-host-compiler-options='/std:c++latest' "
913  "might also help.");
914 
915  // Empty name indicates that the compilation happens without integration
916  // header, so don't perform things that require it.
917  if (KernelHasName) {
918  // TODO support ESIMD in no-integration-header case too.
919  MArgs.clear();
920  extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
921  KI::getNumParams(), &KI::getParamDesc(0),
922  KI::isESIMD());
923  MKernelName = KI::getName();
924  } else {
925  // In case w/o the integration header it is necessary to process
926  // accessors from the list(which are associated with this handler) as
927  // arguments. We must copy the associated accessors as they are checked
928  // later during finalize.
929  MArgs = MAssociatedAccesors;
930  }
931 
932  // If the kernel lambda is callable with a kernel_handler argument, manifest
933  // the associated kernel handler.
934  if (IsCallableWithKernelHandler) {
935  getOrInsertHandlerKernelBundle(/*Insert=*/true);
936  }
937  }
938 
942  template <
943  typename KernelName,
944  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
945  void processProperties(PropertiesT Props) {
946  using KI = detail::KernelInfo<KernelName>;
947  static_assert(
948  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
949  "Template type is not a property list.");
950  static_assert(
951  !PropertiesT::template has_property<
952  sycl::ext::intel::experimental::fp_control_key>() ||
953  (PropertiesT::template has_property<
954  sycl::ext::intel::experimental::fp_control_key>() &&
955  KI::isESIMD()),
956  "Floating point control property is supported for ESIMD kernels only.");
957  if constexpr (PropertiesT::template has_property<
958  sycl::ext::intel::experimental::cache_config_key>()) {
959  auto Config = Props.template get_property<
960  sycl::ext::intel::experimental::cache_config_key>();
961  if (Config == sycl::ext::intel::experimental::large_slm) {
962  setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM);
963  } else if (Config == sycl::ext::intel::experimental::large_data) {
964  setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA);
965  }
966  } else {
967  std::ignore = Props;
968  }
969 
970  constexpr bool UsesRootSync = PropertiesT::template has_property<
971  sycl::ext::oneapi::experimental::use_root_sync_key>();
972  setKernelIsCooperative(UsesRootSync);
973  }
974 
979  template <int Dims_Src, int Dims_Dst>
980  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
981  const range<Dims_Dst> Dst) {
982  if (Dims_Src > Dims_Dst)
983  return false;
984  for (size_t I = 0; I < Dims_Src; ++I)
985  if (Src[I] > Dst[I])
986  return false;
987  return true;
988  }
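// Example (illustrative): a 2x8 source copied into a 4x16 destination can
// use the native rectangular-region copy (each source extent fits), while
// a 2x8 source and a 16x4 destination cannot (8 > 4 in the second
// dimension), so the helpers below fall back to a copy kernel.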
989 
995  template <typename TSrc, int DimSrc, access::mode ModeSrc,
996  access::target TargetSrc, typename TDst, int DimDst,
997  access::mode ModeDst, access::target TargetDst,
998  access::placeholder IsPHSrc, access::placeholder IsPHDst>
999  std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
1000  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
1001  accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
1002  if (!MIsHost &&
1003  IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
1004  return false;
1005 
1006  range<1> LinearizedRange(Src.size());
1007  parallel_for<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
1008  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
1009  LinearizedRange, [=](id<1> Id) {
1010  size_t Index = Id[0];
1011  id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
1012  id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
1013  Dst[DstId] = Src[SrcId];
1014  });
1015  return true;
1016  }
1017 
1025  template <typename TSrc, int DimSrc, access::mode ModeSrc,
1026  access::target TargetSrc, typename TDst, int DimDst,
1027  access::mode ModeDst, access::target TargetDst,
1028  access::placeholder IsPHSrc, access::placeholder IsPHDst>
1029  std::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
1030  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
1031  accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
1032  if (!MIsHost)
1033  return false;
1034 
1035  single_task<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
1036  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
1037  [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
1038  return true;
1039  }
1040 
1041 #ifndef __SYCL_DEVICE_ONLY__
1047  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1048  access::target AccTarget, access::placeholder IsPH>
1049  std::enable_if_t<(Dim > 0)>
1050  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
1051  TDst *Dst) {
1052  range<Dim> Range = Src.get_range();
1053  parallel_for<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1054  Range, [=](id<Dim> Index) {
1055  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
1056  using TSrcNonConst = typename std::remove_const_t<TSrc>;
1057  (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
1058  });
1059  }
1060 
1066  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1067  access::target AccTarget, access::placeholder IsPH>
1068  std::enable_if_t<Dim == 0>
1069  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
1070  TDst *Dst) {
1071  single_task<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1072  [=]() {
1073  using TSrcNonConst = typename std::remove_const_t<TSrc>;
1074  *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
1075  });
1076  }
1077 
1082  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1083  access::target AccTarget, access::placeholder IsPH>
1084  std::enable_if_t<(Dim > 0)>
1085  copyPtrToAccHost(TSrc *Src,
1086  accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
1087  range<Dim> Range = Dst.get_range();
1088  parallel_for<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1089  Range, [=](id<Dim> Index) {
1090  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
1091  Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
1092  });
1093  }
1094 
1100  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1101  access::target AccTarget, access::placeholder IsPH>
1102  std::enable_if_t<Dim == 0>
1103  copyPtrToAccHost(TSrc *Src,
1104  accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
1105  single_task<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1106  [=]() {
1107  *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
1108  });
1109  }
1110 #endif // __SYCL_DEVICE_ONLY__
1111 
1112  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
1113  return AccessTarget == access::target::device ||
1114  AccessTarget == access::target::constant_buffer;
1115  }
1116 
1117  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
1118  return AccessTarget == access::target::image ||
1119  AccessTarget == access::target::image_array;
1120  }
1121 
1122  constexpr static bool
1123  isValidTargetForExplicitOp(access::target AccessTarget) {
1124  return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
1125  }
1126 
1127  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
1128  return AccessMode == access::mode::read ||
1129  AccessMode == access::mode::read_write;
1130  }
1131 
1132  constexpr static bool
1133  isValidModeForDestinationAccessor(access::mode AccessMode) {
1134  return AccessMode == access::mode::write ||
1135  AccessMode == access::mode::read_write ||
1136  AccessMode == access::mode::discard_write ||
1137  AccessMode == access::mode::discard_read_write;
1138  }
1139 
1140  // PI APIs only support select fill sizes: 1, 2, 4, 8, 16, 32, 64, 128
1141  constexpr static bool isBackendSupportedFillSize(size_t Size) {
1142  return Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16 ||
1143  Size == 32 || Size == 64 || Size == 128;
1144  }
1145 
1146  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
1147  using type = std::conditional_t<
1148  std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
1149  std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
1150  item<Dims>, LambdaArgType>>;
1151  };
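// Example (illustrative): for a kernel taking sycl::item<1>, `type` is
// item<1>; for a user-defined argument type implicitly convertible from
// item<1> (but not from nd_item<1>), `type` is also item<1>, letting the
// runtime construct an item and convert at the call; if neither nd_item
// nor item converts, LambdaArgType is kept unchanged.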
1152 
1153  std::optional<std::array<size_t, 3>> getMaxWorkGroups();
1154  // We need to use this version to support gcc 7.5.0. Remove when minimal
1155  // supported gcc version is bumped.
1156  std::tuple<std::array<size_t, 3>, bool> getMaxWorkGroups_v2();
1157 
1158  template <int Dims>
1159  std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange) {
1160  range<Dims> RoundedRange = UserRange;
1161  // Disable the rounding-up optimizations under these conditions:
1162  // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
1163  // 2. The kernel is provided via an interoperability method (this uses a
1164  // different code path).
1165  // 3. The range is already a multiple of the rounding factor.
1166  //
1167  // Cases 2 and 3 could be supported with extra effort.
1168  // As an optimization for the common case it is an
1169  // implementation choice to not support those scenarios.
1170  // Note that "this_item" is a free function, i.e. not tied to any
1171  // specific id or item. When concurrent parallel_fors are executing
1172  // on a device it is difficult to tell which parallel_for the call is
1173  // being made from. One could replicate portions of the
1174  // call-graph to make this_item calls kernel-specific but this is
1175  // not considered worthwhile.
1176 
1177  // Bail out early if range rounding has been disabled.
1178  if (this->DisableRangeRounding())
1179  return {range<Dims>{}, false};
1180 
1181  // Range should be a multiple of this for reasonable performance.
1182  size_t MinFactorX = 16;
1183  // Range should be a multiple of this for improved performance.
1184  size_t GoodFactor = 32;
1185  // Range should be at least this to make rounding worthwhile.
1186  size_t MinRangeX = 1024;
1187 
1188  // Check if rounding parameters have been set through environment:
1189  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1190  this->GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);
1191 
1192  // In SYCL, each dimension of a global range size is specified by
1193  // a size_t, which can be up to 64 bits. All backends should be
1194  // able to accept a kernel launch with a 32-bit global range size
1195  // (i.e. do not throw an error). The OpenCL CPU backend will
1196  // accept every 64-bit global range, but the GPU backends will not
1197  // generally accept every 64-bit global range. So, when we get a
1198  // non-32-bit global range, we wrap the old kernel in a new kernel
1199  // that has each work item perform multiple invocations of the old
1200  // kernel in a 32-bit global range.
1201  id<Dims> MaxNWGs = [&] {
1202  auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2();
1203  if (!HasMaxWGs) {
1204  id<Dims> Default;
1205  for (int i = 0; i < Dims; ++i)
1206  Default[i] = (std::numeric_limits<int32_t>::max)();
1207  return Default;
1208  }
1209 
1210  id<Dims> IdResult;
1211  size_t Limit = (std::numeric_limits<int>::max)();
1212  for (int i = 0; i < Dims; ++i)
1213  IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
1214  return IdResult;
1215  }();
1216  auto M = (std::numeric_limits<int>::max)();
1217  range<Dims> MaxRange;
1218  for (int i = 0; i < Dims; ++i) {
1219  auto DesiredSize = MaxNWGs[i] * GoodFactor;
1220  MaxRange[i] =
1221  DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
1222  }
1223 
1224  bool DidAdjust = false;
1225  auto Adjust = [&](int Dim, size_t Value) {
1226  if (this->RangeRoundingTrace())
1227  std::cout << "parallel_for range adjusted at dim " << Dim << " from "
1228  << RoundedRange[Dim] << " to " << Value << std::endl;
1229  RoundedRange[Dim] = Value;
1230  DidAdjust = true;
1231  };
1232 
1233 #ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
1234  size_t GoodExpFactor = 1;
1235  switch (Dims) {
1236  case 1:
1237  GoodExpFactor = 32; // Make global range multiple of {32}
1238  break;
1239  case 2:
1240  GoodExpFactor = 16; // Make global range multiple of {16, 16}
1241  break;
1242  case 3:
1243  GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
1244  break;
1245  }
1246 
1247  // Check if rounding parameters have been set through environment:
1248  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1249  this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);
1250 
1251  for (auto i = 0; i < Dims; ++i)
1252  if (UserRange[i] % GoodExpFactor) {
1253  Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
1254  }
1255 #else
1256  // Perform range rounding if there are sufficient work-items to
1257  // need rounding and the user-specified range is not a multiple of
1258  // a "good" value.
1259  if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
1260  // It is sufficient to round up just the first dimension.
1261  // Multiplying the rounded-up value of the first dimension
1262  // by the values of the remaining dimensions (if any)
1263  // will yield a rounded-up value for the total range.
1264  Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
1265  }
1266 #endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
1267 #ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1268  // If we are forcing range rounding kernels to be used, we always want the
1269  // rounded range kernel to be generated, even if rounding isn't needed
1270  DidAdjust = true;
1271 #endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1272 
1273  for (int i = 0; i < Dims; ++i)
1274  if (RoundedRange[i] > MaxRange[i])
1275  Adjust(i, MaxRange[i]);
1276 
1277  if (!DidAdjust)
1278  return {range<Dims>{}, false};
1279  return {RoundedRange, true};
1280  }
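// Example (illustrative): with the defaults above (MinFactorX = 16,
// GoodFactor = 32, MinRangeX = 1024), a 1-D user range of {2022} satisfies
// 2022 % 16 != 0 && 2022 >= 1024, so it is rounded up to {2048}; the
// wrapper kernel then lets the 26 extra work-items do nothing, per the
// RoundedRangeIDGenerator above.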
1281 
1293  template <
1294  typename KernelName, typename KernelType, int Dims,
1295  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1296  void parallel_for_lambda_impl(range<Dims> UserRange, PropertiesT Props,
1297  KernelType KernelFunc) {
1298  throwIfActionIsCreated();
1299  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1300  if (!range_size_fits_in_size_t(UserRange))
1301  throw sycl::exception(make_error_code(errc::runtime),
1302  "The total number of work-items in "
1303  "a range must fit within size_t");
1304 
1305  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1306 
1307  // If 1D kernel argument is an integral type, convert it to sycl::item<1>
1308  // If user type is convertible from sycl::item/sycl::nd_item, use
1309  // sycl::item/sycl::nd_item to transport item information
1310  using TransformedArgType = std::conditional_t<
1311  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
1312  typename TransformUserItemType<Dims, LambdaArgType>::type>;
1313 
1314  static_assert(!std::is_same_v<TransformedArgType, sycl::nd_item<Dims>>,
1315  "Kernel argument cannot have a sycl::nd_item type in "
1316  "sycl::parallel_for with sycl::range");
1317 
1318  static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
1319  std::is_convertible_v<item<Dims, false>, LambdaArgType>,
1320  "sycl::parallel_for(sycl::range) kernel must have the "
1321  "first argument of sycl::item type, or of a type which is "
1322  "implicitly convertible from sycl::item");
1323 
1324  using RefLambdaArgType = std::add_lvalue_reference_t<LambdaArgType>;
1325  static_assert(
1326  (std::is_invocable_v<KernelType, RefLambdaArgType> ||
1327  std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
1328  "SYCL kernel lambda/functor has an unexpected signature, it should be "
1329  "invocable with sycl::item and optionally sycl::kernel_handler");
1330 
1331  // TODO: Properties may change the kernel function, so in order to avoid
1332  // conflicts they should be included in the name.
1333  using NameT =
1334  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1335 
1336  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1337 
1338  // Range rounding can be disabled by the user.
1339  // Range rounding is not done on the host device.
1340  // Range rounding is supported only for newer SYCL standards.
1341 #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \
1342  !defined(DPCPP_HOST_DEVICE_OPENMP) && \
1343  !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
1344  auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange);
1345  if (HasRoundedRange) {
1346  using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
1347  auto Wrapper =
1348  getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
1349  KernelFunc, UserRange);
1350 
1351  using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
1352  decltype(Wrapper), NameWT>;
1353 
1354  kernel_parallel_for_wrapper<KName, TransformedArgType, decltype(Wrapper),
1355  PropertiesT>(Wrapper);
1356 #ifndef __SYCL_DEVICE_ONLY__
1357  // We are executing over the rounded range, but there are still
1358  // items/ids that are constructed in the range-rounded kernel that
1359  // use items/ids in the user range, which means that
1360  // __SYCL_ASSUME_INT can still be violated. So check the bounds
1361  // of the user range, instead of the rounded range.
1362  detail::checkValueRange<Dims>(UserRange);
1363  MNDRDesc.set(RoundedRange);
1364  StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
1365  std::move(Wrapper));
1366  setType(detail::CG::Kernel);
1367  setNDRangeUsed(false);
1368 #endif
1369  } else
1370 #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
1371  // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
1372  // SYCL_LANGUAGE_VERSION >= 202001
1373  {
1374  (void)UserRange;
1375  (void)Props;
1376 #ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1377  // If parallel_for range rounding is forced then only range rounded
1378  // kernel is generated
1379  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1380  PropertiesT>(KernelFunc);
1381 #ifndef __SYCL_DEVICE_ONLY__
1382  processProperties<NameT, PropertiesT>(Props);
1383  detail::checkValueRange<Dims>(UserRange);
1384  MNDRDesc.set(std::move(UserRange));
1385  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1386  std::move(KernelFunc));
1387  setType(detail::CG::Kernel);
1388  setNDRangeUsed(false);
1389 #endif
1390 #else
1391  (void)KernelFunc;
1392 #endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
1393  }
1394  }
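// Example (illustrative) of the public API that funnels into
// parallel_for_lambda_impl above, assuming a queue `q`, a buffer `buf` of
// 1000 floats, and a hypothetical kernel name FillIota:
//
//   q.submit([&](sycl::handler &cgh) {
//     sycl::accessor acc{buf, cgh, sycl::write_only};
//     cgh.parallel_for<class FillIota>(
//         sycl::range<1>{1000},
//         [=](sycl::item<1> it) { acc[it] = it.get_linear_id(); });
//   });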
1395 
1409  template <typename KernelName, typename KernelType, int Dims,
1410  typename PropertiesT>
1411  void parallel_for_impl(nd_range<Dims> ExecutionRange, PropertiesT Props,
1412  _KERNELFUNCPARAM(KernelFunc)) {
1413  throwIfActionIsCreated();
1414  // TODO: Properties may change the kernel function, so in order to avoid
1415  // conflicts they should be included in the name.
1416  using NameT =
1417  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1418  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1419  using LambdaArgType =
1420  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1421  static_assert(
1422  std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
1423  "Kernel argument of a sycl::parallel_for with sycl::nd_range "
1424  "must be either sycl::nd_item or be convertible from sycl::nd_item");
1425  using TransformedArgType = sycl::nd_item<Dims>;
1426 
1427  (void)ExecutionRange;
1428  (void)Props;
1429  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1430  PropertiesT>(KernelFunc);
1431 #ifndef __SYCL_DEVICE_ONLY__
1432  processProperties<NameT, PropertiesT>(Props);
1433  detail::checkValueRange<Dims>(ExecutionRange);
1434  MNDRDesc.set(std::move(ExecutionRange));
1435  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1436  std::move(KernelFunc));
1437  setType(detail::CG::Kernel);
1438  setNDRangeUsed(true);
1439 #endif
1440  }
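// Example (illustrative) of the nd_range form handled above; the local
// range must evenly divide the global range:
//
//   cgh.parallel_for<class NdKernel>(
//       sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}},
//       [=](sycl::nd_item<1> it) {
//         size_t Gid = it.get_global_id(0);
//         /* ... use Gid ... */
//       });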
1441 
1449  template <int Dims>
1450  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
1451  throwIfActionIsCreated();
1452  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1453  detail::checkValueRange<Dims>(NumWorkItems);
1454  MNDRDesc.set(std::move(NumWorkItems));
1455  setType(detail::CG::Kernel);
1456  setNDRangeUsed(false);
1457  extractArgsAndReqs();
1458  MKernelName = getKernelName();
1459  }
1460 
1471  template <
1472  typename KernelName, typename KernelType, int Dims,
1473  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1474  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1475  PropertiesT Props,
1476  _KERNELFUNCPARAM(KernelFunc)) {
1477  throwIfActionIsCreated();
1478  // TODO: Properties may change the kernel function, so in order to avoid
1479  // conflicts they should be included in the name.
1480  using NameT =
1481  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1482  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1483  using LambdaArgType =
1484  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1485  (void)NumWorkGroups;
1486  (void)Props;
1487  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1488  PropertiesT>(KernelFunc);
1489 #ifndef __SYCL_DEVICE_ONLY__
1490  processProperties<NameT, PropertiesT>(Props);
1491  detail::checkValueRange<Dims>(NumWorkGroups);
1492  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1493  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1494  setType(detail::CG::Kernel);
1495  setNDRangeUsed(false);
1496 #endif // __SYCL_DEVICE_ONLY__
1497  }
1498 
1511  template <
1512  typename KernelName, typename KernelType, int Dims,
1513  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1514  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1515  range<Dims> WorkGroupSize,
1516  PropertiesT Props,
1517  _KERNELFUNCPARAM(KernelFunc)) {
1518  throwIfActionIsCreated();
1519  // TODO: Properties may change the kernel function, so in order to avoid
1520  // conflicts they should be included in the name.
1521  using NameT =
1522  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1523  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1524  using LambdaArgType =
1525  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1526  (void)NumWorkGroups;
1527  (void)WorkGroupSize;
1528  (void)Props;
1529  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1530  PropertiesT>(KernelFunc);
1531 #ifndef __SYCL_DEVICE_ONLY__
1532  processProperties<NameT, PropertiesT>(Props);
1533  nd_range<Dims> ExecRange =
1534  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1535  detail::checkValueRange<Dims>(ExecRange);
1536  MNDRDesc.set(std::move(ExecRange));
1537  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1538  setType(detail::CG::Kernel);
1539 #endif // __SYCL_DEVICE_ONLY__
1540  }
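// Example (illustrative) of hierarchical parallelism routed through the
// overload above, launching 8 work-groups of 64 work-items each:
//
//   cgh.parallel_for_work_group<class HierKernel>(
//       sycl::range<1>{8}, sycl::range<1>{64},
//       [=](sycl::group<1> g) {
//         g.parallel_for_work_item([&](sycl::h_item<1> i) {
//           /* per-work-item code */
//         });
//       });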
1541 
1542 #ifdef SYCL_LANGUAGE_VERSION
1543 #define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
1544 #else
1545 #define __SYCL_KERNEL_ATTR__
1546 #endif
1547 
1548  // NOTE: the name of this function - "kernel_single_task" - is used by the
1549  // Front End to determine kernel invocation kind.
1550  template <typename KernelName, typename KernelType, typename... Props>
1551 #ifdef __SYCL_DEVICE_ONLY__
1552  [[__sycl_detail__::add_ir_attributes_function(
1553  "sycl-single-task",
1555  nullptr,
1557 #endif
1559  kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
1560 #ifdef __SYCL_DEVICE_ONLY__
1561  KernelFunc();
1562 #else
1563  (void)KernelFunc;
1564 #endif
1565  }
1566 
1567  // NOTE: the name of this function - "kernel_single_task" - is used by the
1568  // Front End to determine kernel invocation kind.
1569  template <typename KernelName, typename KernelType, typename... Props>
1570 #ifdef __SYCL_DEVICE_ONLY__
1571  [[__sycl_detail__::add_ir_attributes_function(
1572  "sycl-single-task",
1574  nullptr,
1576 #endif
1578  kernel_single_task(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
1579 #ifdef __SYCL_DEVICE_ONLY__
1580  KernelFunc(KH);
1581 #else
1582  (void)KernelFunc;
1583  (void)KH;
1584 #endif
1585  }
1586 
1587  // NOTE: the names of these functions - "kernel_parallel_for" - are used by
1588  // the Front End to determine kernel invocation kind.
1589  template <typename KernelName, typename ElementType, typename KernelType,
1590  typename... Props>
1591 #ifdef __SYCL_DEVICE_ONLY__
1592  [[__sycl_detail__::add_ir_attributes_function(
1593  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1594  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1595 #endif
1596  __SYCL_KERNEL_ATTR__ void
1597  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
1598 #ifdef __SYCL_DEVICE_ONLY__
1599  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1600 #else
1601  (void)KernelFunc;
1602 #endif
1603  }
1604 
1605  // NOTE: the names of these functions - "kernel_parallel_for" - are used by
1606  // the Front End to determine kernel invocation kind.
1607  template <typename KernelName, typename ElementType, typename KernelType,
1608  typename... Props>
1609 #ifdef __SYCL_DEVICE_ONLY__
1610  [[__sycl_detail__::add_ir_attributes_function(
1611  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1612  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1613 #endif
1614  __SYCL_KERNEL_ATTR__ void
1615  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
1616 #ifdef __SYCL_DEVICE_ONLY__
1617  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1618 #else
1619  (void)KernelFunc;
1620  (void)KH;
1621 #endif
1622  }
1623 
1624  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1625  // used by the Front End to determine kernel invocation kind.
1626  template <typename KernelName, typename ElementType, typename KernelType,
1627  typename... Props>
1628 #ifdef __SYCL_DEVICE_ONLY__
1629  [[__sycl_detail__::add_ir_attributes_function(
1630  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1631  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1632 #endif
1633  __SYCL_KERNEL_ATTR__ void
1634  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc)) {
1635 #ifdef __SYCL_DEVICE_ONLY__
1636  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1637 #else
1638  (void)KernelFunc;
1639 #endif
1640  }
1641 
1642  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1643  // used by the Front End to determine kernel invocation kind.
1644  template <typename KernelName, typename ElementType, typename KernelType,
1645  typename... Props>
1646 #ifdef __SYCL_DEVICE_ONLY__
1647  [[__sycl_detail__::add_ir_attributes_function(
1648  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1649  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1650 #endif
1651  __SYCL_KERNEL_ATTR__ void
1652  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc),
1653  kernel_handler KH) {
1654 #ifdef __SYCL_DEVICE_ONLY__
1655  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1656 #else
1657  (void)KernelFunc;
1658  (void)KH;
1659 #endif
1660  }
1661 
1662  template <typename... Props> struct KernelPropertiesUnpackerImpl {
1663  // Just pass extra Props... as template parameters to the underlying
1664  // Caller->* member functions. Don't have reflection so try to use
1665  // templates as much as possible to reduce the amount of boilerplate code
1666  // needed. All the type checks are expected to be done at the Caller's
1667  // methods side.
1668 
1669  template <typename... TypesToForward, typename... ArgsTy>
1670  static void kernel_single_task_unpack(handler *h, ArgsTy... Args) {
1671  h->kernel_single_task<TypesToForward..., Props...>(Args...);
1672  }
1673 
1674  template <typename... TypesToForward, typename... ArgsTy>
1675  static void kernel_parallel_for_unpack(handler *h, ArgsTy... Args) {
1676  h->kernel_parallel_for<TypesToForward..., Props...>(Args...);
1677  }
1678 
1679  template <typename... TypesToForward, typename... ArgsTy>
1680  static void kernel_parallel_for_work_group_unpack(handler *h,
1681  ArgsTy... Args) {
1682  h->kernel_parallel_for_work_group<TypesToForward..., Props...>(Args...);
1683  }
1684  };
1685 
1686  template <typename PropertiesT>
1687  struct KernelPropertiesUnpacker : public KernelPropertiesUnpackerImpl<> {
1688  // This should always fail outside the specialization below but must be
1689  // dependent to avoid failing even if not instantiated.
1690  static_assert(
1691  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
1692  "Template type is not a property list.");
1693  };
1694 
1695  template <typename... Props>
1696  struct KernelPropertiesUnpacker<
1697  ext::oneapi::experimental::detail::properties_t<Props...>>
1698  : public KernelPropertiesUnpackerImpl<Props...> {};
1699 
1700  // Helper function to
1701  //
1702  // * Make use of the KernelPropertiesUnpacker above
1703  // * Decide if we need an extra kernel_handler parameter
1704  //
1705  // The interface uses a \p Lambda callback to propagate that information back
1706  // to the caller as we need the caller to communicate:
1707  //
1708  // * Name of the method to call
1709  // * Provide explicit template type parameters for the call
1710  //
1711  // Couldn't think of a better way to achieve both.
1712  template <typename KernelName, typename KernelType, typename PropertiesT,
1713  bool HasKernelHandlerArg, typename FuncTy>
1714  void unpack(_KERNELFUNCPARAM(KernelFunc), FuncTy Lambda) {
1715 #ifdef __SYCL_DEVICE_ONLY__
1716  detail::CheckDeviceCopyable<KernelType>();
1717 #endif // __SYCL_DEVICE_ONLY__
1718  using MergedPropertiesT =
1719  typename detail::GetMergedKernelProperties<KernelType,
1720  PropertiesT>::type;
1721  using Unpacker = KernelPropertiesUnpacker<MergedPropertiesT>;
1722 #ifndef __SYCL_DEVICE_ONLY__
1723  // If there are properties provided by get method then process them.
1724  if constexpr (ext::oneapi::experimental::detail::
1725  HasKernelPropertiesGetMethod<
1726  _KERNELFUNCPARAMTYPE>::value) {
1727  processProperties<KernelName>(
1728  KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
1729  }
1730 #endif
1731  if constexpr (HasKernelHandlerArg) {
1732  kernel_handler KH;
1733  Lambda(Unpacker{}, this, KernelFunc, KH);
1734  } else {
1735  Lambda(Unpacker{}, this, KernelFunc);
1736  }
1737  }
1738 
1739  // NOTE: to support kernel_handler argument in kernel lambdas, only
1740  // kernel_***_wrapper functions must be called in this code
1741 
1742  template <
1743  typename KernelName, typename KernelType,
1744  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1745  void kernel_single_task_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1746  unpack<KernelName, KernelType, PropertiesT,
1747         detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>(
1748      KernelFunc, [&](auto Unpacker, auto... args) {
1749  Unpacker.template kernel_single_task_unpack<KernelName, KernelType>(
1750  args...);
1751  });
1752  }
1753 
1754  template <
1755  typename KernelName, typename ElementType, typename KernelType,
1756  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1757  void kernel_parallel_for_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1758  unpack<KernelName, KernelType, PropertiesT,
1759  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1760  ElementType>::value>(
1761  KernelFunc, [&](auto Unpacker, auto... args) {
1762  Unpacker.template kernel_parallel_for_unpack<KernelName, ElementType,
1763  KernelType>(args...);
1764  });
1765  }
1766 
1767  template <
1768  typename KernelName, typename ElementType, typename KernelType,
1769  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1770  void kernel_parallel_for_work_group_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1771  unpack<KernelName, KernelType, PropertiesT,
1772  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1773  ElementType>::value>(
1774  KernelFunc, [&](auto Unpacker, auto... args) {
1775  Unpacker.template kernel_parallel_for_work_group_unpack<
1776  KernelName, ElementType, KernelType>(args...);
1777  });
1778  }
1779 
1787  template <
1788  typename KernelName, typename KernelType,
1789  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1790  void single_task_lambda_impl(PropertiesT Props,
1791  _KERNELFUNCPARAM(KernelFunc)) {
1792  (void)Props;
1793  throwIfActionIsCreated();
1794  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1795  // TODO: Properties may change the kernel function, so in order to avoid
1796  // conflicts they should be included in the name.
1797  using NameT =
1798      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1799  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1800  kernel_single_task_wrapper<NameT, KernelType, PropertiesT>(KernelFunc);
1801 #ifndef __SYCL_DEVICE_ONLY__
1802  // No need to check if range is out of INT_MAX limits as it's compile-time
1803  // known constant.
1804  MNDRDesc.set(range<1>{1});
1805  processProperties<NameT, PropertiesT>(Props);
1806  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1807  setType(detail::CG::Kernel);
1808 #endif
1809  }
1810 
1811  void setStateExplicitKernelBundle();
1812  void setStateSpecConstSet();
1813  bool isStateExplicitKernelBundle() const;
1814 
1815  std::shared_ptr<detail::kernel_bundle_impl>
1816  getOrInsertHandlerKernelBundle(bool Insert) const;
1817 
1818  void setHandlerKernelBundle(kernel Kernel);
1819 
1820  void setHandlerKernelBundle(
1821  const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);
1822 
1823  template <typename FuncT>
1824  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1825  void()>::value ||
1826  detail::check_fn_signature<std::remove_reference_t<FuncT>,
1827  void(interop_handle)>::value>
1828  host_task_impl(FuncT &&Func) {
1829  throwIfActionIsCreated();
1830 
1831  MNDRDesc.set(range<1>(1));
1832  // Need to copy these rather than move so that we can check associated
1833  // accessors during finalize
1834  MArgs = MAssociatedAccesors;
1835 
1836  MHostTask.reset(new detail::HostTask(std::move(Func)));
1837 
1838  setType(detail::CG::CodeplayHostTask);
1839  }
1840 
1844  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
1845  getCommandGraph() const;
1846 
1853  void setUserFacingNodeType(ext::oneapi::experimental::node_type Type);
1854 
1855 public:
1856  handler(const handler &) = delete;
1857  handler(handler &&) = delete;
1858  handler &operator=(const handler &) = delete;
1859  handler &operator=(handler &&) = delete;
1860 
1861  template <auto &SpecName>
1862  void set_specialization_constant(
1863      typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {
1864 
1865  setStateSpecConstSet();
1866 
1867  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1868  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1869 
1870  detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1871      KernelBundleImplPtr)
1872      .set_specialization_constant<SpecName>(Value);
1873  }
1874 
1875  template <auto &SpecName>
1876  typename std::remove_reference_t<decltype(SpecName)>::value_type
1877  get_specialization_constant() const {
1878 
1879  if (isStateExplicitKernelBundle())
1881  "Specialization constants cannot be read after "
1882  "explicitly setting the used kernel bundle");
1883 
1884  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1885  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1886 
1887  return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1888      KernelBundleImplPtr)
1889      .get_specialization_constant<SpecName>();
1890  }
1891 
1892  void
1893  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
1894 
1903  template <typename DataT, int Dims, access::mode AccMode,
1904            access::target AccTarget, access::placeholder isPlaceholder>
1905  void require(accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder> Acc) {
1906  if (Acc.is_placeholder())
1907  associateWithHandler(&Acc, AccTarget);
1908  }
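  // Illustrative usage sketch (not part of this header): a placeholder
  // accessor is created without a handler and must be bound with require()
  // before a kernel in the same command group may use it. Assuming a buffer
  // Buf and a queue Q:
  //
  //   sycl::accessor<int, 1, sycl::access::mode::read_write,
  //                  sycl::access::target::device,
  //                  sycl::access::placeholder::true_t>
  //       PAcc(Buf);
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.require(PAcc);
  //     CGH.parallel_for<class UsePAcc>(PAcc.get_range(),
  //                                     [=](sycl::id<1> I) { PAcc[I] += 1; });
  //   });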
1909 
1919  template <typename DataT, int Dims, access::mode AccMode,
1920            access::target AccTarget, access::placeholder isPlaceholder>
1921  void require(ext::oneapi::experimental::dynamic_parameter<
1922                   accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder>>
1923                   dynamicParamAcc) {
1924  using AccT = accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder>;
1925  AccT Acc = *static_cast<AccT *>(
1926  detail::getValueFromDynamicParameter(dynamicParamAcc));
1927  if (Acc.is_placeholder())
1928  associateWithHandler(&Acc, AccTarget);
1929  }
1930 
1934  void depends_on(event Event);
1935 
1939  void depends_on(const std::vector<event> &Events);
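  // Illustrative usage sketch (not part of this header): depends_on() adds an
  // explicit execution dependency between command groups. Assuming a queue Q:
  //
  //   sycl::event Produce = Q.submit([&](sycl::handler &CGH) { /* ... */ });
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.depends_on(Produce); // runs only after Produce completes
  //     /* ... */
  //   });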
1940 
1941  template <typename T>
1942  using remove_cv_ref_t = typename std::remove_cv_t<std::remove_reference_t<T>>;
1943 
1944  template <typename U, typename T>
1945  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;
1946 
1947  template <typename T> struct ShouldEnableSetArg {
1948  static constexpr bool value =
1949  std::is_trivially_copyable_v<std::remove_reference_t<T>>
1950 #if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
1951  && std::is_standard_layout<std::remove_reference_t<T>>::value
1952 #endif
1953  || is_same_type<sampler, T>::value // Sampler
1954  || (!is_same_type<cl_mem, T>::value &&
1955      std::is_pointer_v<remove_cv_ref_t<T>>) // USM
1956  || is_same_type<cl_mem, T>::value; // Interop
1957  };
1958 
1965  template <typename T>
1966  typename std::enable_if_t<ShouldEnableSetArg<T>::value, void>
1967  set_arg(int ArgIndex, T &&Arg) {
1968  setArgHelper(ArgIndex, std::move(Arg));
1969  }
1970 
1971  template <typename DataT, int Dims, access::mode AccessMode,
1972            access::target AccessTarget, access::placeholder IsPlaceholder>
1973  void
1974  set_arg(int ArgIndex,
1975          accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
1976  setArgHelper(ArgIndex, std::move(Arg));
1977  }
1978 
1979  template <typename DataT, int Dims>
1980  void set_arg(int ArgIndex, local_accessor<DataT, Dims> Arg) {
1981  setArgHelper(ArgIndex, std::move(Arg));
1982  }
1983 
1984  // set_arg for graph dynamic_parameters
1985  template <typename T>
1986  void set_arg(int argIndex,
1987               ext::oneapi::experimental::dynamic_parameter<T> &dynamicParam) {
1988  setArgHelper(argIndex, dynamicParam);
1989  }
1990 
1996  template <typename... Ts> void set_args(Ts &&...Args) {
1997  setArgsHelper(0, std::move(Args)...);
1998  }
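  // Illustrative usage sketch (not part of this header): set_arg()/set_args()
  // supply arguments to a kernel object that cannot capture them, e.g. one
  // obtained from a kernel_bundle. Assuming a queue Q, a sycl::kernel K, and
  // an accessor Acc:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_arg(0, Acc);
  //     CGH.set_arg(1, 42);
  //     CGH.parallel_for(sycl::range<1>{N}, K);
  //   });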
1999 
2007  template <typename KernelName = detail::auto_name, typename KernelType>
2008  void single_task(_KERNELFUNCPARAM(KernelFunc)) {
2009  single_task_lambda_impl<KernelName>(
2010      ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2011  }
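  // Illustrative usage sketch (not part of this header): single_task()
  // enqueues exactly one invocation of the kernel. Assuming a queue Q and a
  // USM pointer Ptr:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.single_task<class InitFirst>([=] { Ptr[0] = 42; });
  //   });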
2012 
2013  template <typename KernelName = detail::auto_name, typename KernelType>
2014  void parallel_for(range<1> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2015  parallel_for_lambda_impl<KernelName>(
2016      NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2017      std::move(KernelFunc));
2018  }
2019 
2020  template <typename KernelName = detail::auto_name, typename KernelType>
2021  void parallel_for(range<2> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2022  parallel_for_lambda_impl<KernelName>(
2023      NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2024      std::move(KernelFunc));
2025  }
2026 
2027  template <typename KernelName = detail::auto_name, typename KernelType>
2028  void parallel_for(range<3> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2029  parallel_for_lambda_impl<KernelName>(
2030      NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2031      std::move(KernelFunc));
2032  }
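  // Illustrative usage sketch (not part of this header): the range overloads
  // launch one work-item per element of the range. Assuming a queue Q and a
  // USM pointer Data of at least N elements:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class Scale>(sycl::range<1>{N},
  //                                   [=](sycl::id<1> I) { Data[I] *= 2; });
  //   });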
2033 
2035  template <typename FuncT>
2036  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
2037  void()>::value ||
2038  detail::check_fn_signature<std::remove_reference_t<FuncT>,
2039  void(interop_handle)>::value>
2040  host_task(FuncT &&Func) {
2041  host_task_impl(Func);
2042  }
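  // Illustrative usage sketch (not part of this header): host_task() runs on
  // the host as part of the dependency graph; the interop_handle overload
  // additionally exposes native backend objects. Assuming a queue Q:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.host_task([=](sycl::interop_handle IH) {
  //       // query native backend handles through IH here
  //     });
  //   });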
2043 
2057  template <typename KernelName = detail::auto_name, typename KernelType,
2058  int Dims>
2059  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
2060  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2061  _KERNELFUNCPARAM(KernelFunc)) {
2062  throwIfActionIsCreated();
2063  using NameT =
2064      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2065  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2066  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2067  using TransformedArgType = std::conditional_t<
2068  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
2069  typename TransformUserItemType<Dims, LambdaArgType>::type>;
2070  (void)NumWorkItems;
2071  (void)WorkItemOffset;
2072  kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
2073 #ifndef __SYCL_DEVICE_ONLY__
2074  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2075  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2076  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
2077  std::move(KernelFunc));
2078  setType(detail::CG::Kernel);
2079  setNDRangeUsed(false);
2080 #endif
2081  }
2082 
2093  template <typename KernelName = detail::auto_name, typename KernelType,
2094  int Dims>
2095  void parallel_for_work_group(range<Dims> NumWorkGroups,
2096                               _KERNELFUNCPARAM(KernelFunc)) {
2097  parallel_for_work_group_lambda_impl<KernelName>(
2098      NumWorkGroups, ext::oneapi::experimental::empty_properties_t{},
2099      KernelFunc);
2100  }
2101 
2114  template <typename KernelName = detail::auto_name, typename KernelType,
2115  int Dims>
2116  void parallel_for_work_group(range<Dims> NumWorkGroups,
2117                               range<Dims> WorkGroupSize,
2118                               _KERNELFUNCPARAM(KernelFunc)) {
2119  parallel_for_work_group_lambda_impl<KernelName>(
2120  NumWorkGroups, WorkGroupSize,
2121      ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2122  }
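  // Illustrative usage sketch (not part of this header): hierarchical
  // parallelism runs the outer lambda once per work-group and the inner one
  // once per work-item. Assuming a queue Q:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group<class Hier>(
  //         sycl::range<1>{NumGroups}, sycl::range<1>{GroupSize},
  //         [=](sycl::group<1> G) {
  //           G.parallel_for_work_item([&](sycl::h_item<1> Item) {
  //             /* per-work-item code */
  //           });
  //         });
  //   });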
2123 
2130  void single_task(kernel Kernel) {
2131  throwIfActionIsCreated();
2132  // Ignore any set kernel bundles and use the one associated with the kernel
2133  setHandlerKernelBundle(Kernel);
2134  // No need to check if range is out of INT_MAX limits as it's compile-time
2135  // known constant
2136  MNDRDesc.set(range<1>{1});
2137  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2138  setType(detail::CG::Kernel);
2139  extractArgsAndReqs();
2140  MKernelName = getKernelName();
2141  }
2142 
2143  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
2144  parallel_for_impl(NumWorkItems, Kernel);
2145  }
2146 
2147  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
2148  parallel_for_impl(NumWorkItems, Kernel);
2149  }
2150 
2151  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
2152  parallel_for_impl(NumWorkItems, Kernel);
2153  }
2154 
2163  template <int Dims>
2164  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2165  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2166  kernel Kernel) {
2167  throwIfActionIsCreated();
2168  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2169  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2170  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2171  setType(detail::CG::Kernel);
2172  setNDRangeUsed(false);
2173  extractArgsAndReqs();
2174  MKernelName = getKernelName();
2175  }
2176 
2185  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
2186  throwIfActionIsCreated();
2187  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2188  detail::checkValueRange<Dims>(NDRange);
2189  MNDRDesc.set(std::move(NDRange));
2190  setType(detail::CG::Kernel);
2191  setNDRangeUsed(true);
2192  extractArgsAndReqs();
2193  MKernelName = getKernelName();
2194  }
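  // Illustrative usage sketch (not part of this header): launching a
  // pre-compiled kernel object over an nd_range. Assuming a queue Q and a
  // sycl::kernel K obtained from an executable kernel_bundle:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_args(/* arguments matching K's signature */);
  //     CGH.parallel_for(sycl::nd_range<1>{{N}, {WorkGroupSize}}, K);
  //   });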
2195 
2202  template <typename KernelName = detail::auto_name, typename KernelType>
2203  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
2204  throwIfActionIsCreated();
2205  // Ignore any set kernel bundles and use the one associated with the kernel
2206  setHandlerKernelBundle(Kernel);
2207  using NameT =
2208      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2209  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2210  (void)Kernel;
2211  kernel_single_task<NameT>(KernelFunc);
2212 #ifndef __SYCL_DEVICE_ONLY__
2213  // No need to check if range is out of INT_MAX limits as it's compile-time
2214  // known constant
2215  MNDRDesc.set(range<1>{1});
2216  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2217  setType(detail::CG::Kernel);
2218  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2219  extractArgsAndReqs();
2220  MKernelName = getKernelName();
2221  } else
2222  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
2223 #else
2224  detail::CheckDeviceCopyable<KernelType>();
2225 #endif
2226  }
2227 
2235  template <typename KernelName = detail::auto_name, typename KernelType,
2236  int Dims>
2237  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2238  _KERNELFUNCPARAM(KernelFunc)) {
2239  throwIfActionIsCreated();
2240  // Ignore any set kernel bundles and use the one associated with the kernel
2241  setHandlerKernelBundle(Kernel);
2242  using NameT =
2243      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2244  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2245  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2246  (void)Kernel;
2247  (void)NumWorkItems;
2248  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2249 #ifndef __SYCL_DEVICE_ONLY__
2250  detail::checkValueRange<Dims>(NumWorkItems);
2251  MNDRDesc.set(std::move(NumWorkItems));
2252  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2253  setType(detail::CG::Kernel);
2254  setNDRangeUsed(false);
2255  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2256  extractArgsAndReqs();
2257  MKernelName = getKernelName();
2258  } else
2259  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2260  std::move(KernelFunc));
2261 #endif
2262  }
2263 
2273  template <typename KernelName = detail::auto_name, typename KernelType,
2274  int Dims>
2275  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2276  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2277  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2278  throwIfActionIsCreated();
2279  // Ignore any set kernel bundles and use the one associated with the kernel
2280  setHandlerKernelBundle(Kernel);
2281  using NameT =
2282      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2283  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2284  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2285  (void)Kernel;
2286  (void)NumWorkItems;
2287  (void)WorkItemOffset;
2288  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2289 #ifndef __SYCL_DEVICE_ONLY__
2290  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2291  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2292  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2293  setType(detail::CG::Kernel);
2294  setNDRangeUsed(false);
2295  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2296  extractArgsAndReqs();
2297  MKernelName = getKernelName();
2298  } else
2299  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2300  std::move(KernelFunc));
2301 #endif
2302  }
2303 
2313  template <typename KernelName = detail::auto_name, typename KernelType,
2314  int Dims>
2315  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2316  _KERNELFUNCPARAM(KernelFunc)) {
2317  throwIfActionIsCreated();
2318  // Ignore any set kernel bundles and use the one associated with the kernel
2319  setHandlerKernelBundle(Kernel);
2320  using NameT =
2321      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2322  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2323  using LambdaArgType =
2324  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2325  (void)Kernel;
2326  (void)NDRange;
2327  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2328 #ifndef __SYCL_DEVICE_ONLY__
2329  detail::checkValueRange<Dims>(NDRange);
2330  MNDRDesc.set(std::move(NDRange));
2331  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2332  setType(detail::CG::Kernel);
2333  setNDRangeUsed(true);
2334  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2335  extractArgsAndReqs();
2336  MKernelName = getKernelName();
2337  } else
2338  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2339  std::move(KernelFunc));
2340 #endif
2341  }
2342 
2356  template <typename KernelName = detail::auto_name, typename KernelType,
2357  int Dims>
2358  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2359  _KERNELFUNCPARAM(KernelFunc)) {
2360  throwIfActionIsCreated();
2361  // Ignore any set kernel bundles and use the one associated with the kernel
2362  setHandlerKernelBundle(Kernel);
2363  using NameT =
2364      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2365  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2366  using LambdaArgType =
2367  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2368  (void)Kernel;
2369  (void)NumWorkGroups;
2370  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2371 #ifndef __SYCL_DEVICE_ONLY__
2372  detail::checkValueRange<Dims>(NumWorkGroups);
2373  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2374  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2375  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2376  setType(detail::CG::Kernel);
2377 #endif // __SYCL_DEVICE_ONLY__
2378  }
2379 
2395  template <typename KernelName = detail::auto_name, typename KernelType,
2396  int Dims>
2397  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2398                               range<Dims> WorkGroupSize,
2399                               _KERNELFUNCPARAM(KernelFunc)) {
2400  throwIfActionIsCreated();
2401  // Ignore any set kernel bundles and use the one associated with the kernel
2402  setHandlerKernelBundle(Kernel);
2403  using NameT =
2404      typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2405  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2406  using LambdaArgType =
2407  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2408  (void)Kernel;
2409  (void)NumWorkGroups;
2410  (void)WorkGroupSize;
2411  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2412 #ifndef __SYCL_DEVICE_ONLY__
2413  nd_range<Dims> ExecRange =
2414  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2415  detail::checkValueRange<Dims>(ExecRange);
2416  MNDRDesc.set(std::move(ExecRange));
2417  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2418  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2419  setType(detail::CG::Kernel);
2420 #endif // __SYCL_DEVICE_ONLY__
2421  }
2422 
2423  template <typename KernelName = detail::auto_name, typename KernelType,
2424  typename PropertiesT>
2425  std::enable_if_t<
2426      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2427  single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) {
2428  single_task_lambda_impl<KernelName, KernelType, PropertiesT>(Props,
2429  KernelFunc);
2430  }
2431 
2432  template <typename KernelName = detail::auto_name, typename KernelType,
2433  typename PropertiesT>
2434  std::enable_if_t<
2435      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2436  parallel_for(range<1> NumWorkItems, PropertiesT Props,
2437  _KERNELFUNCPARAM(KernelFunc)) {
2438  parallel_for_lambda_impl<KernelName, KernelType, 1, PropertiesT>(
2439  NumWorkItems, Props, std::move(KernelFunc));
2440  }
2441 
2442  template <typename KernelName = detail::auto_name, typename KernelType,
2443  typename PropertiesT>
2444  std::enable_if_t<
2445      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2446  parallel_for(range<2> NumWorkItems, PropertiesT Props,
2447  _KERNELFUNCPARAM(KernelFunc)) {
2448  parallel_for_lambda_impl<KernelName, KernelType, 2, PropertiesT>(
2449  NumWorkItems, Props, std::move(KernelFunc));
2450  }
2451 
2452  template <typename KernelName = detail::auto_name, typename KernelType,
2453  typename PropertiesT>
2454  std::enable_if_t<
2455      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2456  parallel_for(range<3> NumWorkItems, PropertiesT Props,
2457  _KERNELFUNCPARAM(KernelFunc)) {
2458  parallel_for_lambda_impl<KernelName, KernelType, 3, PropertiesT>(
2459  NumWorkItems, Props, std::move(KernelFunc));
2460  }
2461 
2462  template <typename KernelName = detail::auto_name, typename KernelType,
2463  typename PropertiesT, int Dims>
2464  std::enable_if_t<
2465      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2466  parallel_for(nd_range<Dims> Range, PropertiesT Properties,
2467  _KERNELFUNCPARAM(KernelFunc)) {
2468  parallel_for_impl<KernelName>(Range, Properties, std::move(KernelFunc));
2469  }
2470 
2472 
2473  template <typename KernelName = detail::auto_name, typename PropertiesT,
2474  typename... RestT>
2475  std::enable_if_t<
2476  (sizeof...(RestT) > 1) &&
2477  detail::AreAllButLastReductions<RestT...>::value &&
2478      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2479  parallel_for(range<1> Range, PropertiesT Properties, RestT &&...Rest) {
2480  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2481  UnsupportedGraphFeatures::sycl_reductions>();
2482  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2483  std::forward<RestT>(Rest)...);
2484  }
2485 
2486  template <typename KernelName = detail::auto_name, typename PropertiesT,
2487  typename... RestT>
2488  std::enable_if_t<
2489  (sizeof...(RestT) > 1) &&
2490  detail::AreAllButLastReductions<RestT...>::value &&
2491      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2492  parallel_for(range<2> Range, PropertiesT Properties, RestT &&...Rest) {
2493  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2494  UnsupportedGraphFeatures::sycl_reductions>();
2495  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2496  std::forward<RestT>(Rest)...);
2497  }
2498 
2499  template <typename KernelName = detail::auto_name, typename PropertiesT,
2500  typename... RestT>
2501  std::enable_if_t<
2502  (sizeof...(RestT) > 1) &&
2503  detail::AreAllButLastReductions<RestT...>::value &&
2504      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2505  parallel_for(range<3> Range, PropertiesT Properties, RestT &&...Rest) {
2506  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2507  UnsupportedGraphFeatures::sycl_reductions>();
2508  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2509  std::forward<RestT>(Rest)...);
2510  }
2511 
2512  template <typename KernelName = detail::auto_name, typename... RestT>
2513  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2514  parallel_for(range<1> Range, RestT &&...Rest) {
2515  parallel_for<KernelName>(Range,
2516                           ext::oneapi::experimental::empty_properties_t{},
2517                           std::forward<RestT>(Rest)...);
2518  }
2519 
2520  template <typename KernelName = detail::auto_name, typename... RestT>
2521  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2522  parallel_for(range<2> Range, RestT &&...Rest) {
2523  parallel_for<KernelName>(Range,
2524                           ext::oneapi::experimental::empty_properties_t{},
2525                           std::forward<RestT>(Rest)...);
2526  }
2527 
2528  template <typename KernelName = detail::auto_name, typename... RestT>
2529  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2530  parallel_for(range<3> Range, RestT &&...Rest) {
2531  parallel_for<KernelName>(Range,
2532                           ext::oneapi::experimental::empty_properties_t{},
2533                           std::forward<RestT>(Rest)...);
2534  }
2535 
2536  template <typename KernelName = detail::auto_name, int Dims,
2537  typename PropertiesT, typename... RestT>
2538  std::enable_if_t<
2539  (sizeof...(RestT) > 1) &&
2540  detail::AreAllButLastReductions<RestT...>::value &&
2541      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2542  parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
2543  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2544  UnsupportedGraphFeatures::sycl_reductions>();
2545  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2546  std::forward<RestT>(Rest)...);
2547  }
2548 
2549  template <typename KernelName = detail::auto_name, int Dims,
2550  typename... RestT>
2551  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2552  parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
2553  parallel_for<KernelName>(Range,
2554                           ext::oneapi::experimental::empty_properties_t{},
2555                           std::forward<RestT>(Rest)...);
2556  }
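  // Illustrative usage sketch (not part of this header): in the reduction
  // overloads, the trailing RestT arguments are one or more reductions
  // followed by the kernel. Assuming a queue Q and USM pointers Data and Sum:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class SumKernel>(
  //         sycl::range<1>{N}, sycl::reduction(Sum, sycl::plus<int>()),
  //         [=](sycl::id<1> I, auto &R) { R += Data[I]; });
  //   });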
2557 
2559 
2560  template <typename KernelName = detail::auto_name, typename KernelType,
2561  int Dims, typename PropertiesT>
2562  void parallel_for_work_group(range<Dims> NumWorkGroups, PropertiesT Props,
2563  _KERNELFUNCPARAM(KernelFunc)) {
2564  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2565  PropertiesT>(NumWorkGroups, Props,
2566  KernelFunc);
2567  }
2568 
2569  template <typename KernelName = detail::auto_name, typename KernelType,
2570  int Dims, typename PropertiesT>
2571  void parallel_for_work_group(range<Dims> NumWorkGroups,
2572                               range<Dims> WorkGroupSize, PropertiesT Props,
2573  _KERNELFUNCPARAM(KernelFunc)) {
2574  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2575  PropertiesT>(
2576  NumWorkGroups, WorkGroupSize, Props, KernelFunc);
2577  }
2578 
2579  // Clean up KERNELFUNC macro.
2580 #undef _KERNELFUNCPARAM
2581 
2582  // Explicit copy operations API
2583 
2591  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2592  access::target AccessTarget,
2593            access::placeholder IsPlaceholder = access::placeholder::false_t>
2594  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2595            std::shared_ptr<T_Dst> Dst) {
2596  if (Src.is_placeholder())
2597  checkIfPlaceholderIsBoundToHandler(Src);
2598 
2599  throwIfActionIsCreated();
2600  static_assert(isValidTargetForExplicitOp(AccessTarget),
2601  "Invalid accessor target for the copy method.");
2602  static_assert(isValidModeForSourceAccessor(AccessMode),
2603  "Invalid accessor mode for the copy method.");
2604  // Make sure the data the shared_ptr points to is not released until we
2605  // finish working with it.
2606  CGData.MSharedPtrStorage.push_back(Dst);
2607  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2608  copy(Src, RawDstPtr);
2609  }
2610 
2618  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2619  access::target AccessTarget,
2620            access::placeholder IsPlaceholder = access::placeholder::false_t>
2621  void
2622  copy(std::shared_ptr<T_Src> Src,
2623       accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2624  if (Dst.is_placeholder())
2625  checkIfPlaceholderIsBoundToHandler(Dst);
2626 
2627  throwIfActionIsCreated();
2628  static_assert(isValidTargetForExplicitOp(AccessTarget),
2629  "Invalid accessor target for the copy method.");
2630  static_assert(isValidModeForDestinationAccessor(AccessMode),
2631  "Invalid accessor mode for the copy method.");
2632  // TODO: Add static_assert with is_device_copyable when vec is
2633  // device-copyable.
2634  // Make sure the data the shared_ptr points to is not released until we
2635  // finish working with it.
2636  CGData.MSharedPtrStorage.push_back(Src);
2637  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2638  copy(RawSrcPtr, Dst);
2639  }
2640 
2648  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2649  access::target AccessTarget,
2650            access::placeholder IsPlaceholder = access::placeholder::false_t>
2651  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2652            T_Dst *Dst) {
2653  if (Src.is_placeholder())
2654  checkIfPlaceholderIsBoundToHandler(Src);
2655 
2656  throwIfActionIsCreated();
2657  static_assert(isValidTargetForExplicitOp(AccessTarget),
2658  "Invalid accessor target for the copy method.");
2659  static_assert(isValidModeForSourceAccessor(AccessMode),
2660  "Invalid accessor mode for the copy method.");
2661 #ifndef __SYCL_DEVICE_ONLY__
2662  if (MIsHost) {
2663  // TODO: Temporary implementation for host. Should be handled by memory
2664  // manager.
2665  copyAccToPtrHost(Src, Dst);
2666  return;
2667  }
2668 #endif
2669  setType(detail::CG::CopyAccToPtr);
2670 
2671  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2672  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2673 
2674  CGData.MRequirements.push_back(AccImpl.get());
2675  MSrcPtr = static_cast<void *>(AccImpl.get());
2676  MDstPtr = static_cast<void *>(Dst);
2677  // Store a copy of the accessor in local storage to make sure it stays
2678  // alive until we finish.
2679  CGData.MAccStorage.push_back(std::move(AccImpl));
2680  }
2681 
2689  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2690  access::target AccessTarget,
2691            access::placeholder IsPlaceholder = access::placeholder::false_t>
2692  void
2693  copy(const T_Src *Src,
2694       accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2695  if (Dst.is_placeholder())
2696  checkIfPlaceholderIsBoundToHandler(Dst);
2697 
2698  throwIfActionIsCreated();
2699  static_assert(isValidTargetForExplicitOp(AccessTarget),
2700  "Invalid accessor target for the copy method.");
2701  static_assert(isValidModeForDestinationAccessor(AccessMode),
2702  "Invalid accessor mode for the copy method.");
2703  // TODO: Add static_assert with is_device_copyable when vec is
2704  // device-copyable.
2705 #ifndef __SYCL_DEVICE_ONLY__
2706  if (MIsHost) {
2707  // TODO: Temporary implementation for host. Should be handled by memory
2708  // manager.
2709  copyPtrToAccHost(Src, Dst);
2710  return;
2711  }
2712 #endif
2713  setType(detail::CG::CopyPtrToAcc);
2714 
2715  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2716  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2717 
2718  CGData.MRequirements.push_back(AccImpl.get());
2719  MSrcPtr = const_cast<T_Src *>(Src);
2720  MDstPtr = static_cast<void *>(AccImpl.get());
2721  // Store a copy of the accessor in local storage to make sure it stays
2722  // alive until we finish.
2723  CGData.MAccStorage.push_back(std::move(AccImpl));
2724  }
2725 
2733  template <
2734  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2735  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2736  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2737            access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2738            access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2739  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2740  IsPlaceholder_Src>
2741  Src,
2742  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2743  IsPlaceholder_Dst>
2744  Dst) {
2745  if (Src.is_placeholder())
2746  checkIfPlaceholderIsBoundToHandler(Src);
2747  if (Dst.is_placeholder())
2748  checkIfPlaceholderIsBoundToHandler(Dst);
2749 
2750  throwIfActionIsCreated();
2751  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2752  "Invalid source accessor target for the copy method.");
2753  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2754  "Invalid destination accessor target for the copy method.");
2755  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2756  "Invalid source accessor mode for the copy method.");
2757  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2758  "Invalid destination accessor mode for the copy method.");
2759  if (Dst.get_size() < Src.get_size())
2760  throw sycl::invalid_object_error(
2761  "The destination accessor size is too small to copy the memory into.",
2762  PI_ERROR_INVALID_OPERATION);
2763 
2764  if (copyAccToAccHelper(Src, Dst))
2765  return;
2766  setType(detail::CG::CopyAccToAcc);
2767 
2768  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2769  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2770 
2771  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2772  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2773 
2774  CGData.MRequirements.push_back(AccImplSrc.get());
2775  CGData.MRequirements.push_back(AccImplDst.get());
2776  MSrcPtr = AccImplSrc.get();
2777  MDstPtr = AccImplDst.get();
2778  // Store a copy of the accessor in local storage to make sure it stays
2779  // alive until we finish.
2780  CGData.MAccStorage.push_back(std::move(AccImplSrc));
2781  CGData.MAccStorage.push_back(std::move(AccImplDst));
2782  }
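  // Illustrative usage sketch (not part of this header): copying between two
  // buffers through accessors. Assuming a queue Q and buffers SrcBuf and
  // DstBuf where DstBuf is at least as large as SrcBuf:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Src = SrcBuf.get_access<sycl::access::mode::read>(CGH);
  //     auto Dst = DstBuf.get_access<sycl::access::mode::write>(CGH);
  //     CGH.copy(Src, Dst);
  //   });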
2783 
2788  template <typename T, int Dims, access::mode AccessMode,
2789  access::target AccessTarget,
2790            access::placeholder IsPlaceholder = access::placeholder::false_t>
2791  void
2792  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2793  if (Acc.is_placeholder())
2794  checkIfPlaceholderIsBoundToHandler(Acc);
2795 
2796  throwIfActionIsCreated();
2797  static_assert(isValidTargetForExplicitOp(AccessTarget),
2798  "Invalid accessor target for the update_host method.");
2799  setType(detail::CG::UpdateHost);
2800 
2801  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2802  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2803 
2804  MDstPtr = static_cast<void *>(AccImpl.get());
2805  CGData.MRequirements.push_back(AccImpl.get());
2806  CGData.MAccStorage.push_back(std::move(AccImpl));
2807  }
2808 
2809 public:
2818  template <typename T, int Dims, access::mode AccessMode,
2819  access::target AccessTarget,
2820            access::placeholder IsPlaceholder = access::placeholder::false_t,
2821            typename PropertyListT = property_list>
2822  void
2823  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2824           Dst,
2825  const T &Pattern) {
2826  assert(!MIsHost && "fill() should no longer be callable on a host device.");
2827 
2828  if (Dst.is_placeholder())
2829  checkIfPlaceholderIsBoundToHandler(Dst);
2830 
2831  throwIfActionIsCreated();
2832  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2833  // TODO add check:T must be an integral scalar value or a SYCL vector type
2834  static_assert(isValidTargetForExplicitOp(AccessTarget),
2835  "Invalid accessor target for the fill method.");
2836  // CG::Fill will result in piEnqueueFillBuffer/Image which requires that the
2837  // mem data is contiguous. Thus we check range and offset when dim > 1.
2838  // Images don't allow ranged accessors and are fine.
2839  if constexpr (isBackendSupportedFillSize(sizeof(T)) &&
2840  ((Dims <= 1) || isImageOrImageArray(AccessTarget))) {
2841  StageFillCG(Dst, Pattern);
2842  } else if constexpr (Dims == 0) {
2843  // Special case for zero-dim accessors.
2844  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2845  range<1>(1), [=](id<1>) { Dst = Pattern; });
2846  } else {
2847  // Dim > 1
2848  bool OffsetUsable = (Dst.get_offset() == sycl::id<Dims>{});
2849  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2850  bool RangesUsable =
2851  (AccBase->getAccessRange() == AccBase->getMemoryRange());
2852  if (OffsetUsable && RangesUsable &&
2853  isBackendSupportedFillSize(sizeof(T))) {
2854  StageFillCG(Dst, Pattern);
2855  } else {
2856  range<Dims> Range = Dst.get_range();
2857  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2858  Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
2859  }
2860  }
2861  }
2862 
2869  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2870  throwIfActionIsCreated();
2871  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2872  static_assert(is_device_copyable<T>::value,
2873  "Pattern must be device copyable");
2874  parallel_for<__usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2875  T *CastedPtr = static_cast<T *>(Ptr);
2876  CastedPtr[Index] = Pattern;
2877  });
2878  }
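  // Illustrative usage sketch (not part of this header): filling a USM
  // allocation with a pattern. Assuming a queue Q:
  //
  //   float *P = sycl::malloc_device<float>(N, Q);
  //   Q.submit([&](sycl::handler &CGH) { CGH.fill(P, 1.0f, N); });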
2879 
2883  void ext_oneapi_barrier() {
2884  throwIfActionIsCreated();
2885  setType(detail::CG::Barrier);
2886  }
2887 
2894  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2895 
2906  void memcpy(void *Dest, const void *Src, size_t Count);
2907 
2918  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2919  this->memcpy(Dest, Src, Count * sizeof(T));
2920  }
2921 
2929  void memset(void *Dest, int Value, size_t Count);
2930 
2937  void prefetch(const void *Ptr, size_t Count);
2938 
2945  void mem_advise(const void *Ptr, size_t Length, int Advice);
2946 
2963  template <typename T = unsigned char,
2964  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2965  void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src,
2966  size_t SrcPitch, size_t Width, size_t Height);
2967 
2981  template <typename T>
2982  void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest,
2983  size_t DestPitch, size_t Width, size_t Height);
2984 
3000  template <typename T = unsigned char,
3001  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
3002  void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value,
3003  size_t Width, size_t Height);
3004 
3017  template <typename T>
3018  void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern,
3019  size_t Width, size_t Height);
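  // Illustrative usage sketch (not part of this header): the 2D operations
  // address pitched (row-padded) memory; ext_oneapi_memcpy2d and
  // ext_oneapi_memset2d take pitches and sizes in bytes, while the typed
  // ext_oneapi_copy2d and ext_oneapi_fill2d take them in elements of T.
  // Assuming a queue Q and pitched USM allocations Src and Dst:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.ext_oneapi_copy2d(Src, SrcPitch, Dst, DstPitch, Width, Height);
  //   });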
3020 
3029  template <typename T, typename PropertyListT>
3030  void memcpy(ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3031              const void *Src, size_t NumBytes = sizeof(T),
3032  size_t DestOffset = 0) {
3033  throwIfGraphAssociated<
3034  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3035  sycl_ext_oneapi_device_global>();
3036  if (sizeof(T) < DestOffset + NumBytes)
3038  "Copy to device_global is out of bounds.");
3039 
3040  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3041      ext::oneapi::experimental::device_image_scope_key>();
3042 
3043  if (!detail::isDeviceGlobalUsedInKernel(&Dest)) {
3044  // If the corresponding device_global isn't used in any kernels, we fall
3045  // back to doing the memory operation on host-only.
3046  memcpyToHostOnlyDeviceGlobal(&Dest, Src, sizeof(T), IsDeviceImageScoped,
3047  NumBytes, DestOffset);
3048  return;
3049  }
3050 
3051  memcpyToDeviceGlobal(&Dest, Src, IsDeviceImageScoped, NumBytes, DestOffset);
3052  }
3053 
3062  template <typename T, typename PropertyListT>
3063  void
3064  memcpy(void *Dest,
3065         const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3066         size_t NumBytes = sizeof(T), size_t SrcOffset = 0) {
3067  throwIfGraphAssociated<
3068  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3069  sycl_ext_oneapi_device_global>();
3070  if (sizeof(T) < SrcOffset + NumBytes)
3072  "Copy from device_global is out of bounds.");
3073 
3074  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3075      ext::oneapi::experimental::device_image_scope_key>();
3076 
3077  if (!detail::isDeviceGlobalUsedInKernel(&Src)) {
3078  // If the corresponding device_global isn't used in any kernels, we fall
3079  // back to doing the memory operation on host-only.
3080  memcpyFromHostOnlyDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3081  SrcOffset);
3082  return;
3083  }
3084 
3085  memcpyFromDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3086  SrcOffset);
3087  }
3088 
3098  template <typename T, typename PropertyListT>
3099  void copy(const std::remove_all_extents_t<T> *Src,
3100            ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3101            size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3102  size_t StartIndex = 0) {
3103  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3104  StartIndex * sizeof(std::remove_all_extents_t<T>));
3105  }
3106 
3117  template <typename T, typename PropertyListT>
3118  void
3119  copy(const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3120       std::remove_all_extents_t<T> *Dest,
3121  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3122  size_t StartIndex = 0) {
3123  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3124  StartIndex * sizeof(std::remove_all_extents_t<T>));
3125  }
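  // Illustrative usage sketch (not part of this header): copying to and from
  // a device_global. Assuming a namespace-scope declaration and a queue Q:
  //
  //   sycl::ext::oneapi::experimental::device_global<int> DG;
  //   int In = 42, Out = 0;
  //   Q.submit([&](sycl::handler &CGH) { CGH.copy(&In, DG); }).wait();
  //   Q.submit([&](sycl::handler &CGH) { CGH.copy(DG, &Out); }).wait();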
3129  void ext_oneapi_graph(ext::oneapi::experimental::command_graph<
3130      ext::oneapi::experimental::graph_state::executable>
3131      Graph);
3132 
3141  void ext_oneapi_copy(
3142      void *Src, ext::oneapi::experimental::image_mem_handle Dest,
3143      const ext::oneapi::experimental::image_descriptor &DestImgDesc);
3144 
3165  void ext_oneapi_copy(
3166  void *Src, sycl::range<3> SrcOffset, sycl::range<3> SrcExtent,
3167      ext::oneapi::experimental::image_mem_handle Dest,
3168      sycl::range<3> DestOffset,
3169      const ext::oneapi::experimental::image_descriptor &DestImgDesc,
3170      sycl::range<3> CopyExtent);
3171 
3181  void ext_oneapi_copy(
3182      ext::oneapi::experimental::image_mem_handle Src, void *Dest,
3183      const ext::oneapi::experimental::image_descriptor &SrcImgDesc);
3184 
3206  void
3207  ext_oneapi_copy(ext::oneapi::experimental::image_mem_handle Src,
3208  sycl::range<3> SrcOffset,
3209      const ext::oneapi::experimental::image_descriptor &SrcImgDesc,
3210      void *Dest, sycl::range<3> DestOffset,
3211  sycl::range<3> DestExtent, sycl::range<3> CopyExtent);
3212 
3223  void ext_oneapi_copy(
3224  void *Src, void *Dest,
3225  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3226  size_t DeviceRowPitch);
3227 
3250  void ext_oneapi_copy(
3251  void *Src, sycl::range<3> SrcOffset, void *Dest,
3252  sycl::range<3> DestOffset,
3253  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3254  size_t DeviceRowPitch, sycl::range<3> HostExtent,
3255  sycl::range<3> CopyExtent);
3256 
3261  void ext_oneapi_wait_external_semaphore(
3262      sycl::ext::oneapi::experimental::interop_semaphore_handle
3263          SemaphoreHandle);
3264 
3270  void ext_oneapi_signal_external_semaphore(
3271      sycl::ext::oneapi::experimental::interop_semaphore_handle
3272          SemaphoreHandle);
3273 
3274 private:
3275  std::shared_ptr<detail::handler_impl> MImpl;
3276  std::shared_ptr<detail::queue_impl> MQueue;
3277 
3282  mutable detail::CG::StorageInitHelper CGData;
3283  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
3284  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
3286  std::vector<detail::ArgDesc> MArgs;
3290  std::vector<detail::ArgDesc> MAssociatedAccesors;
3292  detail::NDRDescT MNDRDesc;
3293  detail::string MKernelName;
3295  std::shared_ptr<detail::kernel_impl> MKernel;
3301  void *MSrcPtr = nullptr;
3303  void *MDstPtr = nullptr;
3305  size_t MLength = 0;
3307  std::vector<char> MPattern;
3309  std::unique_ptr<detail::HostKernelBase> MHostKernel;
3311  std::unique_ptr<detail::HostTask> MHostTask;
3314  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
3315 
3317  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;
3320  std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
3321  MExecGraph;
3323  std::shared_ptr<ext::oneapi::experimental::detail::node_impl> MSubgraphNode;
3325  std::unique_ptr<detail::CG> MGraphNodeCG;
3326 
3327  bool MIsHost = false;
3328 
3329  detail::code_location MCodeLoc = {};
3330  bool MIsFinalized = false;
3331  event MLastEvent;
3332 
3333  // Make queue_impl class friend to be able to call finalize method.
3334  friend class detail::queue_impl;
3335  // Make accessor class friend to keep the list of associated accessors.
3336  template <typename DataT, int Dims, access::mode AccMode,
3338  typename PropertyListT>
3339  friend class accessor;
3341 
3342  template <typename DataT, int Dimensions, access::mode AccessMode,
3343            access::target AccessTarget, access::placeholder IsPlaceholder>
3344  friend class detail::image_accessor;
3345  // Make stream class friend to be able to keep the list of associated streams
3346  friend class stream;
3347  friend class detail::stream_impl;
3348  // Make reduction friends to store buffers and arrays created for it
3349  // in handler from reduction methods.
3350  template <typename T, class BinaryOperation, int Dims, size_t Extent,
3351  bool ExplicitIdentity, typename RedOutVar>
3352  friend class detail::reduction_impl_algo;
3353 
3354  friend inline void detail::reduction::finalizeHandler(handler &Handler);
3355  template <class FunctorTy>
3356  friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);
3357 
3358  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3359  typename PropertiesT, typename... RestT>
3360  friend void detail::reduction_parallel_for(handler &CGH, range<Dims> Range,
3361                                             PropertiesT Properties,
3362  RestT... Rest);
3363 
3364  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3365  typename PropertiesT, typename... RestT>
3366  friend void
3367  detail::reduction_parallel_for(handler &CGH, nd_range<Dims> Range,
3368                                 PropertiesT Properties, RestT... Rest);
3369 
3370 #ifndef __SYCL_DEVICE_ONLY__
3371  friend void detail::associateWithHandler(handler &,
3372                                           detail::AccessorBaseHost *,
3373                                           access::target);
3378 #endif
3379 
3380  friend class ::MockHandler;
3381  friend class detail::queue_impl;
3382 
3383  // Make pipe class friend to be able to call ext_intel_read/write_host_pipe
3384  // method.
3385  template <class _name, class _dataT, int32_t _min_capacity,
3386  class _propertiesT, class>
3387  friend class ext::intel::experimental::pipe;
3388 
3395  void ext_intel_read_host_pipe(const std::string &Name, void *Ptr, size_t Size,
3396  bool Block = false) {
3397  ext_intel_read_host_pipe(detail::string_view(Name), Ptr, Size, Block);
3398  }
3399  void ext_intel_read_host_pipe(detail::string_view Name, void *Ptr,
3400  size_t Size, bool Block = false);
3401 
3408  void ext_intel_write_host_pipe(const std::string &Name, void *Ptr,
3409  size_t Size, bool Block = false) {
3410  ext_intel_write_host_pipe(detail::string_view(Name), Ptr, Size, Block);
3411  }
3412  void ext_intel_write_host_pipe(detail::string_view Name, void *Ptr,
3413  size_t Size, bool Block = false);
3416 
3417  bool DisableRangeRounding();
3418 
3419  bool RangeRoundingTrace();
3420 
3421  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
3422  size_t &MinRange);
3423 
3424  template <typename WrapperT, typename TransformedArgType, int Dims,
3425  typename KernelType,
3426            std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
3427                KernelType, TransformedArgType>::value> * = nullptr>
3428  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3429  range<Dims> UserRange) {
3430  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
3431  KernelType>{UserRange, KernelFunc};
3432  }
3433 
3434  template <typename WrapperT, typename TransformedArgType, int Dims,
3435  typename KernelType,
3436  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
3437  KernelType, TransformedArgType>::value> * = nullptr>
3438  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3439  range<Dims> UserRange) {
3440  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
3441  UserRange, KernelFunc};
3442  }
3443 
3444  const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;
3445 
3446  // Checks if 2D memory operations are supported by the underlying platform.
3447  bool supportsUSMMemcpy2D();
3448  bool supportsUSMFill2D();
3449  bool supportsUSMMemset2D();
3450 
3451  // Helper function for getting a loose bound on work-items.
3452  id<2> computeFallbackKernelBounds(size_t Width, size_t Height);
3453 
3454  // Common function for launching a 2D USM memcpy kernel to avoid redefinitions
3455  // of the kernel from copy and memcpy.
3456  template <typename T>
3457  void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
3458  void *Dest, size_t DestPitch, size_t Width,
3459  size_t Height) {
3460  // Otherwise the data is accessible on the device so we do the operation
3461  // there instead.
3462  // Limit number of work items to be resistant to big copies.
3463  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3464  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3465  parallel_for<__usmmemcpy2d<T>>(
3466  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3467  T *CastedDest = static_cast<T *>(Dest);
3468  const T *CastedSrc = static_cast<const T *>(Src);
3469  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3470  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3471  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3472  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3473  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3474  CastedSrc[adjustedIndex[0] * SrcPitch + adjustedIndex[1]];
3475  }
3476  }
3477  }
3478  });
3479  }
3480 
3481  // Common function for launching a 2D USM memcpy host-task to avoid
3482  // redefinitions of the kernel from copy and memcpy.
3483  template <typename T>
3484  void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
3485  void *Dest, size_t DestPitch,
3486  size_t Width, size_t Height) {
3487  // If both pointers are host USM or unknown (assumed non-USM) we use a
3488  // host-task to satisfy dependencies.
3489  host_task([=] {
3490  const T *CastedSrc = static_cast<const T *>(Src);
3491  T *CastedDest = static_cast<T *>(Dest);
3492  for (size_t I = 0; I < Height; ++I) {
3493  const T *SrcItBegin = CastedSrc + SrcPitch * I;
3494  T *DestItBegin = CastedDest + DestPitch * I;
3495  std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
3496  }
3497  });
3498  }
3499 
3500  // StageFillCG() Supporting function to fill()
3501  template <typename T, int Dims, access::mode AccessMode,
3502  access::target AccessTarget,
3503            access::placeholder IsPlaceholder = access::placeholder::false_t,
3504            typename PropertyListT = property_list>
3505  void StageFillCG(
3506  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3507  Dst,
3508  const T &Pattern) {
3509  setType(detail::CG::Fill);
3510  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
3511  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3512 
3513  MDstPtr = static_cast<void *>(AccImpl.get());
3514  CGData.MRequirements.push_back(AccImpl.get());
3515  CGData.MAccStorage.push_back(std::move(AccImpl));
3516 
3517  MPattern.resize(sizeof(T));
3518  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
3519  *PatternPtr = Pattern;
3520  }
3521 
3522  // Common function for launching a 2D USM fill kernel to avoid redefinitions
3523  // of the kernel from memset and fill.
3524  template <typename T>
3525  void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
3526  const T &Pattern, size_t Width,
3527  size_t Height) {
3528  // Otherwise the data is accessible on the device so we do the operation
3529  // there instead.
3530  // Limit number of work items to be resistant to big fill operations.
3531  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3532  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3533  parallel_for<__usmfill2d<T>>(
3534  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3535  T *CastedDest = static_cast<T *>(Dest);
3536  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3537  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3538  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3539  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3540  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3541  Pattern;
3542  }
3543  }
3544  }
3545  });
3546  }
3547 
3548  // Common function for launching a 2D USM fill kernel or host_task to avoid
3549  // redefinitions of the kernel from memset and fill.
3550  template <typename T>
3551  void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
3552  const T &Pattern, size_t Width,
3553  size_t Height) {
3554  // If the pointer is host USM or unknown (assumed non-USM) we use a
3555  // host-task to satisfy dependencies.
3556  host_task([=] {
3557  T *CastedDest = static_cast<T *>(Dest);
3558  for (size_t I = 0; I < Height; ++I) {
3559  T *ItBegin = CastedDest + DestPitch * I;
3560  std::fill(ItBegin, ItBegin + Width, Pattern);
3561  }
3562  });
3563  }
3564 
3565  // Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
3566  void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
3567  size_t SrcPitch, size_t Width, size_t Height);
3568 
3569  // Untemplated version of ext_oneapi_fill2d using command for native 2D fill.
3570  void ext_oneapi_fill2d_impl(void *Dest, size_t DestPitch, const void *Value,
3571  size_t ValueSize, size_t Width, size_t Height);
3572 
3573  // Implementation of ext_oneapi_memset2d using command for native 2D memset.
3574  void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value,
3575  size_t Width, size_t Height);
3576 
3577  // Implementation of memcpy to device_global.
3578  void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src,
3579  bool IsDeviceImageScoped, size_t NumBytes,
3580  size_t Offset);
3581 
3582  // Implementation of memcpy from device_global.
3583  void memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3584  bool IsDeviceImageScoped, size_t NumBytes,
3585  size_t Offset);
3586 
3587  // Implementation of memcpy to an unregistered device_global.
3588  void memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
3589  const void *Src, size_t DeviceGlobalTSize,
3590  bool IsDeviceImageScoped, size_t NumBytes,
3591  size_t Offset);
3592 
3593  // Implementation of memcpy from an unregistered device_global.
3594  void memcpyFromHostOnlyDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3595  bool IsDeviceImageScoped, size_t NumBytes,
3596  size_t Offset);
3597 
3598  template <typename T, int Dims, access::mode AccessMode,
3599  access::target AccessTarget,
3600            access::placeholder IsPlaceholder = access::placeholder::false_t,
3601            typename PropertyListT = property_list>
3602  void checkIfPlaceholderIsBoundToHandler(
3603  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3604  Acc) {
3605  auto *AccBase = reinterpret_cast<detail::AccessorBaseHost *>(&Acc);
3606  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3607  detail::AccessorImplHost *Req = AccImpl.get();
3608  if (std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
3609  [&](const detail::ArgDesc &AD) {
3610  return AD.MType ==
3611  detail::kernel_param_kind_t::kind_accessor &&
3612  AD.MPtr == Req &&
3613  AD.MSize == static_cast<int>(AccessTarget);
3614  }) == MAssociatedAccesors.end())
3616  "placeholder accessor must be bound by calling "
3617  "handler::require() before it can be used.");
3618  }
3619 
3620  // Set value of the gpu cache configuration for the kernel.
3621  void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig);
3622  // Set value of the kernel is cooperative flag
3623  void setKernelIsCooperative(bool);
3624 
3625  template <
3626      ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
3627  void throwIfGraphAssociated() const {
3628 
3629  if (getCommandGraph()) {
3630  std::string FeatureString =
3631          ext::oneapi::experimental::detail::UnsupportedFeatureToString(
3632              FeatureT);
3633  throw sycl::exception(sycl::make_error_code(errc::invalid),
3634                        "The " + FeatureString +
3635  " feature is not yet available "
3636  "for use with the SYCL Graph extension.");
3637  }
3638  }
3639 
3640  // Set that an ND Range was used during a call to parallel_for
3641  void setNDRangeUsed(bool Value);
3642 };
3643 } // namespace _V1
3644 } // namespace sycl
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:169
CGTYPE
Type of the command group.
Definition: cg.hpp:56
RoundedRangeIDGenerator(const id< Dims > &Id, const range< Dims > &UserRange, const range< Dims > &RoundedRange)
Definition: handler.hpp:330
void operator()(item< Dims > It, kernel_handler KH) const
Definition: handler.hpp:392
void operator()(item< Dims > It) const
Definition: handler.hpp:377
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:45
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
Implementation details of command_graph<modifiable>.
Definition: graph_impl.hpp:794
Command group handler class.
Definition: handler.hpp:458
void fill(void *Ptr, const T &Pattern, size_t Count)
Fills the specified memory with the specified pattern.
Definition: handler.hpp:2869
void parallel_for(range< 2 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2147
void parallel_for(kernel Kernel, range< Dims > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range.
Definition: handler.hpp:2237
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:2008
void copy(const std::remove_all_extents_t< T > *Src, ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a USM memory region to a device_global.
Definition: handler.hpp:3099
void parallel_for(nd_range< Dims > NDRange, kernel Kernel)
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:2185
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2014
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2571
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2492
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2594
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2542
void copy(const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, std::remove_all_extents_t< T > *Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a device_global to a USM memory region.
Definition: handler.hpp:3119
void parallel_for(range< 3 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2028
handler & operator=(handler &&)=delete
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2505
handler(handler &&)=delete
void copy(accessor< T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src, IsPlaceholder_Src > Src, accessor< T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst, IsPlaceholder_Dst > Dst)
Copies the content of memory object accessed by Src to the memory object accessed by Dst.
Definition: handler.hpp:2739
void copy(const T *Src, T *Dest, size_t Count)
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
Definition: handler.hpp:2918
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-...
Definition: handler.hpp:2116
void parallel_for(range< 3 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2151
void parallel_for(range< 1 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2143
void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function.
Definition: handler.hpp:2203
handler & operator=(const handler &)=delete
void parallel_for(kernel Kernel, nd_range< Dims > NDRange, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:2315
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2397
void require(accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder > Acc)
Requires access to the memory object associated with the placeholder accessor.
Definition: handler.hpp:1905
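A sketch of binding a placeholder accessor with require() (the function and buffer are illustrative):

#include <sycl/sycl.hpp>

void increment(sycl::queue &q, sycl::buffer<int, 1> &buf) {
  // An accessor constructed without a handler is a placeholder.
  sycl::accessor<int, 1> acc(buf);
  q.submit([&](sycl::handler &cgh) {
    cgh.require(acc); // bind the placeholder to this command group
    cgh.parallel_for(buf.get_range(), [=](sycl::id<1> i) { acc[i] += 1; });
  });
}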
void fill(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT > Dst, const T &Pattern)
Fills the memory pointed to by the accessor with the given pattern.
Definition: handler.hpp:2823
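A minimal fill() sketch (the setup is illustrative):

#include <sycl/sycl.hpp>

void zero_out(sycl::queue &q, sycl::buffer<int, 1> &buf) {
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc(buf, cgh, sycl::write_only);
    cgh.fill(acc, 0); // every element covered by the accessor gets the pattern
  });
}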
void update_host(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder > Acc)
Provides guarantees that the memory object accessed via Acc is updated on the host after command group object execution is complete.
Definition: handler.hpp:2792
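A sketch of update_host(); the wrapper function is illustrative:

#include <sycl/sycl.hpp>

void sync_to_host(sycl::queue &q, sycl::buffer<int, 1> &buf) {
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc(buf, cgh, sycl::read_only);
    cgh.update_host(acc); // the host copy is current once this command group completes
  });
}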
void set_arg(int argIndex, ext::oneapi::experimental::dynamic_parameter< T > &dynamicParam)
Definition: handler.hpp:1986
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2446
std::is_same< remove_cv_ref_t< U >, remove_cv_ref_t< T > > is_same_type
Definition: handler.hpp:1945
std::enable_if_t< ShouldEnableSetArg< T >::value, void > set_arg(int ArgIndex, T &&Arg)
Sets an argument for OpenCL interoperability kernels.
Definition: handler.hpp:1967
void parallel_for_work_group(range< Dims > NumWorkGroups, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2562
void require(ext::oneapi::experimental::dynamic_parameter< accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder >> dynamicParamAcc)
Requires access to the memory object associated with the placeholder accessor contained in a dynamic_parameter object.
Definition: handler.hpp:1921
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2427
void single_task(kernel Kernel)
Invokes a SYCL kernel.
Definition: handler.hpp:2130
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(nd_range< Dims > Range, RestT &&...Rest)
Definition: handler.hpp:2552
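A sketch of the reduction-enabled overload, assuming the input size is a multiple of the work-group size (64 here); names are illustrative. Without the initialize_to_identity property, the reduction also folds in the value already present in sumBuf, so it is zeroed first:

#include <sycl/sycl.hpp>

int sum(sycl::queue &q, sycl::buffer<int, 1> &data) {
  sycl::buffer<int, 1> sumBuf{sycl::range<1>(1)};
  {
    sycl::host_accessor init(sumBuf);
    init[0] = 0;
  }
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor in(data, cgh, sycl::read_only);
    auto red = sycl::reduction(sumBuf, cgh, sycl::plus<int>());
    cgh.parallel_for(sycl::nd_range<1>(data.get_range(), sycl::range<1>(64)),
                     red, [=](sycl::nd_item<1> it, auto &acc) {
                       acc += in[it.get_global_id(0)];
                     });
  });
  return sycl::host_accessor(sumBuf)[0];
}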
void copy(const T_Src *Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of the memory pointed to by Src into the memory object accessed by Dst.
Definition: handler.hpp:2693
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2456
void copy(std::shared_ptr< T_Src > Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of the memory pointed to by Src into the memory object accessed by Dst.
Definition: handler.hpp:2622
handler(const handler &)=delete
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, T_Dst *Dst)
Copies the content of the memory object accessed by Src into the memory pointed to by Dst.
Definition: handler.hpp:2651
typename std::remove_cv_t< std::remove_reference_t< T > > remove_cv_ref_t
Definition: handler.hpp:1942
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2466
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 1 > Range, RestT &&...Rest)
Definition: handler.hpp:2514
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 2 > Range, RestT &&...Rest)
Definition: handler.hpp:2522
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 3 > Range, RestT &&...Rest)
Definition: handler.hpp:2530
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2358
void ext_oneapi_barrier()
Prevents any commands submitted afterward to this queue from executing until all commands previously submitted to this queue have entered the complete state.
Definition: handler.hpp:2883
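A sketch of the barrier extension on an out-of-order queue (the two kernels are placeholders):

#include <sycl/sycl.hpp>

void ordered_steps(sycl::queue &q) {
  q.submit([&](sycl::handler &cgh) { cgh.single_task([] { /* step 1 */ }); });
  // Commands submitted after the barrier wait for all commands before it.
  q.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); });
  q.submit([&](sycl::handler &cgh) { cgh.single_task([] { /* step 2 */ }); });
}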
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2436
void memcpy(void *Dest, const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, size_t NumBytes=sizeof(T), size_t SrcOffset=0)
Copies data from a device_global to USM memory.
Definition: handler.hpp:3064
void parallel_for_work_group(range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch.
Definition: handler.hpp:2095
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value || detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:2040
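A minimal host_task sketch (the message is illustrative):

#include <sycl/sycl.hpp>
#include <iostream>

void log_step(sycl::queue &q) {
  q.submit([&](sycl::handler &cgh) {
    // Runs once on the host, scheduled like any other command group.
    cgh.host_task([] { std::cout << "host step reached\n"; });
  });
}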
void set_specialization_constant(typename std::remove_reference_t< decltype(SpecName)>::value_type Value)
Definition: handler.hpp:1862
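A sketch pairing set_specialization_constant with the kernel-side read (the specialization_id name and default value are illustrative):

#include <sycl/sycl.hpp>

constexpr sycl::specialization_id<int> width_id(4); // must live at namespace scope

void run(sycl::queue &q) {
  q.submit([&](sycl::handler &cgh) {
    cgh.set_specialization_constant<width_id>(8);
    cgh.single_task([=](sycl::kernel_handler kh) {
      // The value is baked in when the kernel is finalized for the device.
      int width = kh.get_specialization_constant<width_id>();
      (void)width;
    });
  });
}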
void set_args(Ts &&...Args)
Sets arguments for OpenCL interoperability kernels.
Definition: handler.hpp:1996
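A sketch of set_args with an interop kernel; obtaining krn (e.g. from an OpenCL program via backend interop) is assumed to happen elsewhere:

#include <sycl/sycl.hpp>

void launch(sycl::queue &q, sycl::kernel krn, sycl::buffer<int, 1> &buf) {
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc(buf, cgh, sycl::read_write);
    cgh.set_args(acc, static_cast<int>(buf.size())); // positional arguments
    cgh.parallel_for(sycl::range<1>(buf.size()), krn);
  });
}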
void memcpy(ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, const void *Src, size_t NumBytes=sizeof(T), size_t DestOffset=0)
Copies data from a USM memory region to a device_global.
Definition: handler.hpp:3030
void parallel_for(range< 2 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2021
void set_arg(int ArgIndex, accessor< DataT, Dims, AccessMode, AccessTarget, IsPlaceholder > Arg)
Definition: handler.hpp:1974
std::enable_if_t<(sizeof...(RestT) > 1) && detail::AreAllButLastReductions< RestT... >::value && ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > Range, PropertiesT Properties, RestT &&...Rest)
Reductions.
Definition: handler.hpp:2479
std::remove_reference_t< decltype(SpecName)>::value_type get_specialization_constant() const
Definition: handler.hpp:1877
void set_arg(int ArgIndex, local_accessor< DataT, Dims > Arg)
Definition: handler.hpp:1980
A unique identifier of an item in an index space.
Definition: id.hpp:36
Identifies an instance of the function object executing at each point in a range.
Definition: item.hpp:37
id< Dimensions > get_id() const
Definition: item.hpp:55
range< Dimensions > get_range() const
Definition: item.hpp:69
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:77
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
id< Dimensions > get_global_id() const
Definition: nd_item.hpp:52
id< Dimensions > get_offset() const
Definition: nd_item.hpp:197
range< Dimensions > get_global_range() const
Definition: nd_item.hpp:164
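A sketch tying nd_range and nd_item together (the buffer size is assumed to be a multiple of the work-group size, 128 here):

#include <sycl/sycl.hpp>

void scale(sycl::queue &q, sycl::buffer<float, 1> &buf) {
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc(buf, cgh, sycl::read_write);
    sycl::nd_range<1> ndr(buf.get_range(), sycl::range<1>(128));
    cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) {
      acc[it.get_global_id(0)] *= 2.0f; // nd_item exposes global and local indexing
    });
  });
}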
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
Objects of the property_list class are containers for the SYCL properties.
Defines the iteration domain of either a single work-group in a parallel dispatch, or the overall dimensions of the dispatch.
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
#define _KERNELFUNCPARAMTYPE
Definition: handler.hpp:77
#define __SYCL_KERNEL_ATTR__
Definition: handler.hpp:1545
#define _KERNELFUNCPARAM(a)
Definition: handler.hpp:81
__SYCL_EXTERN_STREAM_ATTRS ostream cout
Linked to standard output.
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1166
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1165
void * getValueFromDynamicParameter(ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase)
Definition: handler.cpp:74
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:16
size_t getLinearIndex(const T< Dims > &Index, const U< Dims > &Range)
Definition: common.hpp:367
decltype(member_ptr_helper(&F::operator())) argument_helper(int)
Definition: handler.hpp:195
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:313
static Arg member_ptr_helper(RetType(Func::*)(Arg) const)
bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr)
Definition: handler.cpp:39
static std::enable_if_t< std::is_unsigned_v< T >, bool > multiply_with_overflow_check(T &dst, T x, T y)
Definition: handler.hpp:409
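A standalone sketch of the same overflow-check idea; whether the internal helper's return value follows exactly this convention is an assumption:

#include <limits>
#include <type_traits>

template <typename T>
static std::enable_if_t<std::is_unsigned_v<T>, bool>
checked_multiply(T &dst, T x, T y) {
  // Returns true when x * y fits in T (illustrative convention).
  if (y != 0 && x > std::numeric_limits<T>::max() / y)
    return false;
  dst = x * y;
  return true;
}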
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::enable_if_t< std::is_same_v< T, range< Dims > > || std::is_same_v< T, id< Dims > > > checkValueRange(const T &V)
Definition: handler.hpp:273
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType, ArgType >::value > runKernelWithArg(KernelType KernelName, ArgType Arg)
Definition: cg_types.hpp:210
void markBufferAsInternal(const std::shared_ptr< buffer_impl > &BufImpl)
Definition: helpers.cpp:70
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType >::value > runKernelWithoutArg(KernelType KernelName)
Definition: cg_types.hpp:196
std::shared_ptr< LocalAccessorImplHost > LocalAccessorImplPtr
Definition: accessor.hpp:601
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
bool range_size_fits_in_size_t(const range< Dims > &r)
Definition: handler.hpp:414
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2711
decltype(argument_helper< F, SuggestedArgType >(0)) lambda_arg_type
Definition: handler.hpp:201
std::shared_ptr< AccessorImplHost > AccessorImplPtr
Definition: accessor.hpp:532
const char * UnsupportedFeatureToString(UnsupportedGraphFeatures Feature)
Definition: graph.hpp:60
typename merged_properties< LHSPropertiesT, RHSPropertiesT >::type merged_properties_t
Definition: properties.hpp:225
properties< std::tuple< PropertyValueTs... > > properties_t
Definition: properties.hpp:212
@ executable
In executable state, the graph is ready to execute.
static constexpr bool has_property()
static constexpr auto get_property()
decltype(properties{}) empty_properties_t
Definition: properties.hpp:190
image_target
Definition: access.hpp:74
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor
Definition: accessor.hpp:3233
signed char __SYCL2020_DEPRECATED
Definition: aliases.hpp:94
ext::intel::pipe< name, dataT, min_capacity > pipe
Definition: pipes.hpp:18
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:87
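A small usage sketch; the message text is illustrative:

#include <sycl/sycl.hpp>

void reject_unsupported() {
  throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
                        "unsupported configuration");
}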
const void value_type
Definition: multi_ptr.hpp:457
Definition: access.hpp:18
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Sets the first count elements of type T, starting at dev_ptr, to pattern.
Definition: memory.hpp:185
_pi_kernel_cache_config
Definition: pi.h:808
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
Definition: pi.h:814
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
Definition: pi.h:812
C++ wrapper of extern "C" PI interfaces.
Predicate returning true if all template type parameters except the last one are reductions.
std::vector< detail::AccessorImplPtr > MAccStorage
Storage for accessors.
Definition: cg.hpp:101
std::vector< AccessorImplHost * > MRequirements
List of requirements that specify which memory is needed for the command group to be executed.
Definition: cg.hpp:107
static constexpr const char * getName()
Definition: kernel_desc.hpp:84
A struct to describe the properties of an image.
is_device_copyable is a user-specializable class template to indicate that a type T is device copyable.
Definition: types.hpp:2225