DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.hpp
1 //==-------- handler.hpp --- SYCL command group handler --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <sycl/access/access.hpp>
12 #include <sycl/accessor.hpp>
13 #include <sycl/context.hpp>
14 #include <sycl/detail/cg.hpp>
15 #include <sycl/detail/cg_types.hpp>
16 #include <sycl/detail/common.hpp>
18 #include <sycl/detail/export.hpp>
21 #include <sycl/detail/pi.h>
22 #include <sycl/detail/pi.hpp>
24 #include <sycl/device.hpp>
25 #include <sycl/event.hpp>
26 #include <sycl/exception.hpp>
38 #include <sycl/group.hpp>
39 #include <sycl/id.hpp>
40 #include <sycl/interop_handle.hpp>
41 #include <sycl/item.hpp>
42 #include <sycl/kernel.hpp>
43 #include <sycl/kernel_bundle.hpp>
45 #include <sycl/kernel_handler.hpp>
46 #include <sycl/nd_item.hpp>
47 #include <sycl/nd_range.hpp>
48 #include <sycl/property_list.hpp>
49 #include <sycl/range.hpp>
50 #include <sycl/sampler.hpp>
51 #include <sycl/usm/usm_enums.hpp>
53 
54 #include <assert.h>
55 #include <functional>
56 #include <memory>
57 #include <stddef.h>
58 #include <stdint.h>
59 #include <string>
60 #include <tuple>
61 #include <type_traits>
62 #include <utility>
63 #include <vector>
64 
65 // TODO: refactor this header
66 // 41(!!!) includes of SYCL headers + 10 includes of standard headers.
67 // 3300+ lines of code
68 
69 // SYCL_LANGUAGE_VERSION is a 4-digit year followed by a 2-digit revision
70 #if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
71 #define __SYCL_NONCONST_FUNCTOR__
72 #endif
73 
74 // _KERNELFUNCPARAM(KernelFunc) expands to 'KernelType KernelFunc' or to
75 // 'const KernelType &KernelFunc', depending on __SYCL_NONCONST_FUNCTOR__.
76 #ifdef __SYCL_NONCONST_FUNCTOR__
77 #define _KERNELFUNCPARAMTYPE KernelType
78 #else
79 #define _KERNELFUNCPARAMTYPE const KernelType &
80 #endif
81 #define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a
82 
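// Illustration (added sketch, not part of the original header): a declaration
// written as
//   void single_task(_KERNELFUNCPARAM(KernelFunc));
// expands to
//   void single_task(KernelType KernelFunc);
// when __SYCL_NONCONST_FUNCTOR__ is defined, and otherwise to
//   void single_task(const KernelType &KernelFunc);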
83 #if defined(__SYCL_UNNAMED_LAMBDA__)
84 // We can't use nested types (e.g. struct S defined inside main() routine) to
85 // name kernels. At the same time, we have to provide a unique kernel name for
86 // sycl::fill and the only thing we can use to introduce that uniqueness (in
87 // general) is the template parameter T which might be exactly that nested type.
88 // That means we cannot support sycl::fill(void *, T&, size_t) for such types in
89 // general. However, we can do better than that when unnamed lambdas are
90 // enabled, so do it here! See also https://github.com/intel/llvm/issues/469.
91 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
92  sycl::access::target AccessTarget,
93  sycl::access::placeholder IsPlaceholder>
94 using __fill = sycl::detail::auto_name;
95 template <typename T> using __usmfill = sycl::detail::auto_name;
96 template <typename T> using __usmfill2d = sycl::detail::auto_name;
97 template <typename T> using __usmmemcpy2d = sycl::detail::auto_name;
98 
99 template <typename T_Src, typename T_Dst, int Dims,
100  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
101  sycl::access::placeholder IsPlaceholder>
102 using __copyAcc2Ptr = sycl::detail::auto_name;
103 
104 template <typename T_Src, typename T_Dst, int Dims,
105  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
106  sycl::access::placeholder IsPlaceholder>
107 using __copyPtr2Acc = sycl::detail::auto_name;
108 
109 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
110  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
111  sycl::access::mode AccessMode_Dst,
112  sycl::access::target AccessTarget_Dst,
113  sycl::access::placeholder IsPlaceholder_Src,
114  sycl::access::placeholder IsPlaceholder_Dst>
115 using __copyAcc2Acc = sycl::detail::auto_name;
116 #else
117 // Limited fallback path for when unnamed lambdas aren't available. Cannot
118 // handle nested types.
119 template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
120  sycl::access::target AccessTarget,
121  sycl::access::placeholder IsPlaceholder>
122 class __fill;
123 template <typename T> class __usmfill;
124 template <typename T> class __usmfill2d;
125 template <typename T> class __usmmemcpy2d;
126 
127 template <typename T_Src, typename T_Dst, int Dims,
128  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
129  sycl::access::placeholder IsPlaceholder>
130 class __copyAcc2Ptr;
131 
132 template <typename T_Src, typename T_Dst, int Dims,
133  sycl::access::mode AccessMode, sycl::access::target AccessTarget,
134  sycl::access::placeholder IsPlaceholder>
135 class __copyPtr2Acc;
136 
137 template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
138  sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
139  sycl::access::mode AccessMode_Dst,
140  sycl::access::target AccessTarget_Dst,
141  sycl::access::placeholder IsPlaceholder_Src,
142  sycl::access::placeholder IsPlaceholder_Dst>
143 class __copyAcc2Acc;
144 #endif
145 
146 // For unit testing purposes
147 class MockHandler;
148 
149 namespace sycl {
150 inline namespace _V1 {
151 
152 // Forward declarations
153 
154 class handler;
155 template <typename T, int Dimensions, typename AllocatorT, typename Enable>
156 class buffer;
157 
158 namespace ext::intel::experimental {
159 template <class _name, class _dataT, int32_t _min_capacity, class _propertiesT,
160  class>
161 class pipe;
162 }
163 
164 namespace ext::oneapi::experimental::detail {
165 class graph_impl;
166 } // namespace ext::oneapi::experimental::detail
167 namespace detail {
168 
169 class handler_impl;
170 class kernel_impl;
171 class queue_impl;
172 class stream_impl;
173 template <typename DataT, int Dimensions, access::mode AccessMode,
174  access::target AccessTarget, access::placeholder IsPlaceholder>
175 class image_accessor;
176 template <typename RetType, typename Func, typename Arg>
177 static Arg member_ptr_helper(RetType (Func::*)(Arg) const);
178 
179 // Non-const version of the above template to match functors whose 'operator()'
180 // is declared w/o the 'const' qualifier.
181 template <typename RetType, typename Func, typename Arg>
182 static Arg member_ptr_helper(RetType (Func::*)(Arg));
183 
184 // Version with two arguments to handle the case when kernel_handler is passed
185 // to a lambda
186 template <typename RetType, typename Func, typename Arg1, typename Arg2>
187 static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const);
188 
189 // Non-const version of the above template to match functors whose 'operator()'
190 // is declared w/o the 'const' qualifier.
191 template <typename RetType, typename Func, typename Arg1, typename Arg2>
192 static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2));
193 
194 template <typename F, typename SuggestedArgType>
195 decltype(member_ptr_helper(&F::operator())) argument_helper(int);
196 
197 template <typename F, typename SuggestedArgType>
198 SuggestedArgType argument_helper(...);
199 
200 template <typename F, typename SuggestedArgType>
201 using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
202 
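// Usage sketch (added illustration, not part of the original header):
// lambda_arg_type yields the declared argument type of a concrete
// 'operator()' and falls back to the suggested type for generic lambdas,
// where taking '&F::operator()' would be ill-formed.
//   auto F = [](sycl::item<1> It) { (void)It; };
//   static_assert(std::is_same_v<
//       lambda_arg_type<decltype(F), sycl::item<1>>, sycl::item<1>>);
//   auto G = [](auto It) { (void)It; }; // generic: fallback path is taken
//   static_assert(std::is_same_v<
//       lambda_arg_type<decltype(G), sycl::item<1>>, sycl::item<1>>);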
203 // Used when parallel_for range is rounded-up.
204 template <typename Name> class __pf_kernel_wrapper;
205 
206 template <typename Type> struct get_kernel_wrapper_name_t {
207  using name = __pf_kernel_wrapper<Type>;
208 };
209 
210 __SYCL_EXPORT device getDeviceFromHandler(handler &);
211 
212 // Checks if a device_global has any registered kernel usage.
213 __SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr);
214 
215 #if __SYCL_ID_QUERIES_FIT_IN_INT__
216 template <typename T> struct NotIntMsg;
217 
218 template <int Dims> struct NotIntMsg<range<Dims>> {
219  constexpr static const char *Msg =
220  "Provided range is out of integer limits. Pass "
221  "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
222 };
223 
224 template <int Dims> struct NotIntMsg<id<Dims>> {
225  constexpr static const char *Msg =
226  "Provided offset is out of integer limits. Pass "
227  "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
228 };
229 #endif
230 
231 // Helper for merging properties with ones defined in an optional kernel functor
232 // getter.
233 template <typename KernelType, typename PropertiesT, typename Cond = void>
234 struct GetMergedKernelProperties {
235  using type = PropertiesT;
236 };
237 template <typename KernelType, typename PropertiesT>
238 struct GetMergedKernelProperties<
239  KernelType, PropertiesT,
240  std::enable_if_t<ext::oneapi::experimental::detail::
241  HasKernelPropertiesGetMethod<KernelType>::value>> {
242  using get_method_properties =
243  typename ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
244  KernelType>::properties_t;
245  static_assert(
246  ext::oneapi::experimental::is_property_list<get_method_properties>::value,
247  "get(sycl::ext::oneapi::experimental::properties_tag) member in kernel "
248  "functor class must return a valid property list.");
249  using type = ext::oneapi::experimental::detail::merged_properties_t<
250  PropertiesT, get_method_properties>;
251 };
252 
253 #if __SYCL_ID_QUERIES_FIT_IN_INT__
254 template <typename T, typename ValT>
255 typename std::enable_if_t<std::is_same<ValT, size_t>::value ||
256  std::is_same<ValT, unsigned long long>::value>
257 checkValueRangeImpl(ValT V) {
258  static constexpr size_t Limit =
259  static_cast<size_t>((std::numeric_limits<int>::max)());
260  if (V > Limit)
261  throw sycl::exception(make_error_code(errc::nd_range), NotIntMsg<T>::Msg);
262 }
263 #endif
264 
265 template <int Dims, typename T>
266 typename std::enable_if_t<std::is_same_v<T, range<Dims>> ||
267  std::is_same_v<T, id<Dims>>>
268 checkValueRange(const T &V) {
269 #if __SYCL_ID_QUERIES_FIT_IN_INT__
270  for (size_t Dim = 0; Dim < Dims; ++Dim)
271  checkValueRangeImpl<T>(V[Dim]);
272 
273  {
274  unsigned long long Product = 1;
275  for (size_t Dim = 0; Dim < Dims; ++Dim) {
276  Product *= V[Dim];
277  // check value now to prevent product overflow in the end
278  checkValueRangeImpl<T>(Product);
279  }
280  }
281 #else
282  (void)V;
283 #endif
284 }
285 
286 template <int Dims>
287 void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
288 #if __SYCL_ID_QUERIES_FIT_IN_INT__
289  checkValueRange<Dims>(R);
290  checkValueRange<Dims>(O);
291 
292  for (size_t Dim = 0; Dim < Dims; ++Dim) {
293  unsigned long long Sum = R[Dim] + O[Dim];
294 
295  checkValueRangeImpl<range<Dims>>(Sum);
296  }
297 #else
298  (void)R;
299  (void)O;
300 #endif
301 }
302 
303 template <int Dims, typename T>
304 typename std::enable_if_t<std::is_same_v<T, nd_range<Dims>>>
305 checkValueRange(const T &V) {
306 #if __SYCL_ID_QUERIES_FIT_IN_INT__
307  checkValueRange<Dims>(V.get_global_range());
308  checkValueRange<Dims>(V.get_local_range());
309  checkValueRange<Dims>(V.get_offset());
310 
311  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
312 #else
313  (void)V;
314 #endif
315 }
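// Worked example (added illustration): with __SYCL_ID_QUERIES_FIT_IN_INT__
// enabled, range<2>{65536, 65536} passes the per-dimension checks, but the
// running product reaches 2^32 > INT_MAX on the second iteration, so
// checkValueRange throws sycl::exception(errc::nd_range) while the command
// group is being set up, before anything is enqueued.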
316 
317 template <int Dims> class RoundedRangeIDGenerator {
318  id<Dims> Id;
319  id<Dims> InitId;
320  range<Dims> UserRange;
321  range<Dims> RoundedRange;
322  bool Done = false;
323 
324 public:
325  RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
326  const range<Dims> &RoundedRange)
327  : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
328  for (int i = 0; i < Dims; ++i)
329  if (Id[i] >= UserRange[i])
330  Done = true;
331  }
332 
333  explicit operator bool() { return !Done; }
334 
335  void updateId() {
336  for (int i = 0; i < Dims; ++i) {
337  Id[i] += RoundedRange[i];
338  if (Id[i] < UserRange[i])
339  return;
340  Id[i] = InitId[i];
341  }
342  Done = true;
343  }
344 
345  id<Dims> getId() { return Id; }
346 
347  template <typename KernelType> auto getItem() {
348  if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
349  std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
350  return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
351  else {
352  static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
353  std::is_invocable_v<KernelType, item<Dims, false> &,
354  kernel_handler>,
355  "Kernel must be invocable with an item!");
356  return detail::Builder::createItem<Dims, false>(UserRange, getId());
357  }
358  }
359 };
360 
361 // TODO: The wrappers can be optimized further so that the body
362 // essentially looks like this:
363 // for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
364 // for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
365 // for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
366 // KernelFunc({x,y,z});
367 template <typename TransformedArgType, int Dims, typename KernelType>
368 class RoundedRangeKernel {
369 public:
370  range<Dims> UserRange;
371  KernelType KernelFunc;
372  void operator()(item<Dims> It) const {
373  auto RoundedRange = It.get_range();
374  for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
375  Gen.updateId()) {
376  auto item = Gen.template getItem<KernelType>();
377  KernelFunc(item);
378  }
379  }
380 };
381 
382 template <typename TransformedArgType, int Dims, typename KernelType>
383 class RoundedRangeKernelWithKH {
384 public:
385  range<Dims> UserRange;
386  KernelType KernelFunc;
387  void operator()(item<Dims> It, kernel_handler KH) const {
388  auto RoundedRange = It.get_range();
389  for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
390  Gen.updateId()) {
391  auto item = Gen.template getItem<KernelType>();
392  KernelFunc(item, KH);
393  }
394  }
395 };
396 
397 using std::enable_if_t;
398 using sycl::detail::queue_impl;
399 
400 // Computes dst = x * y. Returns true if the multiplication overflowed in T;
401 // otherwise returns false, and dst holds the exact product.
402 template <typename T>
403 static std::enable_if_t<std::is_unsigned_v<T>, bool>
404 multiply_with_overflow_check(T &dst, T x, T y) {
405  dst = x * y;
406  return (y != 0) && (x > (std::numeric_limits<T>::max)() / y);
407 }
408 
409 template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
410  size_t acc = 1;
411  for (int i = 0; i < Dims; ++i) {
412  bool did_overflow = multiply_with_overflow_check(acc, acc, r[i]);
413  if (did_overflow)
414  return false;
415  }
416  return true;
417 }
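// Usage sketch (added illustration, not part of the original header):
//   size_t Acc = 0;
//   bool Overflow = multiply_with_overflow_check<size_t>(
//       Acc, (std::numeric_limits<size_t>::max)() / 2, size_t{3});
//   // Overflow == true; Acc holds the wrapped product.
//   // A 2D range of {2^33, 2^33} has 2^66 work-items, which cannot be
//   // linearized in a 64-bit size_t:
//   assert(!range_size_fits_in_size_t(range<2>{1ULL << 33, 1ULL << 33}));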
418 
419 } // namespace detail
420 
454 class __SYCL_EXPORT handler {
455 private:
460  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);
461 
471  handler(std::shared_ptr<detail::queue_impl> Queue,
472  std::shared_ptr<detail::queue_impl> PrimaryQueue,
473  std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);
474 
481  handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
482 
484  template <typename T, typename F = typename std::remove_const_t<
485  typename std::remove_reference_t<T>>>
486  F *storePlainArg(T &&Arg) {
487  CGData.MArgsStorage.emplace_back(sizeof(T));
488  auto Storage = reinterpret_cast<F *>(CGData.MArgsStorage.back().data());
489  *Storage = Arg;
490  return Storage;
491  }
492 
493  void setType(detail::CG::CGTYPE Type) { MCGType = Type; }
494 
495  detail::CG::CGTYPE getType() { return MCGType; }
496 
497  void throwIfActionIsCreated() {
498  if (detail::CG::None != getType())
499  throw sycl::exception(make_error_code(errc::runtime),
500  "Attempt to set multiple actions for the "
501  "command group. Command group must consist of "
502  "a single kernel or explicit memory operation.");
503  }
504 
505  constexpr static int AccessTargetMask = 0x7ff;
509  template <typename KernelName, typename KernelType>
510  void throwOnLocalAccessorMisuse() const {
511  using NameT =
512  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
513  using KI = sycl::detail::KernelInfo<NameT>;
514 
515  auto *KernelArgs = &KI::getParamDesc(0);
516 
517  for (unsigned I = 0; I < KI::getNumParams(); ++I) {
518  const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
519  const access::target AccTarget =
520  static_cast<access::target>(KernelArgs[I].info & AccessTargetMask);
521  if ((Kind == detail::kernel_param_kind_t::kind_accessor) &&
522  (AccTarget == target::local))
523  throw sycl::exception(
524  make_error_code(errc::kernel_argument),
525  "A local accessor must not be used in a SYCL kernel function "
526  "that is invoked via single_task or via the simple form of "
527  "parallel_for that takes a range parameter.");
528  }
529  }
530 
533  void
534  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
535  const detail::kernel_param_desc_t *KernelArgs,
536  bool IsESIMD);
537 
539  void extractArgsAndReqs();
540 
541  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
542  const int Size, const size_t Index, size_t &IndexShift,
543  bool IsKernelCreatedFromSource, bool IsESIMD);
544 
546  std::string getKernelName();
547 
548  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
549  // TODO It is unclear whether the kernel name and the lambda/functor name
550  // must be equal for parallel_for called with both a sycl::kernel and a
551  // lambda/functor. For now, if they are equal we extract the arguments from
552  // the lambda/functor for the kernel; otherwise set_arg(s) must be used to
553  // resolve the order and values of the kernel arguments.
554  assert(MKernel && "MKernel is not initialized");
555  const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
556  const std::string KernelName = getKernelName();
557  return LambdaName == KernelName;
558  }
559 
562  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
563 
570  event finalize();
571 
577  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
578  MStreamStorage.push_back(Stream);
579  }
580 
586  void addReduction(const std::shared_ptr<const void> &ReduObj);
587 
593  template <typename T, int Dimensions, typename AllocatorT, typename Enable>
594  void
595  addReduction(const std::shared_ptr<buffer<T, Dimensions, AllocatorT, Enable>>
596  &ReduBuf) {
598  addReduction(std::shared_ptr<const void>(ReduBuf));
599  }
600 
601  ~handler() = default;
602 
603  // TODO: Private and unused. Remove when ABI break is allowed.
604  bool is_host() { return MIsHost; }
605 
606 #ifdef __SYCL_DEVICE_ONLY__
607  // In device compilation the accessor isn't derived from the host base
608  // classes, so we can't dispatch on them. Since we don't expect this to ever
609  // be called in device execution, just use an opaque void *.
610  void associateWithHandler(void *AccBase, access::target AccTarget);
611  void associateWithHandler(void *AccBase, image_target AccTarget);
612 #else
613  void associateWithHandlerCommon(detail::AccessorImplPtr AccImpl,
614  int AccTarget);
615  void associateWithHandler(detail::AccessorBaseHost *AccBase,
616  access::target AccTarget);
617  void associateWithHandler(detail::UnsampledImageAccessorBaseHost *AccBase,
618  image_target AccTarget);
619  void associateWithHandler(detail::SampledImageAccessorBaseHost *AccBase,
620  image_target AccTarget);
621 #endif
622 
623  // Recursively calls itself until the argument pack is fully processed.
624  // The version for regular (standard-layout) arguments.
625  template <typename T, typename... Ts>
626  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
627  set_arg(ArgIndex, std::move(Arg));
628  setArgsHelper(++ArgIndex, std::move(Args)...);
629  }
630 
631  void setArgsHelper(int) {}
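  // Usage sketch (added illustration): a call such as
  //   CGH.set_args(AccA, AccB, N);
  // unrolls through setArgsHelper into set_arg(0, AccA), set_arg(1, AccB) and
  // set_arg(2, N), so indices follow the parameter order of the prebuilt
  // kernel.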
632 
633  void setLocalAccessorArgHelper(int ArgIndex,
634  detail::LocalAccessorBaseHost &LocalAccBase) {
635  detail::LocalAccessorImplPtr LocalAccImpl =
636  detail::getSyclObjImpl(LocalAccBase);
637  detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
638  MLocalAccStorage.push_back(std::move(LocalAccImpl));
639  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
640  static_cast<int>(access::target::local), ArgIndex);
641  }
642 
643  // setArgHelper for local accessor argument (legacy accessor interface)
644  template <typename DataT, int Dims, access::mode AccessMode,
645  access::placeholder IsPlaceholder>
646  void setArgHelper(int ArgIndex,
647  accessor<DataT, Dims, AccessMode, access::target::local,
648  IsPlaceholder> &&Arg) {
649  (void)ArgIndex;
650  (void)Arg;
651 #ifndef __SYCL_DEVICE_ONLY__
652  setLocalAccessorArgHelper(ArgIndex, Arg);
653 #endif
654  }
655 
656  // setArgHelper for local accessor argument (up to date accessor interface)
657  template <typename DataT, int Dims>
658  void setArgHelper(int ArgIndex, local_accessor<DataT, Dims> &&Arg) {
659  (void)ArgIndex;
660  (void)Arg;
661 #ifndef __SYCL_DEVICE_ONLY__
662  setLocalAccessorArgHelper(ArgIndex, Arg);
663 #endif
664  }
665 
666  // setArgHelper for non-local accessor arguments.
667  template <typename DataT, int Dims, access::mode AccessMode,
668  access::target AccessTarget, access::placeholder IsPlaceholder>
669  typename std::enable_if_t<AccessTarget != access::target::local, void>
670  setArgHelper(
671  int ArgIndex,
672  accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
673  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
674  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
675  detail::AccessorImplHost *Req = AccImpl.get();
676  // Add accessor to the list of requirements.
677  CGData.MRequirements.push_back(Req);
678  // Store copy of the accessor.
679  CGData.MAccStorage.push_back(std::move(AccImpl));
680  // Add accessor to the list of arguments.
681  MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
682  static_cast<int>(AccessTarget), ArgIndex);
683  }
684 
685  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
686  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
687 
688  if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
689  MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
690  sizeof(T), ArgIndex);
691  } else {
692  MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
693  StoredArg, sizeof(T), ArgIndex);
694  }
695  }
696 
697  void setArgHelper(int ArgIndex, sampler &&Arg) {
698  auto StoredArg = static_cast<void *>(storePlainArg(Arg));
699  MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
700  sizeof(sampler), ArgIndex);
701  }
702 
703  // TODO: Unused. Remove when ABI break is allowed.
704  void verifyKernelInvoc(const kernel &Kernel) {
705  std::ignore = Kernel;
706  return;
707  }
708 
709  /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
710  * its argument. Since esimd plugin directly invokes the kernel (doesn’t use
711  * piKernelSetArg), the kernel argument type must be known to the plugin.
712  * However, passing kernel argument type to the plugin requires changing ABI
713  * in HostKernel class. To overcome this problem, helpers below wrap the
714  * “original” kernel with a functor that always takes an nd_item as argument.
715  * A functor is used instead of a lambda because extractArgsAndReqsFromLambda
716  * needs access to the “original” kernel and keeps references to its internal
717  * data, i.e. the kernel passed as argument cannot be local in scope. The
718  * functor itself is again encapsulated in a std::function since functor’s
719  * type is unknown to the plugin.
720  */
721 
722  // For 'id, item w/wo offset, nd_item' kernel arguments
723  template <class KernelType, class NormalizedKernelType, int Dims>
724  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
725  NormalizedKernelType NormalizedKernel(KernelFunc);
726  auto NormalizedKernelFunc =
727  std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
728  auto HostKernelPtr = new detail::HostKernel<decltype(NormalizedKernelFunc),
729  sycl::nd_item<Dims>, Dims>(
730  std::move(NormalizedKernelFunc));
731  MHostKernel.reset(HostKernelPtr);
732  return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
733  ->MKernelFunc;
734  }
735 
736  // For 'sycl::id<Dims>' kernel argument
737  template <class KernelType, typename ArgT, int Dims>
738  std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>, KernelType *>
739  ResetHostKernel(const KernelType &KernelFunc) {
740  struct NormalizedKernelType {
741  KernelType MKernelFunc;
742  NormalizedKernelType(const KernelType &KernelFunc)
743  : MKernelFunc(KernelFunc) {}
744  void operator()(const nd_item<Dims> &Arg) {
745  detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
746  }
747  };
748  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
749  KernelFunc);
750  }
751 
752  // For 'sycl::nd_item<Dims>' kernel argument
753  template <class KernelType, typename ArgT, int Dims>
754  std::enable_if_t<std::is_same_v<ArgT, sycl::nd_item<Dims>>, KernelType *>
755  ResetHostKernel(const KernelType &KernelFunc) {
756  struct NormalizedKernelType {
757  KernelType MKernelFunc;
758  NormalizedKernelType(const KernelType &KernelFunc)
759  : MKernelFunc(KernelFunc) {}
760  void operator()(const nd_item<Dims> &Arg) {
761  detail::runKernelWithArg(MKernelFunc, Arg);
762  }
763  };
764  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
765  KernelFunc);
766  }
767 
768  // For 'sycl::item<Dims, without_offset>' kernel argument
769  template <class KernelType, typename ArgT, int Dims>
770  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, false>>, KernelType *>
771  ResetHostKernel(const KernelType &KernelFunc) {
772  struct NormalizedKernelType {
773  KernelType MKernelFunc;
774  NormalizedKernelType(const KernelType &KernelFunc)
775  : MKernelFunc(KernelFunc) {}
776  void operator()(const nd_item<Dims> &Arg) {
777  sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
778  Arg.get_global_range(), Arg.get_global_id());
779  detail::runKernelWithArg(MKernelFunc, Item);
780  }
781  };
782  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
783  KernelFunc);
784  }
785 
786  // For 'sycl::item<Dims, with_offset>' kernel argument
787  template <class KernelType, typename ArgT, int Dims>
788  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, true>>, KernelType *>
789  ResetHostKernel(const KernelType &KernelFunc) {
790  struct NormalizedKernelType {
791  KernelType MKernelFunc;
792  NormalizedKernelType(const KernelType &KernelFunc)
793  : MKernelFunc(KernelFunc) {}
794  void operator()(const nd_item<Dims> &Arg) {
795  sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
796  Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
797  detail::runKernelWithArg(MKernelFunc, Item);
798  }
799  };
800  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
801  KernelFunc);
802  }
803 
804  // For 'void' kernel argument (single_task)
805  template <class KernelType, typename ArgT, int Dims>
806  typename std::enable_if_t<std::is_same_v<ArgT, void>, KernelType *>
807  ResetHostKernel(const KernelType &KernelFunc) {
808  struct NormalizedKernelType {
809  KernelType MKernelFunc;
810  NormalizedKernelType(const KernelType &KernelFunc)
811  : MKernelFunc(KernelFunc) {}
812  void operator()(const nd_item<Dims> &Arg) {
813  (void)Arg;
814  detail::runKernelWithoutArg(MKernelFunc);
815  }
816  };
817  return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
818  KernelFunc);
819  }
820 
821  // For 'sycl::group<Dims>' kernel argument
822  // 'wrapper'-based approach using 'NormalizedKernelType' struct is not used
823  // for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is not
824  // supported in ESIMD.
825  template <class KernelType, typename ArgT, int Dims>
826  std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>, KernelType *>
827  ResetHostKernel(const KernelType &KernelFunc) {
828  MHostKernel.reset(
829  new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
830  return (KernelType *)(MHostKernel->getPtr());
831  }
832 
840  void verifyUsedKernelBundle(const std::string &KernelName);
841 
848  template <typename KernelName, typename KernelType, int Dims,
849  typename LambdaArgType>
850  void StoreLambda(KernelType KernelFunc) {
851  using KI = detail::KernelInfo<KernelName>;
852  constexpr bool IsCallableWithKernelHandler =
853  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
854  LambdaArgType>::value;
855 
856  if (IsCallableWithKernelHandler && MIsHost) {
857  throw sycl::feature_not_supported(
858  "kernel_handler is not yet supported by host device.",
859  PI_ERROR_INVALID_OPERATION);
860  }
861 
862  KernelType *KernelPtr =
863  ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);
864 
865  constexpr bool KernelHasName =
866  KI::getName() != nullptr && KI::getName()[0] != '\0';
867 
868  // Some host compilers may have different captures from Clang. Currently
869  // there is no stable way of handling this when extracting the captures, so
870  // a static assert is made to fail for incompatible kernel lambdas.
871  static_assert(
872  !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
873  "Unexpected kernel lambda size. This can be caused by an "
874  "external host compiler producing a lambda with an "
875  "unexpected layout. This is a limitation of the compiler. "
876  "In many cases the difference is related to capturing constexpr "
877  "variables. In such cases removing the constexpr specifier aligns the "
878  "captures between the host compiler and the device compiler."
879  "\n"
880  "In case of MSVC, passing "
881  "-fsycl-host-compiler-options='/std:c++latest' "
882  "might also help.");
883 
884  // Empty name indicates that the compilation happens without integration
885  // header, so don't perform things that require it.
886  if (KernelHasName) {
887  // TODO support ESIMD in no-integration-header case too.
888  MArgs.clear();
889  extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
890  KI::getNumParams(), &KI::getParamDesc(0),
891  KI::isESIMD());
892  MKernelName = KI::getName();
893  } else {
894  // Without the integration header it is necessary to process the
895  // accessors from the list (which are associated with this handler) as
896  // arguments.
897  MArgs = std::move(MAssociatedAccesors);
898  }
899 
900  // If the kernel lambda is callable with a kernel_handler argument, manifest
901  // the associated kernel handler.
902  if (IsCallableWithKernelHandler) {
903  getOrInsertHandlerKernelBundle(/*Insert=*/true);
904  }
905  }
906 
910  template <
911  typename KernelName,
912  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
913  void processProperties(PropertiesT Props) {
914  using KI = detail::KernelInfo<KernelName>;
915  static_assert(
916  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
917  "Template type is not a property list.");
918  static_assert(
919  !PropertiesT::template has_property<
920  sycl::ext::intel::experimental::fp_control_key>() ||
921  (PropertiesT::template has_property<
922  sycl::ext::intel::experimental::fp_control_key>() &&
923  KI::isESIMD()),
924  "Floating point control property is supported for ESIMD kernels only.");
925  if constexpr (PropertiesT::template has_property<
926  sycl::ext::intel::experimental::cache_config_key>()) {
927  auto Config = Props.template get_property<
928  sycl::ext::intel::experimental::cache_config_key>();
929  if (Config == sycl::ext::intel::experimental::large_slm) {
930  setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM);
931  } else if (Config == sycl::ext::intel::experimental::large_data) {
932  setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA);
933  }
934  } else {
935  std::ignore = Props;
936  }
937 
938  constexpr bool UsesRootSync = PropertiesT::template has_property<
939  sycl::ext::oneapi::experimental::use_root_sync_key>();
940  setKernelIsCooperative(UsesRootSync);
941  }
942 
947  template <int Dims_Src, int Dims_Dst>
948  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
949  const range<Dims_Dst> Dst) {
950  if (Dims_Src > Dims_Dst)
951  return false;
952  for (size_t I = 0; I < Dims_Src; ++I)
953  if (Src[I] > Dst[I])
954  return false;
955  return true;
956  }
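  // Worked example (added illustration): copying a {2, 3} accessor into a
  // {4, 8} accessor is expressible as a rectangular region copy (2 <= 4,
  // 3 <= 8), whereas copying a 1D accessor of 16 elements into a {4, 4}
  // accessor is not (16 > 4 in dimension 0), so the linearized copy kernel
  // below is used instead.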
957 
963  template <typename TSrc, int DimSrc, access::mode ModeSrc,
964  access::target TargetSrc, typename TDst, int DimDst,
965  access::mode ModeDst, access::target TargetDst,
966  access::placeholder IsPHSrc, access::placeholder IsPHDst>
967  std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
968  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
969  accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
970  if (!MIsHost &&
971  IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
972  return false;
973 
974  range<1> LinearizedRange(Src.size());
975  parallel_for<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
976  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
977  LinearizedRange, [=](id<1> Id) {
978  size_t Index = Id[0];
979  id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
980  id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
981  Dst[DstId] = Src[SrcId];
982  });
983  return true;
984  }
985 
993  template <typename TSrc, int DimSrc, access::mode ModeSrc,
994  access::target TargetSrc, typename TDst, int DimDst,
995  access::mode ModeDst, access::target TargetDst,
996  access::placeholder IsPHSrc, access::placeholder IsPHDst>
997  std::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
998  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
999  accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
1000  if (!MIsHost)
1001  return false;
1002 
1003  single_task<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
1004  ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
1005  [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
1006  return true;
1007  }
1008 
1009 #ifndef __SYCL_DEVICE_ONLY__
1015  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1016  access::target AccTarget, access::placeholder IsPH>
1017  std::enable_if_t<(Dim > 0)>
1018  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
1019  TDst *Dst) {
1020  range<Dim> Range = Src.get_range();
1021  parallel_for<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1022  Range, [=](id<Dim> Index) {
1023  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
1024  using TSrcNonConst = typename std::remove_const_t<TSrc>;
1025  (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
1026  });
1027  }
1028 
1034  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1035  access::target AccTarget, access::placeholder IsPH>
1036  std::enable_if_t<Dim == 0>
1037  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
1038  TDst *Dst) {
1039  single_task<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1040  [=]() {
1041  using TSrcNonConst = typename std::remove_const_t<TSrc>;
1042  *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
1043  });
1044  }
1045 
1050  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1051  access::target AccTarget, access::placeholder IsPH>
1052  std::enable_if_t<(Dim > 0)>
1053  copyPtrToAccHost(TSrc *Src,
1054  accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
1055  range<Dim> Range = Dst.get_range();
1056  parallel_for<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1057  Range, [=](id<Dim> Index) {
1058  const size_t LinearIndex = detail::getLinearIndex(Index, Range);
1059  Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
1060  });
1061  }
1062 
1068  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
1069  access::target AccTarget, access::placeholder IsPH>
1070  std::enable_if_t<Dim == 0>
1071  copyPtrToAccHost(TSrc *Src,
1072  accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
1073  single_task<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
1074  [=]() {
1075  *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
1076  });
1077  }
1078 #endif // __SYCL_DEVICE_ONLY__
1079 
1080  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
1081  return AccessTarget == access::target::device ||
1082  AccessTarget == access::target::constant_buffer;
1083  }
1084 
1085  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
1086  return AccessTarget == access::target::image ||
1087  AccessTarget == access::target::image_array;
1088  }
1089 
1090  constexpr static bool
1091  isValidTargetForExplicitOp(access::target AccessTarget) {
1092  return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
1093  }
1094 
1095  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
1096  return AccessMode == access::mode::read ||
1097  AccessMode == access::mode::read_write;
1098  }
1099 
1100  constexpr static bool
1101  isValidModeForDestinationAccessor(access::mode AccessMode) {
1102  return AccessMode == access::mode::write ||
1103  AccessMode == access::mode::read_write ||
1104  AccessMode == access::mode::discard_write ||
1105  AccessMode == access::mode::discard_read_write;
1106  }
1107 
1108  // PI APIs only support select fill sizes: 1, 2, 4, 8, 16, 32, 64, 128
1109  constexpr static bool isBackendSupportedFillSize(size_t Size) {
1110  return Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16 ||
1111  Size == 32 || Size == 64 || Size == 128;
1112  }
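  // Illustration (added note): a fill with a 4-byte element can be handed
  // directly to the backend fill entry point, while a 12-byte struct is not
  // in the supported set {1, 2, 4, 8, 16, 32, 64, 128}, so the runtime falls
  // back to a kernel that writes the pattern element-wise.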
1113 
1114  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
1115  using type = std::conditional_t<
1116  std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
1117  std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
1118  item<Dims>, LambdaArgType>>;
1119  };
1120 
1121  std::optional<std::array<size_t, 3>> getMaxWorkGroups();
1122  // We need to use this version to support gcc 7.5.0. Remove when minimal
1123  // supported gcc version is bumped.
1124  std::tuple<std::array<size_t, 3>, bool> getMaxWorkGroups_v2();
1125 
1126  template <int Dims>
1127  std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange) {
1128  range<Dims> RoundedRange = UserRange;
1129  // Disable the rounding-up optimizations under these conditions:
1130  // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
1131  // 2. The kernel is provided via an interoperability method (this uses a
1132  // different code path).
1133  // 3. The range is already a multiple of the rounding factor.
1134  //
1135  // Cases 2 and 3 could be supported with extra effort.
1136  // As an optimization for the common case it is an
1137  // implementation choice to not support those scenarios.
1138  // Note that "this_item" is a free function, i.e. not tied to any
1139  // specific id or item. When concurrent parallel_fors are executing
1140  // on a device it is difficult to tell which parallel_for the call is
1141  // being made from. One could replicate portions of the
1142  // call-graph to make this_item calls kernel-specific but this is
1143  // not considered worthwhile.
1144 
1145  // Perform range rounding if rounding-up is enabled.
1146  if (this->DisableRangeRounding())
1147  return {range<Dims>{}, false};
1148 
1149  // Range should be a multiple of this for reasonable performance.
1150  size_t MinFactorX = 16;
1151  // Range should be a multiple of this for improved performance.
1152  size_t GoodFactor = 32;
1153  // Range should be at least this to make rounding worthwhile.
1154  size_t MinRangeX = 1024;
1155 
1156  // Check if rounding parameters have been set through environment:
1157  // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
1158  this->GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);
1159 
1160  // In SYCL, each dimension of a global range size is specified by
1161  // a size_t, which can be up to 64 bits. All backends should be
1162  // able to accept a kernel launch with a 32-bit global range size
1163  // (i.e. do not throw an error). The OpenCL CPU backend will
1164  // accept every 64-bit global range, but the GPU backends will not
1165  // generally accept every 64-bit global range. So, when we get a
1166  // non-32-bit global range, we wrap the old kernel in a new kernel
1167  // that has each work item perform multiple invocations of the old
1168  // kernel in a 32-bit global range.
1170  id<Dims> MaxNWGs = [&] {
1171  auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2();
1172  if (!HasMaxWGs) {
1173  id<Dims> Default;
1174  for (int i = 0; i < Dims; ++i)
1175  Default[i] = (std::numeric_limits<int32_t>::max)();
1176  return Default;
1177  }
1178 
1179  id<Dims> IdResult;
1180  size_t Limit = (std::numeric_limits<int>::max)();
1181  for (int i = 0; i < Dims; ++i)
1182  IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
1183  return IdResult;
1184  }();
1185  auto M = (std::numeric_limits<uint32_t>::max)();
1186  range<Dims> MaxRange;
1187  for (int i = 0; i < Dims; ++i) {
1188  auto DesiredSize = MaxNWGs[i] * GoodFactor;
1189  MaxRange[i] =
1190  DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
1191  }
1192 
1193  bool DidAdjust = false;
1194  auto Adjust = [&](int Dim, size_t Value) {
1195  if (this->RangeRoundingTrace())
1196  std::cout << "parallel_for range adjusted at dim " << Dim << " from "
1197  << RoundedRange[Dim] << " to " << Value << std::endl;
1198  RoundedRange[Dim] = Value;
1199  DidAdjust = true;
1200  };
1201 
1202  // Perform range rounding if there are sufficient work-items to
1203  // need rounding and the user-specified range is not a multiple of
1204  // a "good" value.
1205  if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
1206  // It is sufficient to round up just the first dimension.
1207  // Multiplying the rounded-up value of the first dimension
1208  // by the values of the remaining dimensions (if any)
1209  // will yield a rounded-up value for the total range.
1210  Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
1211  }
1212 
1213  for (int i = 0; i < Dims; ++i)
1214  if (RoundedRange[i] > MaxRange[i])
1215  Adjust(i, MaxRange[i]);
1216 
1217  if (!DidAdjust)
1218  return {range<Dims>{}, false};
1219  return {RoundedRange, true};
1220  }
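  // Worked example (added illustration, default MinFactorX = 16,
  // GoodFactor = 32, MinRangeX = 1024): range<1>{1000} is left unchanged
  // (1000 < MinRangeX), while range<1>{1030} is rounded up to {1056}, the
  // next multiple of 32; the RoundedRangeKernel wrapper then skips the 26
  // extra items whose ids lie outside the user range.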
1221 
1233  template <
1234  typename KernelName, typename KernelType, int Dims,
1235  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1236  void parallel_for_lambda_impl(range<Dims> UserRange, PropertiesT Props,
1237  KernelType KernelFunc) {
1238  throwIfActionIsCreated();
1239  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1240  if (!range_size_fits_in_size_t(UserRange))
1241  throw sycl::exception(make_error_code(errc::runtime),
1242  "The total number of work-items in "
1243  "a range must fit within size_t");
1244 
1245  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1246 
1247  // If 1D kernel argument is an integral type, convert it to sycl::item<1>
1248  // If user type is convertible from sycl::item/sycl::nd_item, use
1249  // sycl::item/sycl::nd_item to transport item information
1250  using TransformedArgType = std::conditional_t<
1251  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
1252  typename TransformUserItemType<Dims, LambdaArgType>::type>;
1253 
1254  static_assert(!std::is_same_v<TransformedArgType, sycl::nd_item<Dims>>,
1255  "Kernel argument cannot have a sycl::nd_item type in "
1256  "sycl::parallel_for with sycl::range");
1257 
1258 #if defined(SYCL2020_CONFORMANT_APIS) || \
1259  defined(__INTEL_PREVIEW_BREAKING_CHANGES)
1260  static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
1261  std::is_convertible_v<item<Dims, false>, LambdaArgType>,
1262  "sycl::parallel_for(sycl::range) kernel must have the "
1263  "first argument of sycl::item type, or of a type which is "
1264  "implicitly convertible from sycl::item");
1265 
1266  using RefLambdaArgType = std::add_lvalue_reference_t<LambdaArgType>;
1267  static_assert(
1268  (std::is_invocable_v<KernelType, RefLambdaArgType> ||
1269  std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
1270  "SYCL kernel lambda/functor has an unexpected signature, it should be "
1271  "invocable with sycl::item and optionally sycl::kernel_handler");
1272 #endif
1273 
1274  // TODO: Properties may change the kernel function, so in order to avoid
1275  // conflicts they should be included in the name.
1276  using NameT =
1277  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1278 
1279  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1280 
1281  // Range rounding can be disabled by the user.
1282  // Range rounding is not done on the host device.
1283  // Range rounding is supported only for newer SYCL standards.
1284 #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \
1285  !defined(DPCPP_HOST_DEVICE_OPENMP) && \
1286  !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
1287  auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange);
1288  if (HasRoundedRange) {
1289  using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
1290  auto Wrapper =
1291  getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
1292  KernelFunc, UserRange);
1293 
1294  using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
1295  decltype(Wrapper), NameWT>;
1296 
1297  kernel_parallel_for_wrapper<KName, TransformedArgType, decltype(Wrapper),
1298  PropertiesT>(Wrapper);
1299 #ifndef __SYCL_DEVICE_ONLY__
1300  // We are executing over the rounded range, but there are still
1301  // items/ids constructed in the range-rounded kernel that use
1302  // items/ids in the user range, which means that
1303  // __SYCL_ASSUME_INT can still be violated. So check the bounds
1304  // of the user range, instead of the rounded range.
1305  detail::checkValueRange<Dims>(UserRange);
1306  MNDRDesc.set(RoundedRange);
1307  StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
1308  std::move(Wrapper));
1309  setType(detail::CG::Kernel);
1310 #endif
1311  } else
1312 #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
1313  // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
1314  // SYCL_LANGUAGE_VERSION >= 202001
1315  {
1316  (void)UserRange;
1317  (void)Props;
1318  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1319  PropertiesT>(KernelFunc);
1320 #ifndef __SYCL_DEVICE_ONLY__
1321  processProperties<NameT, PropertiesT>(Props);
1322  detail::checkValueRange<Dims>(UserRange);
1323  MNDRDesc.set(std::move(UserRange));
1324  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1325  std::move(KernelFunc));
1326  setType(detail::CG::Kernel);
1327 #endif
1328  }
1329  }
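  // Usage sketch (added illustration, assumes a queue Q and an accessor Acc):
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class Scale>(sycl::range<1>{N},
  //                                   [=](sycl::item<1> It) {
  //       Acc[It] *= 2;
  //     });
  //   });
  // The lambda argument type is deduced via lambda_arg_type, transformed by
  // TransformUserItemType, and possibly wrapped by the range-rounding path
  // above.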
1330 
1344  template <typename KernelName, typename KernelType, int Dims,
1345  typename PropertiesT>
1346  void parallel_for_impl(nd_range<Dims> ExecutionRange, PropertiesT Props,
1347  _KERNELFUNCPARAM(KernelFunc)) {
1348  throwIfActionIsCreated();
1349  // TODO: Properties may change the kernel function, so in order to avoid
1350  // conflicts they should be included in the name.
1351  using NameT =
1352  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1353  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1354  using LambdaArgType =
1355  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1356 #if defined(SYCL2020_CONFORMANT_APIS) || \
1357  defined(__INTEL_PREVIEW_BREAKING_CHANGES)
1358  static_assert(
1359  std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
1360  "Kernel argument of a sycl::parallel_for with sycl::nd_range "
1361  "must be either sycl::nd_item or be convertible from sycl::nd_item");
1362  using TransformedArgType = sycl::nd_item<Dims>;
1363 #else
1364  // If user type is convertible from sycl::item/sycl::nd_item, use
1365  // sycl::item/sycl::nd_item to transport item information
1366  using TransformedArgType =
1367  typename TransformUserItemType<Dims, LambdaArgType>::type;
1368 #endif
1369 
1370  (void)ExecutionRange;
1371  (void)Props;
1372  kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
1373  PropertiesT>(KernelFunc);
1374 #ifndef __SYCL_DEVICE_ONLY__
1375  processProperties<NameT, PropertiesT>(Props);
1376  detail::checkValueRange<Dims>(ExecutionRange);
1377  MNDRDesc.set(std::move(ExecutionRange));
1378  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1379  std::move(KernelFunc));
1380  setType(detail::CG::Kernel);
1381 #endif
1382  }
1383 
1391  template <int Dims>
1392  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
1393  throwIfActionIsCreated();
1394  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1395  detail::checkValueRange<Dims>(NumWorkItems);
1396  MNDRDesc.set(std::move(NumWorkItems));
1397  setType(detail::CG::Kernel);
1398  extractArgsAndReqs();
1399  MKernelName = getKernelName();
1400  }
1401 
1412  template <
1413  typename KernelName, typename KernelType, int Dims,
1414  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1415  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1416  PropertiesT Props,
1417  _KERNELFUNCPARAM(KernelFunc)) {
1418  throwIfActionIsCreated();
1419  // TODO: Properties may change the kernel function, so in order to avoid
1420  // conflicts they should be included in the name.
1421  using NameT =
1422  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1423  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1424  using LambdaArgType =
1425  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1426  (void)NumWorkGroups;
1427  (void)Props;
1428  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1429  PropertiesT>(KernelFunc);
1430 #ifndef __SYCL_DEVICE_ONLY__
1431  processProperties<NameT, PropertiesT>(Props);
1432  detail::checkValueRange<Dims>(NumWorkGroups);
1433  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1434  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1435  setType(detail::CG::Kernel);
1436 #endif // __SYCL_DEVICE_ONLY__
1437  }
1438 
1451  template <
1452  typename KernelName, typename KernelType, int Dims,
1453  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1454  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
1455  range<Dims> WorkGroupSize,
1456  PropertiesT Props,
1457  _KERNELFUNCPARAM(KernelFunc)) {
1458  throwIfActionIsCreated();
1459  // TODO: Properties may change the kernel function, so in order to avoid
1460  // conflicts they should be included in the name.
1461  using NameT =
1462  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1463  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1464  using LambdaArgType =
1465  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
1466  (void)NumWorkGroups;
1467  (void)WorkGroupSize;
1468  (void)Props;
1469  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
1470  PropertiesT>(KernelFunc);
1471 #ifndef __SYCL_DEVICE_ONLY__
1472  processProperties<NameT, PropertiesT>(Props);
1473  nd_range<Dims> ExecRange =
1474  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
1475  detail::checkValueRange<Dims>(ExecRange);
1476  MNDRDesc.set(std::move(ExecRange));
1477  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
1478  setType(detail::CG::Kernel);
1479 #endif // __SYCL_DEVICE_ONLY__
1480  }
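  // Usage sketch (added illustration): hierarchical invocation with an
  // explicit work-group size; ExecRange above becomes
  // {NumWorkGroups * WorkGroupSize, WorkGroupSize}.
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group<class Hier>(
  //         sycl::range<1>{8}, sycl::range<1>{64}, [=](sycl::group<1> G) {
  //           G.parallel_for_work_item([&](sycl::h_item<1> HIt) { /* ... */ });
  //         });
  //   });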
1481 
1482 #ifdef SYCL_LANGUAGE_VERSION
1483 #define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
1484 #else
1485 #define __SYCL_KERNEL_ATTR__
1486 #endif
1487 
1488  // NOTE: the name of this function - "kernel_single_task" - is used by the
1489  // Front End to determine kernel invocation kind.
1490  template <typename KernelName, typename KernelType, typename... Props>
1491 #ifdef __SYCL_DEVICE_ONLY__
1492  [[__sycl_detail__::add_ir_attributes_function(
1493  "sycl-single-task",
1494  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1495  nullptr,
1496  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1497 #endif
1498  __SYCL_KERNEL_ATTR__ void
1499  kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
1500 #ifdef __SYCL_DEVICE_ONLY__
1501  KernelFunc();
1502 #else
1503  (void)KernelFunc;
1504 #endif
1505  }
1506 
1507  // NOTE: the name of this function - "kernel_single_task" - is used by the
1508  // Front End to determine kernel invocation kind.
1509  template <typename KernelName, typename KernelType, typename... Props>
1510 #ifdef __SYCL_DEVICE_ONLY__
1511  [[__sycl_detail__::add_ir_attributes_function(
1512  "sycl-single-task",
1513  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1514  nullptr,
1515  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1516 #endif
1517  __SYCL_KERNEL_ATTR__ void
1518  kernel_single_task(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
1519 #ifdef __SYCL_DEVICE_ONLY__
1520  KernelFunc(KH);
1521 #else
1522  (void)KernelFunc;
1523  (void)KH;
1524 #endif
1525  }
1526 
1527  // NOTE: the name of these functions - "kernel_parallel_for" - is used by the
1528  // Front End to determine kernel invocation kind.
1529  template <typename KernelName, typename ElementType, typename KernelType,
1530  typename... Props>
1531 #ifdef __SYCL_DEVICE_ONLY__
1532  [[__sycl_detail__::add_ir_attributes_function(
1533  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1534  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1535 #endif
1536  __SYCL_KERNEL_ATTR__ void
1537  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
1538 #ifdef __SYCL_DEVICE_ONLY__
1539  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1540 #else
1541  (void)KernelFunc;
1542 #endif
1543  }
1544 
1545  // NOTE: the name of these functions - "kernel_parallel_for" - is used by the
1546  // Front End to determine kernel invocation kind.
1547  template <typename KernelName, typename ElementType, typename KernelType,
1548  typename... Props>
1549 #ifdef __SYCL_DEVICE_ONLY__
1550  [[__sycl_detail__::add_ir_attributes_function(
1551  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1552  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1553 #endif
1554  __SYCL_KERNEL_ATTR__ void
1555  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
1556 #ifdef __SYCL_DEVICE_ONLY__
1557  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1558 #else
1559  (void)KernelFunc;
1560  (void)KH;
1561 #endif
1562  }
1563 
1564  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1565  // used by the Front End to determine kernel invocation kind.
1566  template <typename KernelName, typename ElementType, typename KernelType,
1567  typename... Props>
1568 #ifdef __SYCL_DEVICE_ONLY__
1569  [[__sycl_detail__::add_ir_attributes_function(
1570  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1571  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1572 #endif
1573  __SYCL_KERNEL_ATTR__ void
1574  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc)) {
1575 #ifdef __SYCL_DEVICE_ONLY__
1576  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
1577 #else
1578  (void)KernelFunc;
1579 #endif
1580  }
1581 
1582  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
1583  // used by the Front End to determine kernel invocation kind.
1584  template <typename KernelName, typename ElementType, typename KernelType,
1585  typename... Props>
1586 #ifdef __SYCL_DEVICE_ONLY__
1587  [[__sycl_detail__::add_ir_attributes_function(
1588  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
1589  ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
1590 #endif
1591  __SYCL_KERNEL_ATTR__ void
1592  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc),
1593  kernel_handler KH) {
1594 #ifdef __SYCL_DEVICE_ONLY__
1595  KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
1596 #else
1597  (void)KernelFunc;
1598  (void)KH;
1599 #endif
1600  }
1601 
1602  template <typename... Props> struct KernelPropertiesUnpackerImpl {
1603  // Just pass extra Props... as template parameters to the underlying
1604  // Caller->* member functions. We don't have reflection, so we try to use
1605  // templates as much as possible to reduce the amount of boilerplate code
1606  // needed. All the type checks are expected to be done at the Caller's
1607  // methods side.
1608 
1609  template <typename... TypesToForward, typename... ArgsTy>
1610  static void kernel_single_task_unpack(handler *h, ArgsTy... Args) {
1611  h->kernel_single_task<TypesToForward..., Props...>(Args...);
1612  }
1613 
1614  template <typename... TypesToForward, typename... ArgsTy>
1615  static void kernel_parallel_for_unpack(handler *h, ArgsTy... Args) {
1616  h->kernel_parallel_for<TypesToForward..., Props...>(Args...);
1617  }
1618 
1619  template <typename... TypesToForward, typename... ArgsTy>
1620  static void kernel_parallel_for_work_group_unpack(handler *h,
1621  ArgsTy... Args) {
1622  h->kernel_parallel_for_work_group<TypesToForward..., Props...>(Args...);
1623  }
1624  };
1625 
1626  template <typename PropertiesT>
1627  struct KernelPropertiesUnpacker : public KernelPropertiesUnpackerImpl<> {
1628  // This should always fail outside the specialization below but must be
1629  // dependent to avoid failing even if not instantiated.
1630  static_assert(
1631  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
1632  "Template type is not a property list.");
1633  };
1634 
1635  template <typename... Props>
1636  struct KernelPropertiesUnpacker<
1637  ext::oneapi::experimental::detail::properties_t<Props...>>
1638  : public KernelPropertiesUnpackerImpl<Props...> {};
1639 
1640  // Helper function to
1641  //
1642  // * Make use of the KernelPropertiesUnpacker above
1643  // * Decide if we need an extra kernel_handler parameter
1644  //
1645  // The interface uses a \p Lambda callback to propagate that information back
1646  // to the caller as we need the caller to communicate:
1647  //
1648  // * Name of the method to call
1649  // * Provide explicit template type parameters for the call
1650  //
1651  // Couldn't think of a better way to achieve both.
1652  template <typename KernelName, typename KernelType, typename PropertiesT,
1653  bool HasKernelHandlerArg, typename FuncTy>
1654  void unpack(_KERNELFUNCPARAM(KernelFunc), FuncTy Lambda) {
1655 #ifdef __SYCL_DEVICE_ONLY__
1656  detail::CheckDeviceCopyable<KernelType>();
1657 #endif // __SYCL_DEVICE_ONLY__
1658  using MergedPropertiesT =
1659  typename detail::GetMergedKernelProperties<KernelType,
1660  PropertiesT>::type;
1661  using Unpacker = KernelPropertiesUnpacker<MergedPropertiesT>;
1662 #ifndef __SYCL_DEVICE_ONLY__
1663  // If there are properties provided by get method then process them.
1664  if constexpr (ext::oneapi::experimental::detail::
1665  HasKernelPropertiesGetMethod<
1666  _KERNELFUNCPARAMTYPE>::value) {
1667  processProperties<KernelName>(
1668  KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
1669  }
1670 #endif
1671  if constexpr (HasKernelHandlerArg) {
1672  kernel_handler KH;
1673  Lambda(Unpacker{}, this, KernelFunc, KH);
1674  } else {
1675  Lambda(Unpacker{}, this, KernelFunc);
1676  }
1677  }
1678 
1679  // NOTE: to support kernel_handler argument in kernel lambdas, only
1680  // kernel_***_wrapper functions must be called in this code
1681 
1682  template <
1683  typename KernelName, typename KernelType,
1684  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1685  void kernel_single_task_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1686  unpack<KernelName, KernelType, PropertiesT,
1687  detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>(
1688  KernelFunc, [&](auto Unpacker, auto... args) {
1689  Unpacker.template kernel_single_task_unpack<KernelName, KernelType>(
1690  args...);
1691  });
1692  }
1693 
1694  template <
1695  typename KernelName, typename ElementType, typename KernelType,
1696  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1697  void kernel_parallel_for_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1698  unpack<KernelName, KernelType, PropertiesT,
1699  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1700  ElementType>::value>(
1701  KernelFunc, [&](auto Unpacker, auto... args) {
1702  Unpacker.template kernel_parallel_for_unpack<KernelName, ElementType,
1703  KernelType>(args...);
1704  });
1705  }
1706 
1707  template <
1708  typename KernelName, typename ElementType, typename KernelType,
1709  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1710  void kernel_parallel_for_work_group_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1711  unpack<KernelName, KernelType, PropertiesT,
1712  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1713  ElementType>::value>(
1714  KernelFunc, [&](auto Unpacker, auto... args) {
1715  Unpacker.template kernel_parallel_for_work_group_unpack<
1716  KernelName, ElementType, KernelType>(args...);
1717  });
1718  }
1719 
1727  template <
1728  typename KernelName, typename KernelType,
1729  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1730  void single_task_lambda_impl(PropertiesT Props,
1731  _KERNELFUNCPARAM(KernelFunc)) {
1732  (void)Props;
1733  throwIfActionIsCreated();
1734  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1735  // TODO: Properties may change the kernel function, so in order to avoid
1736  // conflicts they should be included in the name.
1737  using NameT =
1738  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1739  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1740  kernel_single_task_wrapper<NameT, KernelType, PropertiesT>(KernelFunc);
1741 #ifndef __SYCL_DEVICE_ONLY__
1742  // No need to check if the range is out of INT_MAX limits as it's a
1743  // compile-time known constant.
1744  MNDRDesc.set(range<1>{1});
1745  processProperties<NameT, PropertiesT>(Props);
1746  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1747  setType(detail::CG::Kernel);
1748 #endif
1749  }
1750 
1751  void setStateExplicitKernelBundle();
1752  void setStateSpecConstSet();
1753  bool isStateExplicitKernelBundle() const;
1754 
1755  std::shared_ptr<detail::kernel_bundle_impl>
1756  getOrInsertHandlerKernelBundle(bool Insert) const;
1757 
1758  void setHandlerKernelBundle(kernel Kernel);
1759 
1760  void setHandlerKernelBundle(
1761  const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);
1762 
1763  template <typename FuncT>
1764  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1765  void()>::value ||
1766  detail::check_fn_signature<std::remove_reference_t<FuncT>,
1767  void(interop_handle)>::value>
1768  host_task_impl(FuncT &&Func) {
1769  throwIfActionIsCreated();
1770 
1771  MNDRDesc.set(range<1>(1));
1772  MArgs = std::move(MAssociatedAccesors);
1773 
1774  MHostTask.reset(new detail::HostTask(std::move(Func)));
1775 
1775 
1776  setType(detail::CG::CodeplayHostTask);
1777  }
1778 
1782  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
1783  getCommandGraph() const;
1784 
1791  void setUserFacingNodeType(ext::oneapi::experimental::node_type Type);
1792 
1793 public:
1794  handler(const handler &) = delete;
1795  handler(handler &&) = delete;
1796  handler &operator=(const handler &) = delete;
1797  handler &operator=(handler &&) = delete;
1798 
1799  template <auto &SpecName>
1800  void set_specialization_constant(
1801  typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {
1802 
1803  setStateSpecConstSet();
1804 
1805  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1806  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1807 
1808  detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1809  KernelBundleImplPtr)
1810  .set_specialization_constant<SpecName>(Value);
1811  }
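  // A minimal usage sketch for the specialization-constant methods above,
  // assuming a namespace-scope `constexpr sycl::specialization_id<int> IntId;`
  // and a queue Q (all names illustrative):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_specialization_constant<IntId>(42);
  //     CGH.single_task<class UseSpecConst>([=](sycl::kernel_handler KH) {
  //       int V = KH.get_specialization_constant<IntId>();
  //     });
  //   });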
1812 
1813  template <auto &SpecName>
1814  typename std::remove_reference_t<decltype(SpecName)>::value_type
1815  get_specialization_constant() const {
1816 
1817  if (isStateExplicitKernelBundle())
1818  throw sycl::exception(make_error_code(errc::invalid),
1819  "Specialization constants cannot be read after "
1820  "explicitly setting the used kernel bundle");
1821 
1822  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1823  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1824 
1825  return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1826  KernelBundleImplPtr)
1827  .get_specialization_constant<SpecName>();
1828  }
1829 
1830  void
1831  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
1832 
1841  template <typename DataT, int Dims, access::mode AccMode,
1842  access::target AccTarget, access::placeholder isPlaceholder>
1843  void require(accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder> Acc) {
1844  if (Acc.is_placeholder())
1845  associateWithHandler(&Acc, AccTarget);
1846  }
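  // A usage sketch for require(), assuming a buffer B of N ints and a queue Q
  // (illustrative names): the placeholder accessor is created outside the
  // command group and must be bound to the handler before use.
  //
  //   sycl::accessor<int, 1, sycl::access::mode::read_write,
  //                  sycl::access::target::device,
  //                  sycl::access::placeholder::true_t>
  //       PAcc(B);
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.require(PAcc);
  //     CGH.parallel_for<class UsePlaceholder>(
  //         sycl::range<1>(N), [=](sycl::id<1> I) { PAcc[I] += 1; });
  //   });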
1847 
1851  void depends_on(event Event);
1852 
1856  void depends_on(const std::vector<event> &Events);
1857 
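  // A sketch of explicit dependency chaining with depends_on(), assuming a
  // queue Q and a USM allocation Ptr of N bytes (illustrative names):
  //
  //   sycl::event E =
  //       Q.submit([&](sycl::handler &CGH) { CGH.memset(Ptr, 0, N); });
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.depends_on(E); // runs only after the memset completes
  //     CGH.parallel_for<class AfterMemset>(
  //         sycl::range<1>(N),
  //         [=](sycl::id<1> I) { static_cast<char *>(Ptr)[I] += 1; });
  //   });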
1858  template <typename T>
1859  using remove_cv_ref_t = typename std::remove_cv_t<std::remove_reference_t<T>>;
1860 
1861  template <typename U, typename T>
1862  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;
1863 
1864  template <typename T> struct ShouldEnableSetArg {
1865  static constexpr bool value =
1866  std::is_trivially_copyable_v<std::remove_reference_t<T>>
1867 #if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
1868  && std::is_standard_layout<std::remove_reference_t<T>>::value
1869 #endif
1870  || is_same_type<sampler, T>::value // Sampler
1871  || (!is_same_type<cl_mem, T>::value &&
1872  std::is_pointer_v<remove_cv_ref_t<T>>) // USM
1873  || is_same_type<cl_mem, T>::value; // Interop
1874  };
1875 
1882  template <typename T>
1883  typename std::enable_if_t<ShouldEnableSetArg<T>::value, void>
1884  set_arg(int ArgIndex, T &&Arg) {
1885  setArgHelper(ArgIndex, std::move(Arg));
1886  }
1887 
1888  template <typename DataT, int Dims, access::mode AccessMode,
1889  access::target AccessTarget, access::placeholder IsPlaceholder>
1890  void
1891  set_arg(int ArgIndex,
1892  accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
1893  setArgHelper(ArgIndex, std::move(Arg));
1894  }
1895 
1896  template <typename DataT, int Dims>
1897  void set_arg(int ArgIndex, local_accessor<DataT, Dims> Arg) {
1898  setArgHelper(ArgIndex, std::move(Arg));
1899  }
1900 
1906  template <typename... Ts> void set_args(Ts &&...Args) {
1907  setArgsHelper(0, std::move(Args)...);
1908  }
1909 
1917  template <typename KernelName = detail::auto_name, typename KernelType>
1918  void single_task(_KERNELFUNCPARAM(KernelFunc)) {
1919  single_task_lambda_impl<KernelName>(
1920  ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
1921  }
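  // A minimal submission sketch for the lambda-based single_task above
  // (assumes #include <sycl/sycl.hpp> and a queue Q):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.single_task<class Hello>([=] { /* runs exactly once */ });
  //   });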
1922 
1923  template <typename KernelName = detail::auto_name, typename KernelType>
1924  void parallel_for(range<1> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
1925  parallel_for_lambda_impl<KernelName>(
1926  NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
1927  std::move(KernelFunc));
1928  }
1929 
1930  template <typename KernelName = detail::auto_name, typename KernelType>
1931  void parallel_for(range<2> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
1932  parallel_for_lambda_impl<KernelName>(
1933  NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
1934  std::move(KernelFunc));
1935  }
1936 
1937  template <typename KernelName = detail::auto_name, typename KernelType>
1938  void parallel_for(range<3> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
1939  parallel_for_lambda_impl<KernelName>(
1940  NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
1941  std::move(KernelFunc));
1942  }
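  // A sketch for the basic-kernel parallel_for overloads above, assuming a
  // queue Q and a device USM pointer Data of at least N ints:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class Scale>(sycl::range<1>(N),
  //                                   [=](sycl::id<1> I) { Data[I] *= 2; });
  //   });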
1943 
1945  template <typename FuncT>
1946  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1947  void()>::value ||
1948  detail::check_fn_signature<std::remove_reference_t<FuncT>,
1949  void(interop_handle)>::value>
1950  host_task(FuncT &&Func) {
1951  host_task_impl(Func);
1952  }
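  // A host_task sketch; the interop_handle overload additionally exposes
  // backend-native objects to the callable (a queue Q is assumed):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.host_task([=](sycl::interop_handle IH) {
  //       // Runs on the host once all dependencies are satisfied.
  //     });
  //   });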
1953 
1967  template <typename KernelName = detail::auto_name, typename KernelType,
1968  int Dims>
1969  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
1970  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
1971  _KERNELFUNCPARAM(KernelFunc)) {
1972  throwIfActionIsCreated();
1973  using NameT =
1974  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1975  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1976  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
1977  using TransformedArgType = std::conditional_t<
1978  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
1979  typename TransformUserItemType<Dims, LambdaArgType>::type>;
1980  (void)NumWorkItems;
1981  (void)WorkItemOffset;
1982  kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
1983 #ifndef __SYCL_DEVICE_ONLY__
1984  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1985  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1986  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
1987  std::move(KernelFunc));
1988  setType(detail::CG::Kernel);
1989 #endif
1990  }
1991 
2002  template <typename KernelName = detail::auto_name, typename KernelType,
2003  int Dims>
2004  void parallel_for_work_group(range<Dims> NumWorkGroups,
2005  _KERNELFUNCPARAM(KernelFunc)) {
2006  parallel_for_work_group_lambda_impl<KernelName>(
2007  NumWorkGroups, ext::oneapi::experimental::empty_properties_t{},
2008  KernelFunc);
2009  }
2010 
2023  template <typename KernelName = detail::auto_name, typename KernelType,
2024  int Dims>
2025  void parallel_for_work_group(range<Dims> NumWorkGroups,
2026  range<Dims> WorkGroupSize,
2027  _KERNELFUNCPARAM(KernelFunc)) {
2028  parallel_for_work_group_lambda_impl<KernelName>(
2029  NumWorkGroups, WorkGroupSize,
2030  ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2031  }
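  // A hierarchical-parallelism sketch for the two overloads above, assuming a
  // queue Q and illustrative NumGroups/GroupSize values:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group<class Hier>(
  //         sycl::range<1>(NumGroups), sycl::range<1>(GroupSize),
  //         [=](sycl::group<1> G) {
  //           // Runs once per work-group.
  //           G.parallel_for_work_item([&](sycl::h_item<1> Item) {
  //             // Runs for each work-item in the group.
  //           });
  //         });
  //   });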
2032 
2039  void single_task(kernel Kernel) {
2040  throwIfActionIsCreated();
2041  // Ignore any set kernel bundles and use the one associated with the kernel
2042  setHandlerKernelBundle(Kernel);
2043  // No need to check if the range is out of INT_MAX limits as it's a
2044  // compile-time known constant.
2045  MNDRDesc.set(range<1>{1});
2046  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2047  setType(detail::CG::Kernel);
2048  extractArgsAndReqs();
2049  MKernelName = getKernelName();
2050  }
2051 
2052  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
2053  parallel_for_impl(NumWorkItems, Kernel);
2054  }
2055 
2056  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
2057  parallel_for_impl(NumWorkItems, Kernel);
2058  }
2059 
2060  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
2061  parallel_for_impl(NumWorkItems, Kernel);
2062  }
2063 
2072  template <int Dims>
2073  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2074  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2075  kernel Kernel) {
2076  throwIfActionIsCreated();
2077  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2078  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2079  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2080  setType(detail::CG::Kernel);
2081  extractArgsAndReqs();
2082  MKernelName = getKernelName();
2083  }
2084 
2093  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
2094  throwIfActionIsCreated();
2095  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2096  detail::checkValueRange<Dims>(NDRange);
2097  MNDRDesc.set(std::move(NDRange));
2098  setType(detail::CG::Kernel);
2099  extractArgsAndReqs();
2100  MKernelName = getKernelName();
2101  }
2102 
2109  template <typename KernelName = detail::auto_name, typename KernelType>
2110  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
2111  throwIfActionIsCreated();
2112  // Ignore any set kernel bundles and use the one associated with the kernel
2113  setHandlerKernelBundle(Kernel);
2114  using NameT =
2115  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2116  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2117  (void)Kernel;
2118  kernel_single_task<NameT>(KernelFunc);
2119 #ifndef __SYCL_DEVICE_ONLY__
2120  // No need to check if the range is out of INT_MAX limits as it's a
2121  // compile-time known constant.
2122  MNDRDesc.set(range<1>{1});
2123  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2124  setType(detail::CG::Kernel);
2125  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2126  extractArgsAndReqs();
2127  MKernelName = getKernelName();
2128  } else
2129  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
2130 #else
2131  detail::CheckDeviceCopyable<KernelType>();
2132 #endif
2133  }
2134 
2142  template <typename KernelName = detail::auto_name, typename KernelType,
2143  int Dims>
2144  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2145  _KERNELFUNCPARAM(KernelFunc)) {
2146  throwIfActionIsCreated();
2147  // Ignore any set kernel bundles and use the one associated with the kernel
2148  setHandlerKernelBundle(Kernel);
2149  using NameT =
2150  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2151  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2152  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2153  (void)Kernel;
2154  (void)NumWorkItems;
2155  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2156 #ifndef __SYCL_DEVICE_ONLY__
2157  detail::checkValueRange<Dims>(NumWorkItems);
2158  MNDRDesc.set(std::move(NumWorkItems));
2159  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2160  setType(detail::CG::Kernel);
2161  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2162  extractArgsAndReqs();
2163  MKernelName = getKernelName();
2164  } else
2165  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2166  std::move(KernelFunc));
2167 #endif
2168  }
2169 
2179  template <typename KernelName = detail::auto_name, typename KernelType,
2180  int Dims>
2181  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2182  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2183  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2184  throwIfActionIsCreated();
2185  // Ignore any set kernel bundles and use the one associated with the kernel
2186  setHandlerKernelBundle(Kernel);
2187  using NameT =
2188  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2189  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2190  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2191  (void)Kernel;
2192  (void)NumWorkItems;
2193  (void)WorkItemOffset;
2194  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2195 #ifndef __SYCL_DEVICE_ONLY__
2196  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2197  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2198  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2199  setType(detail::CG::Kernel);
2200  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2201  extractArgsAndReqs();
2202  MKernelName = getKernelName();
2203  } else
2204  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2205  std::move(KernelFunc));
2206 #endif
2207  }
2208 
2218  template <typename KernelName = detail::auto_name, typename KernelType,
2219  int Dims>
2220  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2221  _KERNELFUNCPARAM(KernelFunc)) {
2222  throwIfActionIsCreated();
2223  // Ignore any set kernel bundles and use the one associated with the kernel
2224  setHandlerKernelBundle(Kernel);
2225  using NameT =
2226  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2227  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2228  using LambdaArgType =
2229  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2230  (void)Kernel;
2231  (void)NDRange;
2232  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2233 #ifndef __SYCL_DEVICE_ONLY__
2234  detail::checkValueRange<Dims>(NDRange);
2235  MNDRDesc.set(std::move(NDRange));
2236  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2237  setType(detail::CG::Kernel);
2238  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2239  extractArgsAndReqs();
2240  MKernelName = getKernelName();
2241  } else
2242  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2243  std::move(KernelFunc));
2244 #endif
2245  }
2246 
2260  template <typename KernelName = detail::auto_name, typename KernelType,
2261  int Dims>
2262  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2263  _KERNELFUNCPARAM(KernelFunc)) {
2264  throwIfActionIsCreated();
2265  // Ignore any set kernel bundles and use the one associated with the kernel
2266  setHandlerKernelBundle(Kernel);
2267  using NameT =
2268  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2269  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2270  using LambdaArgType =
2271  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2272  (void)Kernel;
2273  (void)NumWorkGroups;
2274  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2275 #ifndef __SYCL_DEVICE_ONLY__
2276  detail::checkValueRange<Dims>(NumWorkGroups);
2277  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2278  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2279  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2280  setType(detail::CG::Kernel);
2281 #endif // __SYCL_DEVICE_ONLY__
2282  }
2283 
2299  template <typename KernelName = detail::auto_name, typename KernelType,
2300  int Dims>
2301  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2302  range<Dims> WorkGroupSize,
2303  _KERNELFUNCPARAM(KernelFunc)) {
2304  throwIfActionIsCreated();
2305  // Ignore any set kernel bundles and use the one associated with the kernel
2306  setHandlerKernelBundle(Kernel);
2307  using NameT =
2308  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2309  verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2310  using LambdaArgType =
2311  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2312  (void)Kernel;
2313  (void)NumWorkGroups;
2314  (void)WorkGroupSize;
2315  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2316 #ifndef __SYCL_DEVICE_ONLY__
2317  nd_range<Dims> ExecRange =
2318  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2319  detail::checkValueRange<Dims>(ExecRange);
2320  MNDRDesc.set(std::move(ExecRange));
2321  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2322  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2323  setType(detail::CG::Kernel);
2324 #endif // __SYCL_DEVICE_ONLY__
2325  }
2326 
2327  template <typename KernelName = detail::auto_name, typename KernelType,
2328  typename PropertiesT>
2329  std::enable_if_t<
2330  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2331  single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) {
2332  single_task_lambda_impl<KernelName, KernelType, PropertiesT>(Props,
2333  KernelFunc);
2334  }
2335 
2336  template <typename KernelName = detail::auto_name, typename KernelType,
2337  typename PropertiesT>
2338  std::enable_if_t<
2339  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2340  parallel_for(range<1> NumWorkItems, PropertiesT Props,
2341  _KERNELFUNCPARAM(KernelFunc)) {
2342  parallel_for_lambda_impl<KernelName, KernelType, 1, PropertiesT>(
2343  NumWorkItems, Props, std::move(KernelFunc));
2344  }
2345 
2346  template <typename KernelName = detail::auto_name, typename KernelType,
2347  typename PropertiesT>
2348  std::enable_if_t<
2349  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2350  parallel_for(range<2> NumWorkItems, PropertiesT Props,
2351  _KERNELFUNCPARAM(KernelFunc)) {
2352  parallel_for_lambda_impl<KernelName, KernelType, 2, PropertiesT>(
2353  NumWorkItems, Props, std::move(KernelFunc));
2354  }
2355 
2356  template <typename KernelName = detail::auto_name, typename KernelType,
2357  typename PropertiesT>
2358  std::enable_if_t<
2359  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2360  parallel_for(range<3> NumWorkItems, PropertiesT Props,
2361  _KERNELFUNCPARAM(KernelFunc)) {
2362  parallel_for_lambda_impl<KernelName, KernelType, 3, PropertiesT>(
2363  NumWorkItems, Props, std::move(KernelFunc));
2364  }
2365 
2366  template <typename KernelName = detail::auto_name, typename KernelType,
2367  typename PropertiesT, int Dims>
2368  std::enable_if_t<
2369  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2370  parallel_for(nd_range<Dims> Range, PropertiesT Properties,
2371  _KERNELFUNCPARAM(KernelFunc)) {
2372  parallel_for_impl<KernelName>(Range, Properties, std::move(KernelFunc));
2373  }
2374 
2376 
2377  template <typename KernelName = detail::auto_name, typename PropertiesT,
2378  typename... RestT>
2379  std::enable_if_t<
2380  (sizeof...(RestT) > 1) &&
2381  detail::AreAllButLastReductions<RestT...>::value &&
2382  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2383  parallel_for(range<1> Range, PropertiesT Properties, RestT &&...Rest) {
2384  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2385  UnsupportedGraphFeatures::sycl_reductions>();
2386  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2387  std::forward<RestT>(Rest)...);
2388  }
2389 
2390  template <typename KernelName = detail::auto_name, typename PropertiesT,
2391  typename... RestT>
2392  std::enable_if_t<
2393  (sizeof...(RestT) > 1) &&
2394  detail::AreAllButLastReductions<RestT...>::value &&
2395  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2396  parallel_for(range<2> Range, PropertiesT Properties, RestT &&...Rest) {
2397  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2398  UnsupportedGraphFeatures::sycl_reductions>();
2399  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2400  std::forward<RestT>(Rest)...);
2401  }
2402 
2403  template <typename KernelName = detail::auto_name, typename PropertiesT,
2404  typename... RestT>
2405  std::enable_if_t<
2406  (sizeof...(RestT) > 1) &&
2407  detail::AreAllButLastReductions<RestT...>::value &&
2408  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2409  parallel_for(range<3> Range, PropertiesT Properties, RestT &&...Rest) {
2410  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2411  UnsupportedGraphFeatures::sycl_reductions>();
2412  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2413  std::forward<RestT>(Rest)...);
2414  }
2415 
2416  template <typename KernelName = detail::auto_name, typename... RestT>
2417  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2418  parallel_for(range<1> Range, RestT &&...Rest) {
2419  parallel_for<KernelName>(Range,
2420  ext::oneapi::experimental::empty_properties_t{},
2421  std::forward<RestT>(Rest)...);
2422  }
2423 
2424  template <typename KernelName = detail::auto_name, typename... RestT>
2425  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2426  parallel_for(range<2> Range, RestT &&...Rest) {
2427  parallel_for<KernelName>(Range,
2428  ext::oneapi::experimental::empty_properties_t{},
2429  std::forward<RestT>(Rest)...);
2430  }
2431 
2432  template <typename KernelName = detail::auto_name, typename... RestT>
2433  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2434  parallel_for(range<3> Range, RestT &&...Rest) {
2435  parallel_for<KernelName>(Range,
2436  ext::oneapi::experimental::empty_properties_t{},
2437  std::forward<RestT>(Rest)...);
2438  }
2439 
2440  template <typename KernelName = detail::auto_name, int Dims,
2441  typename PropertiesT, typename... RestT>
2442  std::enable_if_t<
2443  (sizeof...(RestT) > 1) &&
2444  detail::AreAllButLastReductions<RestT...>::value &&
2445  ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2446  parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
2447  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2448  UnsupportedGraphFeatures::sycl_reductions>();
2449  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2450  std::forward<RestT>(Rest)...);
2451  }
2452 
2453  template <typename KernelName = detail::auto_name, int Dims,
2454  typename... RestT>
2455  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2456  parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
2457  parallel_for<KernelName>(Range,
2458  ext::oneapi::experimental::empty_properties_t{},
2459  std::forward<RestT>(Rest)...);
2460  }
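  // A reduction sketch for the parallel_for overloads above, assuming a queue
  // Q and a USM pointer Sum initialized to 0 (illustrative names):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Red = sycl::reduction(Sum, sycl::plus<int>());
  //     CGH.parallel_for<class CountOnes>(
  //         sycl::range<1>(N), Red, [=](sycl::id<1> I, auto &S) { S += 1; });
  //   });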
2461 
2463 
2464  template <typename KernelName = detail::auto_name, typename KernelType,
2465  int Dims, typename PropertiesT>
2466  void parallel_for_work_group(range<Dims> NumWorkGroups, PropertiesT Props,
2467  _KERNELFUNCPARAM(KernelFunc)) {
2468  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2469  PropertiesT>(NumWorkGroups, Props,
2470  KernelFunc);
2471  }
2472 
2473  template <typename KernelName = detail::auto_name, typename KernelType,
2474  int Dims, typename PropertiesT>
2475  void parallel_for_work_group(range<Dims> NumWorkGroups,
2476  range<Dims> WorkGroupSize, PropertiesT Props,
2477  _KERNELFUNCPARAM(KernelFunc)) {
2478  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2479  PropertiesT>(
2480  NumWorkGroups, WorkGroupSize, Props, KernelFunc);
2481  }
2482 
2483  // Clean up KERNELFUNC macro.
2484 #undef _KERNELFUNCPARAM
2485 
2486  // Explicit copy operations API
2487 
2495  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2496  access::target AccessTarget,
2497  access::placeholder IsPlaceholder = access::placeholder::false_t>
2498  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2499  std::shared_ptr<T_Dst> Dst) {
2500  if (Src.is_placeholder())
2501  checkIfPlaceholderIsBoundToHandler(Src);
2502 
2503  throwIfActionIsCreated();
2504  static_assert(isValidTargetForExplicitOp(AccessTarget),
2505  "Invalid accessor target for the copy method.");
2506  static_assert(isValidModeForSourceAccessor(AccessMode),
2507  "Invalid accessor mode for the copy method.");
2508  // Make sure the data the shared_ptr points to is not released until we
2509  // finish working with it.
2510  CGData.MSharedPtrStorage.push_back(Dst);
2511  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2512  copy(Src, RawDstPtr);
2513  }
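  // An explicit-copy sketch for the accessor/shared_ptr overloads, assuming a
  // sycl::buffer<int, 1> B, a std::shared_ptr<int> Host owning at least as
  // many elements, and a queue Q (illustrative names):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     sycl::accessor Acc(B, CGH, sycl::read_only);
  //     CGH.copy(Acc, Host); // device (buffer) -> host memory
  //   });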
2514 
2522  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2523  access::target AccessTarget,
2524  access::placeholder IsPlaceholder = access::placeholder::false_t>
2525  void
2526  copy(std::shared_ptr<T_Src> Src,
2527  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2528  if (Dst.is_placeholder())
2529  checkIfPlaceholderIsBoundToHandler(Dst);
2530 
2531  throwIfActionIsCreated();
2532  static_assert(isValidTargetForExplicitOp(AccessTarget),
2533  "Invalid accessor target for the copy method.");
2534  static_assert(isValidModeForDestinationAccessor(AccessMode),
2535  "Invalid accessor mode for the copy method.");
2536  // TODO: Add static_assert with is_device_copyable when vec is
2537  // device-copyable.
2538  // Make sure the data the shared_ptr points to is not released until we
2539  // finish working with it.
2540  CGData.MSharedPtrStorage.push_back(Src);
2541  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2542  copy(RawSrcPtr, Dst);
2543  }
2544 
2552  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2553  access::target AccessTarget,
2554  access::placeholder IsPlaceholder = access::placeholder::false_t>
2555  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2556  T_Dst *Dst) {
2557  if (Src.is_placeholder())
2558  checkIfPlaceholderIsBoundToHandler(Src);
2559 
2560  throwIfActionIsCreated();
2561  static_assert(isValidTargetForExplicitOp(AccessTarget),
2562  "Invalid accessor target for the copy method.");
2563  static_assert(isValidModeForSourceAccessor(AccessMode),
2564  "Invalid accessor mode for the copy method.");
2565 #ifndef __SYCL_DEVICE_ONLY__
2566  if (MIsHost) {
2567  // TODO: Temporary implementation for host. Should be handled by memory
2568  // manager.
2569  copyAccToPtrHost(Src, Dst);
2570  return;
2571  }
2572 #endif
2573  setType(detail::CG::CopyAccToPtr);
2574 
2575  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2576  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2577 
2578  CGData.MRequirements.push_back(AccImpl.get());
2579  MSrcPtr = static_cast<void *>(AccImpl.get());
2580  MDstPtr = static_cast<void *>(Dst);
2581  // Store a copy of the accessor in local storage to make sure it stays
2582  // alive until we finish.
2583  CGData.MAccStorage.push_back(std::move(AccImpl));
2584  }
2585 
2593  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2594  access::target AccessTarget,
2595  access::placeholder IsPlaceholder = access::placeholder::false_t>
2596  void
2597  copy(const T_Src *Src,
2598  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2599  if (Dst.is_placeholder())
2600  checkIfPlaceholderIsBoundToHandler(Dst);
2601 
2602  throwIfActionIsCreated();
2603  static_assert(isValidTargetForExplicitOp(AccessTarget),
2604  "Invalid accessor target for the copy method.");
2605  static_assert(isValidModeForDestinationAccessor(AccessMode),
2606  "Invalid accessor mode for the copy method.");
2607  // TODO: Add static_assert with is_device_copyable when vec is
2608  // device-copyable.
2609 #ifndef __SYCL_DEVICE_ONLY__
2610  if (MIsHost) {
2611  // TODO: Temporary implementation for host. Should be handled by memory
2612  // manager.
2613  copyPtrToAccHost(Src, Dst);
2614  return;
2615  }
2616 #endif
2617  setType(detail::CG::CopyPtrToAcc);
2618 
2619  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2620  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2621 
2622  CGData.MRequirements.push_back(AccImpl.get());
2623  MSrcPtr = const_cast<T_Src *>(Src);
2624  MDstPtr = static_cast<void *>(AccImpl.get());
2625  // Store a copy of the accessor in local storage to make sure it stays
2626  // alive until we finish.
2627  CGData.MAccStorage.push_back(std::move(AccImpl));
2628  }
2629 
2637  template <
2638  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2639  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2640  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2641  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2642  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2643  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2644  IsPlaceholder_Src>
2645  Src,
2646  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2647  IsPlaceholder_Dst>
2648  Dst) {
2649  if (Src.is_placeholder())
2650  checkIfPlaceholderIsBoundToHandler(Src);
2651  if (Dst.is_placeholder())
2652  checkIfPlaceholderIsBoundToHandler(Dst);
2653 
2654  throwIfActionIsCreated();
2655  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2656  "Invalid source accessor target for the copy method.");
2657  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2658  "Invalid destination accessor target for the copy method.");
2659  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2660  "Invalid source accessor mode for the copy method.");
2661  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2662  "Invalid destination accessor mode for the copy method.");
2663  if (Dst.get_size() < Src.get_size())
2664  throw sycl::invalid_object_error(
2665  "The destination accessor size is too small to copy the memory into.",
2666  PI_ERROR_INVALID_OPERATION);
2667 
2668  if (copyAccToAccHelper(Src, Dst))
2669  return;
2670  setType(detail::CG::CopyAccToAcc);
2671 
2672  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2673  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2674 
2675  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2676  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2677 
2678  CGData.MRequirements.push_back(AccImplSrc.get());
2679  CGData.MRequirements.push_back(AccImplDst.get());
2680  MSrcPtr = AccImplSrc.get();
2681  MDstPtr = AccImplDst.get();
2682  // Store a copy of each accessor in local storage to make sure they stay
2683  // alive until we finish.
2684  CGData.MAccStorage.push_back(std::move(AccImplSrc));
2685  CGData.MAccStorage.push_back(std::move(AccImplDst));
2686  }
2687 
2692  template <typename T, int Dims, access::mode AccessMode,
2693  access::target AccessTarget,
2694  access::placeholder IsPlaceholder = access::placeholder::false_t>
2695  void
2696  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2697  if (Acc.is_placeholder())
2698  checkIfPlaceholderIsBoundToHandler(Acc);
2699 
2700  throwIfActionIsCreated();
2701  static_assert(isValidTargetForExplicitOp(AccessTarget),
2702  "Invalid accessor target for the update_host method.");
2703  setType(detail::CG::UpdateHost);
2704 
2705  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2706  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2707 
2708  MDstPtr = static_cast<void *>(AccImpl.get());
2709  CGData.MRequirements.push_back(AccImpl.get());
2710  CGData.MAccStorage.push_back(std::move(AccImpl));
2711  }
2712 
2713 public:
2722  template <typename T, int Dims, access::mode AccessMode,
2723  access::target AccessTarget,
2724  access::placeholder IsPlaceholder = access::placeholder::false_t,
2725  typename PropertyListT = property_list>
2726  void
2727  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2728  Dst,
2729  const T &Pattern) {
2730  assert(!MIsHost && "fill() should no longer be callable on a host device.");
2731 
2732  if (Dst.is_placeholder())
2733  checkIfPlaceholderIsBoundToHandler(Dst);
2734 
2735  throwIfActionIsCreated();
2736  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2737  // TODO: add a check that T must be an integral scalar value or a SYCL vector type.
2738  static_assert(isValidTargetForExplicitOp(AccessTarget),
2739  "Invalid accessor target for the fill method.");
2740  // CG::Fill will result in piEnqueueFillBuffer/Image, which requires that the
2741  // memory data is contiguous. Thus we check range and offset when Dims > 1.
2742  // Images don't allow ranged accessors and are fine.
2743  if constexpr (isBackendSupportedFillSize(sizeof(T)) &&
2744  ((Dims <= 1) || isImageOrImageArray(AccessTarget))) {
2745  StageFillCG(Dst, Pattern);
2746  } else if constexpr (Dims == 0) {
2747  // Special case for zero-dim accessors.
2748  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2749  range<1>(1), [=](id<1>) { Dst = Pattern; });
2750  } else {
2751  // Dim > 1
2752  bool OffsetUsable = (Dst.get_offset() == sycl::id<Dims>{});
2753  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2754  bool RangesUsable =
2755  (AccBase->getAccessRange() == AccBase->getMemoryRange());
2756  if (OffsetUsable && RangesUsable &&
2757  isBackendSupportedFillSize(sizeof(T))) {
2758  StageFillCG(Dst, Pattern);
2759  } else {
2760  range<Dims> Range = Dst.get_range();
2761  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2762  Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
2763  }
2764  }
2765  }
2766 
2773  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2774  throwIfActionIsCreated();
2775  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2776  static_assert(is_device_copyable<T>::value,
2777  "Pattern must be device copyable");
2778  parallel_for<__usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2779  T *CastedPtr = static_cast<T *>(Ptr);
2780  CastedPtr[Index] = Pattern;
2781  });
2782  }
2783 
2787  void ext_oneapi_barrier() {
2788  throwIfActionIsCreated();
2789  setType(detail::CG::Barrier);
2790  }
2791 
2798  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2799 
2810  void memcpy(void *Dest, const void *Src, size_t Count);
2811 
2822  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2823  this->memcpy(Dest, Src, Count * sizeof(T));
2824  }
2825 
2833  void memset(void *Dest, int Value, size_t Count);
2834 
2841  void prefetch(const void *Ptr, size_t Count);
2842 
2849  void mem_advise(const void *Ptr, size_t Length, int Advice);
2850 
2867  template <typename T = unsigned char,
2868  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2869  void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src,
2870  size_t SrcPitch, size_t Width, size_t Height) {
2871  throwIfGraphAssociated<
2872  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
2873  sycl_ext_oneapi_memcpy2d>();
2874  throwIfActionIsCreated();
2875  if (Width > DestPitch)
2876  throw sycl::exception(sycl::make_error_code(errc::invalid),
2877  "Destination pitch must be greater than or equal "
2878  "to the width specified in 'ext_oneapi_memcpy2d'");
2879  if (Width > SrcPitch)
2880  throw sycl::exception(sycl::make_error_code(errc::invalid),
2881  "Source pitch must be greater than or equal "
2882  "to the width specified in 'ext_oneapi_memcpy2d'");
2883 
2884  // Get the type of the pointers.
2885  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2886  usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
2887  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2888  bool SrcIsHost =
2889  SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
2890  bool DestIsHost = DestAllocType == usm::alloc::unknown ||
2891  DestAllocType == usm::alloc::host;
2892 
2893  // Do the following:
2894  // 1. If both are host, use a host_task to copy.
2895  // 2. If either pointer is host or the backend supports native memcpy2d, use
2896  // the special command.
2897  // 3. Otherwise, launch a kernel for copying.
2898  if (SrcIsHost && DestIsHost) {
2899  commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
2900  Height);
2901  } else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
2902  ext_oneapi_memcpy2d_impl(Dest, DestPitch, Src, SrcPitch, Width, Height);
2903  } else {
2904  commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
2905  Height);
2906  }
2907  }
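  // A 2D-copy sketch for ext_oneapi_memcpy2d above; for this byte-wise
  // overload DestPitch, SrcPitch and Width are all in bytes (a queue Q and
  // pitched USM pointers are assumed, names illustrative):
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.ext_oneapi_memcpy2d(Dst, DstPitchBytes, Src, SrcPitchBytes,
  //                             WidthBytes, Height);
  //   });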
2908 
2922  template <typename T>
2923  void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest,
2924  size_t DestPitch, size_t Width, size_t Height) {
2925  if (Width > DestPitch)
2926  throw sycl::exception(sycl::make_error_code(errc::invalid),
2927  "Destination pitch must be greater than or equal "
2928  "to the width specified in 'ext_oneapi_copy2d'");
2929  if (Width > SrcPitch)
2930  throw sycl::exception(sycl::make_error_code(errc::invalid),
2931  "Source pitch must be greater than or equal "
2932  "to the width specified in 'ext_oneapi_copy2d'");
2933 
2934  // Get the type of the pointers.
2935  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2936  usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
2937  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2938  bool SrcIsHost =
2939  SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
2940  bool DestIsHost = DestAllocType == usm::alloc::unknown ||
2941  DestAllocType == usm::alloc::host;
2942 
2943  // Do the following:
2944  // 1. If both are host, use a host_task to copy.
2945  // 2. If either pointer is host or the backend supports native memcpy2d,
2946  // use the special command.
2947  // 3. Otherwise, launch a kernel for copying.
2948  if (SrcIsHost && DestIsHost) {
2949  commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
2950  Height);
2951  } else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
2952  ext_oneapi_memcpy2d_impl(Dest, DestPitch * sizeof(T), Src,
2953  SrcPitch * sizeof(T), Width * sizeof(T), Height);
2954  } else {
2955  commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
2956  Height);
2957  }
2958  }
2959 
2975  template <typename T = unsigned char,
2976  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2977  void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value,
2978  size_t Width, size_t Height) {
2979  throwIfActionIsCreated();
2980  if (Width > DestPitch)
2981  throw sycl::exception(sycl::make_error_code(errc::invalid),
2982  "Destination pitch must be greater than or equal "
2983  "to the width specified in 'ext_oneapi_memset2d'");
2984  T CharVal = static_cast<T>(Value);
2985 
2986  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2987  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2988 
2989  // If the backend supports native 2D memset we use that; otherwise we use a
2990  // fallback kernel. If the target is on host we always do the operation on host.
2991  if (DestAllocType == usm::alloc::unknown ||
2992  DestAllocType == usm::alloc::host)
2993  commonUSMFill2DFallbackHostTask(Dest, DestPitch, CharVal, Width, Height);
2994  else if (supportsUSMMemset2D())
2995  ext_oneapi_memset2d_impl(Dest, DestPitch, Value, Width, Height);
2996  else
2997  commonUSMFill2DFallbackKernel(Dest, DestPitch, CharVal, Width, Height);
2998  }
2999 
3012  template <typename T>
3013  void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern,
3014  size_t Width, size_t Height) {
3015  throwIfActionIsCreated();
3016  static_assert(is_device_copyable<T>::value,
3017  "Pattern must be device copyable");
3018  if (Width > DestPitch)
3019  throw sycl::exception(sycl::make_error_code(errc::invalid),
3020  "Destination pitch must be greater than or equal "
3021  "to the width specified in 'ext_oneapi_fill2d'");
3022 
3023  context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
3024  usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
3025 
3026  // If the backend supports native 2D fill we use that; otherwise we use a
3027  // fallback kernel. If the target is on host we always do the operation on host.
3028  if (DestAllocType == usm::alloc::unknown ||
3029  DestAllocType == usm::alloc::host)
3030  commonUSMFill2DFallbackHostTask(Dest, DestPitch, Pattern, Width, Height);
3031  else if (supportsUSMFill2D())
3032  ext_oneapi_fill2d_impl(Dest, DestPitch, &Pattern, sizeof(T), Width,
3033  Height);
3034  else
3035  commonUSMFill2DFallbackKernel(Dest, DestPitch, Pattern, Width, Height);
3036  }
3037 
3046  template <typename T, typename PropertyListT>
3047  void memcpy(ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3048  const void *Src, size_t NumBytes = sizeof(T),
3049  size_t DestOffset = 0) {
3050  throwIfGraphAssociated<
3051  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3052  sycl_ext_oneapi_device_global>();
3053  if (sizeof(T) < DestOffset + NumBytes)
3054  throw sycl::exception(make_error_code(errc::invalid),
3055  "Copy to device_global is out of bounds.");
3056 
3057  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3058  ext::oneapi::experimental::device_image_scope_key>();
3059 
3060  if (!detail::isDeviceGlobalUsedInKernel(&Dest)) {
3061  // If the corresponding device_global isn't used in any kernels, we fall
3062  // back to doing the memory operation on the host only.
3063  memcpyToHostOnlyDeviceGlobal(&Dest, Src, sizeof(T), IsDeviceImageScoped,
3064  NumBytes, DestOffset);
3065  return;
3066  }
3067 
3068  memcpyToDeviceGlobal(&Dest, Src, IsDeviceImageScoped, NumBytes, DestOffset);
3069  }
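  // A device_global sketch for the memcpy overload above, assuming a
  // namespace-scope `sycl::ext::oneapi::experimental::device_global<int> DG;`
  // and a queue Q:
  //
  //   int V = 42;
  //   Q.submit([&](sycl::handler &CGH) { CGH.memcpy(DG, &V); });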
3070 
3079  template <typename T, typename PropertyListT>
3080  void
3081  memcpy(void *Dest,
3082  const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3083  size_t NumBytes = sizeof(T), size_t SrcOffset = 0) {
3084  throwIfGraphAssociated<
3085  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3086  sycl_ext_oneapi_device_global>();
3087  if (sizeof(T) < SrcOffset + NumBytes)
3088  throw sycl::exception(make_error_code(errc::invalid),
3089  "Copy from device_global is out of bounds.");
3090 
3091  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3092  ext::oneapi::experimental::device_image_scope_key>();
3093 
3094  if (!detail::isDeviceGlobalUsedInKernel(&Src)) {
3095  // If the corresponding device_global isn't used in any kernels, we fall
3096  // back to doing the memory operation on the host only.
3097  memcpyFromHostOnlyDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3098  SrcOffset);
3099  return;
3100  }
3101 
3102  memcpyFromDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3103  SrcOffset);
3104  }
3105 
3115  template <typename T, typename PropertyListT>
3116  void copy(const std::remove_all_extents_t<T> *Src,
3117  ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3118  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3119  size_t StartIndex = 0) {
3120  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3121  StartIndex * sizeof(std::remove_all_extents_t<T>));
3122  }
3123 
3134  template <typename T, typename PropertyListT>
3135  void
3136  copy(const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3137  std::remove_all_extents_t<T> *Dest,
3138  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3139  size_t StartIndex = 0) {
3140  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3141  StartIndex * sizeof(std::remove_all_extents_t<T>));
3142  }
3146  void ext_oneapi_graph(ext::oneapi::experimental::command_graph<
3147  ext::oneapi::experimental::graph_state::executable>
3148  Graph);
3149 
3158  void ext_oneapi_copy(
3159  void *Src, ext::oneapi::experimental::image_mem_handle Dest,
3160  const ext::oneapi::experimental::image_descriptor &DestImgDesc);
3161 
3182  void ext_oneapi_copy(
3183  void *Src, sycl::range<3> SrcOffset, sycl::range<3> SrcExtent,
3184  ext::oneapi::experimental::image_mem_handle Dest,
3185  sycl::range<3> DestOffset,
3186  const ext::oneapi::experimental::image_descriptor &DestImgDesc,
3187  sycl::range<3> CopyExtent);
3188 
3198  void ext_oneapi_copy(
3199  ext::oneapi::experimental::image_mem_handle Src, void *Dest,
3200  const ext::oneapi::experimental::image_descriptor &SrcImgDesc);
3201 
3223  void
3224  ext_oneapi_copy(ext::oneapi::experimental::image_mem_handle Src,
3225  sycl::range<3> SrcOffset,
3226  const ext::oneapi::experimental::image_descriptor &SrcImgDesc,
3227  void *Dest, sycl::range<3> DestOffset,
3228  sycl::range<3> DestExtent, sycl::range<3> CopyExtent);
3229 
3240  void ext_oneapi_copy(
3241  void *Src, void *Dest,
3242  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3243  size_t DeviceRowPitch);
3244 
3267  void ext_oneapi_copy(
3268  void *Src, sycl::range<3> SrcOffset, void *Dest,
3269  sycl::range<3> DestOffset,
3270  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3271  size_t DeviceRowPitch, sycl::range<3> HostExtent,
3272  sycl::range<3> CopyExtent);
3273 
3278  void ext_oneapi_wait_external_semaphore(
3279  ext::oneapi::experimental::interop_semaphore_handle
3280  SemaphoreHandle);
3281 
3287  void ext_oneapi_signal_external_semaphore(
3288  ext::oneapi::experimental::interop_semaphore_handle
3289  SemaphoreHandle);
3290 
3291 private:
3292  std::shared_ptr<detail::handler_impl> MImpl;
3293  std::shared_ptr<detail::queue_impl> MQueue;
3294 
3299  mutable detail::CG::StorageInitHelper CGData;
3300  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
3301  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
3303  std::vector<detail::ArgDesc> MArgs;
3307  std::vector<detail::ArgDesc> MAssociatedAccesors;
3309  detail::NDRDescT MNDRDesc;
3310  std::string MKernelName;
3312  std::shared_ptr<detail::kernel_impl> MKernel;
3318  void *MSrcPtr = nullptr;
3320  void *MDstPtr = nullptr;
3322  size_t MLength = 0;
3324  std::vector<char> MPattern;
3326  std::unique_ptr<detail::HostKernelBase> MHostKernel;
3328  std::unique_ptr<detail::HostTask> MHostTask;
3331  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
3332 
3334  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;
3337  std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
3338  MExecGraph;
3340  std::shared_ptr<ext::oneapi::experimental::detail::node_impl> MSubgraphNode;
3342  std::unique_ptr<detail::CG> MGraphNodeCG;
3343 
3344  bool MIsHost = false;
3345 
3346  detail::code_location MCodeLoc = {};
3347  bool MIsFinalized = false;
3348  event MLastEvent;
3349 
3350  // Make queue_impl class friend to be able to call finalize method.
3351  friend class detail::queue_impl;
3352  // Make accessor class friend to keep the list of associated accessors.
3353  template <typename DataT, int Dims, access::mode AccMode,
3354  access::target AccTarget, access::placeholder isPlaceholder,
3355  typename PropertyListT>
3356  friend class accessor;
3357  friend device detail::getDeviceFromHandler(handler &);
3358 
3359  template <typename DataT, int Dimensions, access::mode AccessMode,
3360  access::target AccessTarget, access::placeholder IsPlaceholder>
3361  friend class detail::image_accessor;
3362  // Make stream class friend to be able to keep the list of associated streams
3363  friend class stream;
3364  friend class detail::stream_impl;
3365  // Make reduction friends to store buffers and arrays created for it
3366  // in handler from reduction methods.
3367  template <typename T, class BinaryOperation, int Dims, size_t Extent,
3368  bool ExplicitIdentity, typename RedOutVar>
3369  friend class reduction_impl_algo;
3370 
3371  friend inline void detail::reduction::finalizeHandler(handler &CGH);
3372  template <class FunctorTy>
3373  friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);
3374 
3375  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3376  typename PropertiesT, typename... RestT>
3377  friend void detail::reduction_parallel_for(handler &CGH, range<Dims> Range,
3378  PropertiesT Properties,
3379  RestT... Rest);
3380 
3381  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3382  typename PropertiesT, typename... RestT>
3383  friend void
3384  detail::reduction_parallel_for(handler &CGH, nd_range<Dims> Range,
3385  PropertiesT Properties, RestT... Rest);
3386 
3387 #ifndef __SYCL_DEVICE_ONLY__
3388  friend void detail::associateWithHandler(handler &,
3389  detail::AccessorBaseHost *,
3390  access::target);
3391  friend void detail::associateWithHandler(
3392  handler &, detail::UnsampledImageAccessorBaseHost *, image_target);
3393  friend void detail::associateWithHandler(
3394  handler &, detail::SampledImageAccessorBaseHost *, image_target);
3395 #endif
3396 
3397  friend class ::MockHandler;
3398  friend class detail::queue_impl;
3399 
3400  // Make pipe class friend to be able to call ext_intel_read/write_host_pipe
3401  // method.
3402  template <class _name, class _dataT, int32_t _min_capacity,
3403  class _propertiesT, class>
3404  friend class ext::intel::experimental::pipe;
3405 
3412  void ext_intel_read_host_pipe(const std::string &Name, void *Ptr, size_t Size,
3413  bool Block = false);
3414 
3421  void ext_intel_write_host_pipe(const std::string &Name, void *Ptr,
3422  size_t Size, bool Block = false);
3424 
3425  bool DisableRangeRounding();
3426 
3427  bool RangeRoundingTrace();
3428 
3429  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
3430  size_t &MinRange);
3431 
3432  template <typename WrapperT, typename TransformedArgType, int Dims,
3433  typename KernelType,
3434  std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
3435  KernelType, TransformedArgType>::value> * = nullptr>
3436  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3437  range<Dims> UserRange) {
3438  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
3439  KernelType>{UserRange, KernelFunc};
3440  }
3441 
3442  template <typename WrapperT, typename TransformedArgType, int Dims,
3443  typename KernelType,
3444  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
3445  KernelType, TransformedArgType>::value> * = nullptr>
3446  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3447  range<Dims> UserRange) {
3448  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
3449  UserRange, KernelFunc};
3450  }
3451 
3452  const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;
3453 
3454  // Checks if 2D memory operations are supported by the underlying platform.
3455  bool supportsUSMMemcpy2D();
3456  bool supportsUSMFill2D();
3457  bool supportsUSMMemset2D();
3458 
3459  // Helper function for getting a loose bound on work-items.
3460  id<2> computeFallbackKernelBounds(size_t Width, size_t Height);
3461 
3462  // Common function for launching a 2D USM memcpy kernel to avoid redefinitions
3463  // of the kernel from copy and memcpy.
3464  template <typename T>
3465  void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
3466  void *Dest, size_t DestPitch, size_t Width,
3467  size_t Height) {
3468  // Otherwise the data is accessible on the device so we do the operation
3469  // there instead.
3470  // Limit number of work items to be resistant to big copies.
3471  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3472  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3473  parallel_for<__usmmemcpy2d<T>>(
3474  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3475  T *CastedDest = static_cast<T *>(Dest);
3476  const T *CastedSrc = static_cast<const T *>(Src);
3477  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3478  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3479  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3480  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3481  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3482  CastedSrc[adjustedIndex[0] * SrcPitch + adjustedIndex[1]];
3483  }
3484  }
3485  }
3486  });
3487  }
3488 
3489  // Common function for launching a 2D USM memcpy host-task to avoid
3490  // redefinitions of the kernel from copy and memcpy.
3491  template <typename T>
3492  void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
3493  void *Dest, size_t DestPitch,
3494  size_t Width, size_t Height) {
3495  // If both pointers are host USM or unknown (assumed non-USM) we use a
3496  // host-task to satisfy dependencies.
3497  host_task([=] {
3498  const T *CastedSrc = static_cast<const T *>(Src);
3499  T *CastedDest = static_cast<T *>(Dest);
3500  for (size_t I = 0; I < Height; ++I) {
3501  const T *SrcItBegin = CastedSrc + SrcPitch * I;
3502  T *DestItBegin = CastedDest + DestPitch * I;
3503  std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
3504  }
3505  });
3506  }
3507 
3508  // StageFillCG(): supporting function for fill().
3509  template <typename T, int Dims, access::mode AccessMode,
3510  access::target AccessTarget,
3512  typename PropertyListT = property_list>
3513  void StageFillCG(
3514  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3515  Dst,
3516  const T &Pattern) {
3517  setType(detail::CG::Fill);
3518  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
3519  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3520 
3521  MDstPtr = static_cast<void *>(AccImpl.get());
3522  CGData.MRequirements.push_back(AccImpl.get());
3523  CGData.MAccStorage.push_back(std::move(AccImpl));
3524 
3525  MPattern.resize(sizeof(T));
3526  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
3527  *PatternPtr = Pattern;
3528  }
3529 
3530  // Common function for launching a 2D USM fill kernel to avoid redefinitions
3531  // of the kernel from memset and fill.
3532  template <typename T>
3533  void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
3534  const T &Pattern, size_t Width,
3535  size_t Height) {
3536  // Otherwise the data is accessible on the device so we do the operation
3537  // there instead.
3538  // Limit number of work items to be resistant to big fill operations.
3539  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3540  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3541  parallel_for<__usmfill2d<T>>(
3542  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3543  T *CastedDest = static_cast<T *>(Dest);
3544  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3545  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3546  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3547  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3548  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3549  Pattern;
3550  }
3551  }
3552  }
3553  });
3554  }
3555 
3556  // Common function for launching a 2D USM fill kernel or host_task to avoid
3557  // redefinitions of the kernel from memset and fill.
3558  template <typename T>
3559  void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
3560  const T &Pattern, size_t Width,
3561  size_t Height) {
3562  // If the pointer is host USM or unknown (assumed non-USM) we use a
3563  // host-task to satisfy dependencies.
3564  host_task([=] {
3565  T *CastedDest = static_cast<T *>(Dest);
3566  for (size_t I = 0; I < Height; ++I) {
3567  T *ItBegin = CastedDest + DestPitch * I;
3568  std::fill(ItBegin, ItBegin + Width, Pattern);
3569  }
3570  });
3571  }
3572 
3573  // Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
3574  void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
3575  size_t SrcPitch, size_t Width, size_t Height);
3576 
3577  // Untemplated version of ext_oneapi_fill2d using command for native 2D fill.
3578  void ext_oneapi_fill2d_impl(void *Dest, size_t DestPitch, const void *Value,
3579  size_t ValueSize, size_t Width, size_t Height);
3580 
3581  // Implementation of ext_oneapi_memset2d using command for native 2D memset.
3582  void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value,
3583  size_t Width, size_t Height);
3584 
3585  // Implementation of memcpy to device_global.
3586  void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src,
3587  bool IsDeviceImageScoped, size_t NumBytes,
3588  size_t Offset);
3589 
3590  // Implementation of memcpy from device_global.
3591  void memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3592  bool IsDeviceImageScoped, size_t NumBytes,
3593  size_t Offset);
3594 
3595  // Implementation of memcpy to an unregistered device_global.
3596  void memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
3597  const void *Src, size_t DeviceGlobalTSize,
3598  bool IsDeviceImageScoped, size_t NumBytes,
3599  size_t Offset);
3600 
3601  // Implementation of memcpy from an unregistered device_global.
3602  void memcpyFromHostOnlyDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3603  bool IsDeviceImageScoped, size_t NumBytes,
3604  size_t Offset);
3605 
3606  template <typename T, int Dims, access::mode AccessMode,
3607  access::target AccessTarget,
3609  typename PropertyListT = property_list>
3610  void checkIfPlaceholderIsBoundToHandler(
3611  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3612  Acc) {
3613  auto *AccBase = reinterpret_cast<detail::AccessorBaseHost *>(&Acc);
3614  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3615  detail::AccessorImplHost *Req = AccImpl.get();
3616  if (std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
3617  [&](const detail::ArgDesc &AD) {
3618  return AD.MType ==
3619  detail::kernel_param_kind_t::kind_accessor &&
3620  AD.MPtr == Req &&
3621  AD.MSize == static_cast<int>(AccessTarget);
3622  }) == MAssociatedAccesors.end())
3623  throw sycl::exception(make_error_code(errc::kernel_argument),
3624  "placeholder accessor must be bound by calling "
3625  "handler::require() before it can be used.");
3626  }
3627 
3628  // Set the value of the GPU cache configuration for the kernel.
3629  void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig);
3630  // Set the flag indicating whether the kernel is cooperative.
3631  void setKernelIsCooperative(bool);
3632 
3633  template <
3634  ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
3635  void throwIfGraphAssociated() const {
3636 
3637  if (getCommandGraph()) {
3638  std::string FeatureString =
3639  ext::oneapi::experimental::detail::UnsupportedFeatureToString(
3640  FeatureT);
3641  throw sycl::exception(sycl::make_error_code(errc::invalid),
3642  "The " + FeatureString +
3643  " feature is not yet available "
3644  "for use with the SYCL Graph extension.");
3645  }
3646  }
3647 };
3648 } // namespace _V1
3649 } // namespace sycl
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:170
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:51
CGTYPE
Type of the command group.
Definition: cg.hpp:56
RoundedRangeIDGenerator(const id< Dims > &Id, const range< Dims > &UserRange, const range< Dims > &RoundedRange)
Definition: handler.hpp:325
void operator()(item< Dims > It, kernel_handler KH) const
Definition: handler.hpp:387
void operator()(item< Dims > It) const
Definition: handler.hpp:372
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:42
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:59
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
Implementation details of command_graph<modifiable>.
Definition: graph_impl.hpp:565
Command group handler class.
Definition: handler.hpp:454
void fill(void *Ptr, const T &Pattern, size_t Count)
Fills the specified memory with the specified pattern.
Definition: handler.hpp:2773
void parallel_for(range< 2 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2056
void parallel_for(kernel Kernel, range< Dims > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range.
Definition: handler.hpp:2144
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:1918
void copy(const std::remove_all_extents_t< T > *Src, ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a USM memory region to a device_global.
Definition: handler.hpp:3116
void parallel_for(nd_range< Dims > NDRange, kernel Kernel)
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:2093
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1924
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2475
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2396
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2498
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2446
void copy(const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, std::remove_all_extents_t< T > *Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a device_global to a USM memory region.
Definition: handler.hpp:3136
void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height)
Fills the memory pointed to by a USM pointer with the specified value.
Definition: handler.hpp:2977
void parallel_for(range< 3 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1938
handler & operator=(handler &&)=delete
std::enable_if_t<(sizeof...(RestT) > 1) && detail::AreAllButLastReductions< RestT... >::value && ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2409
handler(handler &&)=delete
void copy(accessor< T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src, IsPlaceholder_Src > Src, accessor< T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst, IsPlaceholder_Dst > Dst)
Copies the content of the memory object accessed by Src to the memory object accessed by Dst.
Definition: handler.hpp:2643
void copy(const T *Src, T *Dest, size_t Count)
Copies data from one memory region to another, each of which is either a host pointer or a pointer within a USM allocation accessible on the device associated with this handler.
Definition: handler.hpp:2822
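A short sketch; the arrays are illustrative, and either argument could equally be a USM pointer:
  float src[64] = {}, dst[64];
  q.submit([&](sycl::handler &cgh) {
    cgh.copy(src, dst, 64); // copy 64 elements from src to dst
  }).wait();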
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch.
Definition: handler.hpp:2025
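A sketch of the hierarchical form: the outer lambda runs once per work-group, and parallel_for_work_item distributes the inner body over that group's work-items (q is assumed as above):
  q.submit([&](sycl::handler &cgh) {
    cgh.parallel_for_work_group(
        sycl::range<1>{8},  // number of work-groups
        sycl::range<1>{64}, // work-items per group
        [=](sycl::group<1> g) {
          g.parallel_for_work_item([&](sycl::h_item<1> it) {
            // per-work-item body
          });
        });
  });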
void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern, size_t Width, size_t Height)
Fills the memory pointed to by a USM pointer with the specified pattern.
Definition: handler.hpp:3013
void parallel_for(range< 3 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2060
void parallel_for(range< 1 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2052
void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function.
Definition: handler.hpp:2110
handler & operator=(const handler &)=delete
void parallel_for(kernel Kernel, nd_range< Dims > NDRange, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:2220
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2301
void require(accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder > Acc)
Requires access to the memory object associated with the placeholder accessor.
Definition: handler.hpp:1843
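A sketch of binding a placeholder accessor (names are illustrative): an accessor constructed without a handler is a placeholder until require attaches it to a command group:
  sycl::buffer<int, 1> buf{sycl::range<1>{16}};
  sycl::accessor acc{buf}; // no handler: a placeholder accessor
  q.submit([&](sycl::handler &cgh) {
    cgh.require(acc); // register the requirement with this command group
    cgh.parallel_for(sycl::range<1>{16},
                     [=](sycl::id<1> i) { acc[i] = 0; });
  });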
void fill(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT > Dst, const T &Pattern)
Fills memory pointed by accessor with the pattern given.
Definition: handler.hpp:2727
void update_host(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder > Acc)
Provides guarantees that the memory object accessed via Acc is updated on the host after command group object execution is complete.
Definition: handler.hpp:2696
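A sketch with an illustrative host-backed buffer:
  int data[16] = {};
  sycl::buffer<int, 1> buf{data, sycl::range<1>{16}};
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc{buf, cgh};
    cgh.update_host(acc); // data is current on the host once this CG completes
  });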
void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest, size_t DestPitch, size_t Width, size_t Height)
Copies data from one 2D memory region to another, both pointed to by USM pointers.
Definition: handler.hpp:2923
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2350
std::is_same< remove_cv_ref_t< U >, remove_cv_ref_t< T > > is_same_type
Definition: handler.hpp:1862
std::enable_if_t< ShouldEnableSetArg< T >::value, void > set_arg(int ArgIndex, T &&Arg)
Sets argument for OpenCL interoperability kernels.
Definition: handler.hpp:1884
void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, size_t Width, size_t Height)
Copies data from one 2D memory region to another, both pointed to by USM pointers.
Definition: handler.hpp:2869
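A sketch of the 2D form; dst, src, the pitches, and the extents are illustrative, with pitches and width in bytes and height in rows:
  q.submit([&](sycl::handler &cgh) {
    cgh.ext_oneapi_memcpy2d(dst, dst_pitch, src, src_pitch,
                            width_bytes, height_rows);
  });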
void parallel_for_work_group(range< Dims > NumWorkGroups, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2466
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2331
void single_task(kernel Kernel)
Invokes a SYCL kernel.
Definition: handler.hpp:2039
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(nd_range< Dims > Range, RestT &&...Rest)
Definition: handler.hpp:2456
void copy(const T_Src *Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of the memory pointed to by Src into the memory object accessed by Dst.
Definition: handler.hpp:2597
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2360
void copy(std::shared_ptr< T_Src > Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of the memory pointed to by Src into the memory object accessed by Dst.
Definition: handler.hpp:2526
handler(const handler &)=delete
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, T_Dst *Dst)
Copies the content of the memory object accessed by Src into the memory pointed to by Dst.
Definition: handler.hpp:2555
typename std::remove_cv_t< std::remove_reference_t< T > > remove_cv_ref_t
Definition: handler.hpp:1859
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2370
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 1 > Range, RestT &&...Rest)
Definition: handler.hpp:2418
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 2 > Range, RestT &&...Rest)
Definition: handler.hpp:2426
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 3 > Range, RestT &&...Rest)
Definition: handler.hpp:2434
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2262
void ext_oneapi_barrier()
Prevents any commands submitted afterward to this queue from executing until all commands previously submitted to this queue have entered the complete state.
Definition: handler.hpp:2787
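For example, to make later submissions to q wait for everything submitted so far (a sketch of this oneAPI extension):
  q.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); });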
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2340
void memcpy(void *Dest, const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, size_t NumBytes=sizeof(T), size_t SrcOffset=0)
Copies data from a device_global to USM memory.
Definition: handler.hpp:3081
void parallel_for_work_group(range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch.
Definition: handler.hpp:2004
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value || detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:1950
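A minimal sketch; the callable runs on the host once the command group's dependencies are satisfied, and may alternatively take an interop_handle parameter:
  q.submit([&](sycl::handler &cgh) {
    cgh.host_task([=]() {
      // arbitrary host code scheduled by the SYCL runtime
    });
  });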
void set_specialization_constant(typename std::remove_reference_t< decltype(SpecName)>::value_type Value)
Definition: handler.hpp:1800
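A sketch of setting and reading a specialization constant; spec_id is an illustrative name and must be declared at namespace scope:
  constexpr sycl::specialization_id<int> spec_id{42}; // namespace scope

  q.submit([&](sycl::handler &cgh) {
    cgh.set_specialization_constant<spec_id>(7);
    cgh.single_task([=](sycl::kernel_handler kh) {
      int v = kh.get_specialization_constant<spec_id>(); // v == 7
      (void)v;
    });
  });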
void set_args(Ts &&...Args)
Sets arguments for OpenCL interoperability kernels.
Definition: handler.hpp:1906
void memcpy(ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, const void *Src, size_t NumBytes=sizeof(T), size_t DestOffset=0)
Copies data from a USM memory region to a device_global.
Definition: handler.hpp:3047
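A hedged sketch; dg is an illustrative device_global that must live at namespace scope:
  sycl::ext::oneapi::experimental::device_global<int> dg; // namespace scope

  int host_val = 42;
  q.submit([&](sycl::handler &cgh) {
    cgh.memcpy(dg, &host_val, sizeof(int)); // host -> device_global
  }).wait();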
void parallel_for(range< 2 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1931
void set_arg(int ArgIndex, accessor< DataT, Dims, AccessMode, AccessTarget, IsPlaceholder > Arg)
Definition: handler.hpp:1891
std::enable_if_t<(sizeof...(RestT) > 1) && detail::AreAllButLastReductions< RestT... >::value && ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > Range, PropertiesT Properties, RestT &&...Rest)
Reductions.
Definition: handler.hpp:2383
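A sketch of the reduction family (here without a property list); sum is an illustrative shared-USM result slot:
  int *sum = sycl::malloc_shared<int>(1, q);
  *sum = 0;
  q.submit([&](sycl::handler &cgh) {
    cgh.parallel_for(sycl::range<1>{1024},
                     sycl::reduction(sum, sycl::plus<int>()),
                     [=](sycl::id<1> i, auto &acc) { acc += 1; });
  }).wait(); // *sum == 1024 afterwards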
std::remove_reference_t< decltype(SpecName)>::value_type get_specialization_constant() const
Definition: handler.hpp:1815
void set_arg(int ArgIndex, local_accessor< DataT, Dims > Arg)
Definition: handler.hpp:1897
A unique identifier of an item in an index space.
Definition: id.hpp:36
Identifies an instance of the function object executing at each point in a range.
Definition: item.hpp:37
id< Dimensions > get_id() const
Definition: item.hpp:55
range< Dimensions > get_range() const
Definition: item.hpp:69
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:74
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:544
id< Dimensions > get_global_id() const
Definition: nd_item.hpp:550
id< Dimensions > get_offset() const
Definition: nd_item.hpp:615
range< Dimensions > get_global_range() const
Definition: nd_item.hpp:602
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
Objects of the property_list class are containers for the SYCL properties.
Defines the iteration domain of either a single work-group in a parallel dispatch, or the overall dimensions of the dispatch.
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
#define _KERNELFUNCPARAMTYPE
Definition: handler.hpp:77
#define __SYCL_KERNEL_ATTR__
Definition: handler.hpp:1485
#define _KERNELFUNCPARAM(a)
Definition: handler.hpp:81
__SYCL_EXTERN_STREAM_ATTRS ostream cout
Linked to standard output.
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1192
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1191
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:16
size_t getLinearIndex(const T< Dims > &Index, const U< Dims > &Range)
Definition: common.hpp:367
decltype(member_ptr_helper(&F::operator())) argument_helper(int)
Definition: handler.hpp:195
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:323
static Arg member_ptr_helper(RetType(Func::*)(Arg) const)
bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr)
Definition: handler.cpp:37
static std::enable_if_t< std::is_unsigned_v< T >, bool > multiply_with_overflow_check(T &dst, T x, T y)
Definition: handler.hpp:404
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::enable_if_t< std::is_same_v< T, range< Dims > > || std::is_same_v< T, id< Dims > > > checkValueRange(const T &V)
Definition: handler.hpp:268
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType, ArgType >::value > runKernelWithArg(KernelType KernelName, ArgType Arg)
Definition: cg_types.hpp:210
void markBufferAsInternal(const std::shared_ptr< buffer_impl > &BufImpl)
Definition: helpers.cpp:69
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType >::value > runKernelWithoutArg(KernelType KernelName)
Definition: cg_types.hpp:196
std::shared_ptr< LocalAccessorImplHost > LocalAccessorImplPtr
Definition: accessor.hpp:601
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
bool range_size_fits_in_size_t(const range< Dims > &r)
Definition: handler.hpp:409
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2737
decltype(argument_helper< F, SuggestedArgType >(0)) lambda_arg_type
Definition: handler.hpp:201
std::shared_ptr< AccessorImplHost > AccessorImplPtr
Definition: accessor.hpp:532
const char * UnsupportedFeatureToString(UnsupportedGraphFeatures Feature)
Definition: graph.hpp:48
typename merged_properties< LHSPropertiesT, RHSPropertiesT >::type merged_properties_t
Definition: properties.hpp:225
properties< std::tuple< PropertyValueTs... > > properties_t
Definition: properties.hpp:212
@ executable
In executable state, the graph is ready to execute.
static constexpr bool has_property()
static constexpr auto get_property()
decltype(properties{}) empty_properties_t
Definition: properties.hpp:190
image_target
Definition: access.hpp:74
Dimensions (template parameter of the local_accessor special class)
Definition: accessor.hpp:3233
signed char alias marked __SYCL2020_DEPRECATED
Definition: aliases.hpp:94
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
Definition: usm_impl.cpp:574
ext::intel::pipe< name, dataT, min_capacity > pipe
Definition: pipes.hpp:18
IsPlaceholder (template parameter of the local_accessor special class)
Definition: accessor.hpp:3234
AccessMode (template parameter of the local_accessor special class)
Definition: accessor.hpp:3233
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:94
std::conditional_t< std::is_same_v< ElementType, half >, sycl::detail::half_impl::BIsRepresentationT, ElementType > element_type
Definition: multi_ptr.hpp:752
void prefetch(size_t NumElements) const
Definition: multi_ptr.hpp:1076
Definition: access.hpp:18
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Set pattern to the first count elements of type T starting from dev_ptr.
Definition: memory.hpp:172
_pi_kernel_cache_config
Definition: pi.h:781
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
Definition: pi.h:787
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
Definition: pi.h:785
C++ wrapper of extern "C" PI interfaces.
Predicate returning true if all template type parameters except the last one are reductions.
Definition: reduction.hpp:77
std::vector< detail::AccessorImplPtr > MAccStorage
Storage for accessors.
Definition: cg.hpp:101
std::vector< AccessorImplHost * > MRequirements
List of requirements that specify which memory is needed for the command group to be executed.
Definition: cg.hpp:107
static constexpr const char * getName()
Definition: kernel_desc.hpp:84
A struct to describe the properties of an image.
is_device_copyable is a user specializable class template to indicate that a type T is device copyable.
Definition: types.hpp:2778