DPC++ Runtime
Runtime libraries for oneAPI DPC++
handler.hpp
//==-------- handler.hpp --- SYCL command group handler --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/access/access.hpp>
#include <sycl/accessor.hpp>
#include <sycl/context.hpp>
#include <sycl/detail/cg.hpp>
#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/common.hpp>
#include <sycl/detail/export.hpp>
#include <sycl/detail/pi.h>
#include <sycl/detail/pi.hpp>
#include <sycl/detail/string.hpp>
#include <sycl/device.hpp>
#include <sycl/event.hpp>
#include <sycl/exception.hpp>
#include <sycl/group.hpp>
#include <sycl/id.hpp>
#include <sycl/item.hpp>
#include <sycl/kernel.hpp>
#include <sycl/kernel_bundle.hpp>
#include <sycl/kernel_handler.hpp>
#include <sycl/nd_item.hpp>
#include <sycl/nd_range.hpp>
#include <sycl/property_list.hpp>
#include <sycl/range.hpp>
#include <sycl/sampler.hpp>

#include <assert.h>
#include <functional>
#include <memory>
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

// TODO: refactor this header
// 41(!!!) includes of SYCL headers + 10 includes of standard headers.
// 3300+ lines of code

// SYCL_LANGUAGE_VERSION is a 4-digit year followed by a 2-digit revision.
#if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
#define __SYCL_NONCONST_FUNCTOR__
#endif

// Replace _KERNELFUNCPARAM(KernelFunc) with 'KernelType KernelFunc'
// or 'const KernelType &KernelFunc'.
#ifdef __SYCL_NONCONST_FUNCTOR__
#define _KERNELFUNCPARAMTYPE KernelType
#else
#define _KERNELFUNCPARAMTYPE const KernelType &
#endif
#define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a
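
// For illustration only: a sketch of how a declaration using the macro
// expands under each configuration (the KernelType/KernelFunc names below are
// the conventional ones used throughout this header):
//
//   template <typename KernelType>
//   void f(_KERNELFUNCPARAM(KernelFunc));
//   // __SYCL_NONCONST_FUNCTOR__ defined: void f(KernelType KernelFunc);
//   // otherwise (SYCL 2020 and later):  void f(const KernelType &KernelFunc);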

#if defined(__SYCL_UNNAMED_LAMBDA__)
// We can't use nested types (e.g. struct S defined inside main() routine) to
// name kernels. At the same time, we have to provide a unique kernel name for
// sycl::fill and the only thing we can use to introduce that uniqueness (in
// general) is the template parameter T, which might be exactly that nested
// type. That means we cannot support sycl::fill(void *, T&, size_t) for such
// types in general. However, we can do better than that when unnamed lambdas
// are enabled, so do it here! See also https://github.com/intel/llvm/issues/469.
template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
          sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
using __fill = sycl::detail::auto_name;
template <typename T> using __usmfill = sycl::detail::auto_name;
template <typename T> using __usmfill2d = sycl::detail::auto_name;
template <typename T> using __usmmemcpy2d = sycl::detail::auto_name;

template <typename T_Src, typename T_Dst, int Dims,
          sycl::access::mode AccessMode, sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
using __copyAcc2Ptr = sycl::detail::auto_name;

template <typename T_Src, typename T_Dst, int Dims,
          sycl::access::mode AccessMode, sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
using __copyPtr2Acc = sycl::detail::auto_name;

template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
          sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
          sycl::access::mode AccessMode_Dst,
          sycl::access::target AccessTarget_Dst,
          sycl::access::placeholder IsPlaceholder_Src,
          sycl::access::placeholder IsPlaceholder_Dst>
using __copyAcc2Acc = sycl::detail::auto_name;
#else
// Limited fallback path for when unnamed lambdas aren't available. Cannot
// handle nested types.
template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
          sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
class __fill;
template <typename T> class __usmfill;
template <typename T> class __usmfill2d;
template <typename T> class __usmmemcpy2d;

template <typename T_Src, typename T_Dst, int Dims,
          sycl::access::mode AccessMode, sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
class __copyAcc2Ptr;

template <typename T_Src, typename T_Dst, int Dims,
          sycl::access::mode AccessMode, sycl::access::target AccessTarget,
          sycl::access::placeholder IsPlaceholder>
class __copyPtr2Acc;

template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
          sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
          sycl::access::mode AccessMode_Dst,
          sycl::access::target AccessTarget_Dst,
          sycl::access::placeholder IsPlaceholder_Src,
          sycl::access::placeholder IsPlaceholder_Dst>
class __copyAcc2Acc;
#endif
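
// A minimal user-side sketch of the limitation described above; `Local` is a
// hypothetical nested type, and the fill only has a nameable kernel when
// unnamed lambdas are enabled:
//
//   #include <sycl/sycl.hpp>
//   int main() {
//     sycl::queue q;
//     struct Local { int x; };  // nested type: unusable as a kernel name
//     Local Pattern{42};
//     Local *Ptr = sycl::malloc_shared<Local>(1, q);
//     q.fill(Ptr, Pattern, 1);  // kernel name must be derived from T = Local
//     q.wait();
//     sycl::free(Ptr, q);
//   }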

// For unit testing purposes
class MockHandler;

namespace sycl {
inline namespace _V1 {

// Forward declarations

class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;

namespace ext::intel::experimental {
template <class _name, class _dataT, int32_t _min_capacity, class _propertiesT,
          class>
class pipe;
}

namespace ext::oneapi::experimental {
struct image_descriptor;
} // namespace ext::oneapi::experimental

namespace ext::oneapi::experimental::detail {
class graph_impl;
} // namespace ext::oneapi::experimental::detail
namespace detail {

class handler_impl;
class kernel_impl;
class queue_impl;
class stream_impl;
template <typename DataT, int Dimensions, access::mode AccessMode,
          access::target AccessTarget, access::placeholder IsPlaceholder>
class image_accessor;
class HandlerAccess;
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg) const);

// Non-const version of the above template to match functors whose 'operator()'
// is declared without the 'const' qualifier.
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg));

// Version with two arguments to handle the case when a kernel_handler is
// passed to a lambda.
template <typename RetType, typename Func, typename Arg1, typename Arg2>
static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const);

// Non-const version of the above template to match functors whose 'operator()'
// is declared without the 'const' qualifier.
template <typename RetType, typename Func, typename Arg1, typename Arg2>
static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2));

template <typename F, typename SuggestedArgType>
decltype(member_ptr_helper(&F::operator())) argument_helper(int);

template <typename F, typename SuggestedArgType>
SuggestedArgType argument_helper(...);

template <typename F, typename SuggestedArgType>
using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
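
// A compile-time sketch of what lambda_arg_type deduces. For an explicitly
// typed lambda it recovers the parameter type via member_ptr_helper; for a
// generic lambda, taking &F::operator() fails to resolve and the fallback
// overload returns the suggested type:
//
//   auto Typed = [](sycl::item<1>) {};
//   static_assert(std::is_same_v<
//       lambda_arg_type<decltype(Typed), sycl::item<1>>, sycl::item<1>>);
//
//   auto Generic = [](auto) {};  // deduction falls back to SuggestedArgType
//   static_assert(std::is_same_v<
//       lambda_arg_type<decltype(Generic), sycl::item<1>>, sycl::item<1>>);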

// Used when a parallel_for range is rounded up.
template <typename Name> class __pf_kernel_wrapper;

template <typename Type> struct get_kernel_wrapper_name_t {
  using name = __pf_kernel_wrapper<Type>;
};

__SYCL_EXPORT device getDeviceFromHandler(handler &);

// Checks if a device_global has any registered kernel usage.
__SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr);

// Extracts a pointer to the value inside a dynamic parameter.
__SYCL_EXPORT void *getValueFromDynamicParameter(
    ext::oneapi::experimental::detail::dynamic_parameter_base
        &DynamicParamBase);

#if __SYCL_ID_QUERIES_FIT_IN_INT__
template <typename T> struct NotIntMsg;

template <int Dims> struct NotIntMsg<range<Dims>> {
  constexpr static const char *Msg =
      "Provided range is out of integer limits. Pass "
      "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
};

template <int Dims> struct NotIntMsg<id<Dims>> {
  constexpr static const char *Msg =
      "Provided offset is out of integer limits. Pass "
      "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
};
#endif

// Helper for merging properties with ones defined in an optional kernel
// functor getter.
template <typename KernelType, typename PropertiesT, typename Cond = void>
struct GetMergedKernelProperties {
  using type = PropertiesT;
};
template <typename KernelType, typename PropertiesT>
struct GetMergedKernelProperties<
    KernelType, PropertiesT,
    std::enable_if_t<ext::oneapi::experimental::detail::
                         HasKernelPropertiesGetMethod<KernelType>::value>> {
  using get_method_properties =
      typename ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
          KernelType>::properties_t;
  static_assert(
      ext::oneapi::experimental::is_property_list<get_method_properties>::value,
      "get(sycl::ext::oneapi::experimental::properties_tag) member in kernel "
      "functor class must return a valid property list.");
  using type = ext::oneapi::experimental::detail::merged_properties_t<
      PropertiesT, get_method_properties>;
};

#if __SYCL_ID_QUERIES_FIT_IN_INT__
template <typename T, typename ValT>
typename std::enable_if_t<std::is_same<ValT, size_t>::value ||
                          std::is_same<ValT, unsigned long long>::value>
checkValueRangeImpl(ValT V) {
  static constexpr size_t Limit =
      static_cast<size_t>((std::numeric_limits<int>::max)());
  if (V > Limit)
    throw sycl::exception(make_error_code(errc::nd_range), NotIntMsg<T>::Msg);
}
#endif

template <int Dims, typename T>
typename std::enable_if_t<std::is_same_v<T, range<Dims>> ||
                          std::is_same_v<T, id<Dims>>>
checkValueRange(const T &V) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  for (size_t Dim = 0; Dim < Dims; ++Dim)
    checkValueRangeImpl<T>(V[Dim]);

  {
    unsigned long long Product = 1;
    for (size_t Dim = 0; Dim < Dims; ++Dim) {
      Product *= V[Dim];
      // Check the value now to prevent the product from overflowing at the
      // end.
      checkValueRangeImpl<T>(Product);
    }
  }
#else
  (void)V;
#endif
}

template <int Dims>
void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(R);
  checkValueRange<Dims>(O);

  for (size_t Dim = 0; Dim < Dims; ++Dim) {
    unsigned long long Sum = R[Dim] + O[Dim];

    checkValueRangeImpl<range<Dims>>(Sum);
  }
#else
  (void)R;
  (void)O;
#endif
}

template <int Dims, typename T>
typename std::enable_if_t<std::is_same_v<T, nd_range<Dims>>>
checkValueRange(const T &V) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(V.get_global_range());
  checkValueRange<Dims>(V.get_local_range());
  checkValueRange<Dims>(V.get_offset());

  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
#else
  (void)V;
#endif
}
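
// For instance, with __SYCL_ID_QUERIES_FIT_IN_INT__ enabled, the following
// submission throws errc::nd_range even though each dimension fits in an int,
// because the product 65536 * 32768 = 2^31 exceeds INT_MAX (sketch, assuming
// a queue `q`):
//
//   q.parallel_for(sycl::range<2>{1u << 16, 1u << 15},
//                  [=](sycl::item<2>) {});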

template <int Dims> class RoundedRangeIDGenerator {
  id<Dims> Id;
  id<Dims> InitId;
  range<Dims> UserRange;
  range<Dims> RoundedRange;
  bool Done = false;

public:
  RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
                          const range<Dims> &RoundedRange)
      : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
    for (int i = 0; i < Dims; ++i)
      if (Id[i] >= UserRange[i])
        Done = true;
  }

  explicit operator bool() { return !Done; }

  void updateId() {
    for (int i = 0; i < Dims; ++i) {
      Id[i] += RoundedRange[i];
      if (Id[i] < UserRange[i])
        return;
      Id[i] = InitId[i];
    }
    Done = true;
  }

  id<Dims> getId() { return Id; }

  template <typename KernelType> auto getItem() {
    if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
                  std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
      return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
    else {
      static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
                        std::is_invocable_v<KernelType, item<Dims, false> &,
                                            kernel_handler>,
                    "Kernel must be invocable with an item!");
      return detail::Builder::createItem<Dims, false>(UserRange, getId());
    }
  }
};
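
// The generator visits every id of the user range exactly once by striding
// each dimension with the rounded launch range. A host-side sketch of the
// same traversal for one work item, with hypothetical values:
//
//   #include <iostream>
//   int main() {
//     size_t Start = 3;    // this work item's initial id
//     size_t User = 10;    // user-requested range
//     size_t Rounded = 4;  // rounded launch range
//     for (size_t i = Start; i < User; i += Rounded)
//       std::cout << i << ' ';  // prints: 3 7
//   }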

// TODO: The wrappers can be optimized further so that the body
// essentially looks like this:
//   for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
//     for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
//       for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
//         KernelFunc({x, y, z});
template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernel {
public:
  range<Dims> UserRange;
  KernelType KernelFunc;
  void operator()(item<Dims> It) const {
    auto RoundedRange = It.get_range();
    for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
         Gen.updateId()) {
      auto item = Gen.template getItem<KernelType>();
      KernelFunc(item);
    }
  }
};

template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernelWithKH {
public:
  range<Dims> UserRange;
  KernelType KernelFunc;
  void operator()(item<Dims> It, kernel_handler KH) const {
    auto RoundedRange = It.get_range();
    for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
         Gen.updateId()) {
      auto item = Gen.template getItem<KernelType>();
      KernelFunc(item, KH);
    }
  }
};

using std::enable_if_t;
using sycl::detail::queue_impl;

// Returns true if x*y will overflow in T; otherwise, returns false. In either
// case dst receives the (possibly wrapped) product x*y.
template <typename T>
static std::enable_if_t<std::is_unsigned_v<T>, bool>
multiply_with_overflow_check(T &dst, T x, T y) {
  dst = x * y;
  return (y != 0) && (x > (std::numeric_limits<T>::max)() / y);
}

template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
  size_t acc = 1;
  for (int i = 0; i < Dims; ++i) {
    bool did_overflow = multiply_with_overflow_check(acc, acc, r[i]);
    if (did_overflow)
      return false;
  }
  return true;
}
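
// Usage sketch for the two overflow helpers above (64-bit size_t assumed):
//
//   size_t Prod = 0;
//   bool Overflowed = multiply_with_overflow_check<size_t>(
//       Prod, size_t{1} << 40, size_t{1} << 40);  // true: 2^80 > 2^64 - 1
//   bool Fits = range_size_fits_in_size_t(
//       sycl::range<3>{1ULL << 30, 1ULL << 30, 1ULL << 30});  // false: 2^90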
} // namespace detail

class __SYCL_EXPORT handler {
private:
  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);

  handler(std::shared_ptr<detail::queue_impl> Queue,
          std::shared_ptr<detail::queue_impl> PrimaryQueue,
          std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);

  handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);

  template <typename T, typename F = typename std::remove_const_t<
                            typename std::remove_reference_t<T>>>
  F *storePlainArg(T &&Arg) {
    CGData.MArgsStorage.emplace_back(sizeof(T));
    auto Storage = reinterpret_cast<F *>(CGData.MArgsStorage.back().data());
    *Storage = Arg;
    return Storage;
  }

  void setType(detail::CG::CGTYPE Type) { MCGType = Type; }

  detail::CG::CGTYPE getType() { return MCGType; }

  void throwIfActionIsCreated() {
    if (detail::CG::None != getType())
      throw sycl::exception(make_error_code(errc::runtime),
                            "Attempt to set multiple actions for the "
                            "command group. Command group must consist of "
                            "a single kernel or explicit memory operation.");
  }

  constexpr static int AccessTargetMask = 0x7ff;

  template <typename KernelName, typename KernelType>
  void throwOnLocalAccessorMisuse() const {
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    using KI = sycl::detail::KernelInfo<NameT>;

    auto *KernelArgs = &KI::getParamDesc(0);

    for (unsigned I = 0; I < KI::getNumParams(); ++I) {
      const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
      const access::target AccTarget =
          static_cast<access::target>(KernelArgs[I].info & AccessTargetMask);
      if ((Kind == detail::kernel_param_kind_t::kind_accessor) &&
          (AccTarget == target::local))
        throw sycl::exception(
            make_error_code(errc::kernel_argument),
            "A local accessor must not be used in a SYCL kernel function "
            "that is invoked via single_task or via the simple form of "
            "parallel_for that takes a range parameter.");
    }
  }

  void
  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                               const detail::kernel_param_desc_t *KernelArgs,
                               bool IsESIMD);

  void extractArgsAndReqs();

  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                  const int Size, const size_t Index, size_t &IndexShift,
                  bool IsKernelCreatedFromSource, bool IsESIMD);

  detail::string getKernelName();

  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
    // TODO It is unclear whether the names of a kernel and a lambda/functor
    // must match when parallel_for is given both a sycl::kernel and a
    // lambda/functor. For now, if they are equal we extract the arguments for
    // the kernel from the lambda/functor; otherwise set_arg(s) must be used
    // to resolve the order and values of the kernel's arguments.
    assert(MKernel && "MKernel is not initialized");
    const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
    detail::string KernelName = getKernelName();
    return KernelName == LambdaName;
  }

  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }

  event finalize();

  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
    MStreamStorage.push_back(Stream);
  }

  void addReduction(const std::shared_ptr<const void> &ReduObj);

  template <typename T, int Dimensions, typename AllocatorT, typename Enable>
  void
  addReduction(const std::shared_ptr<buffer<T, Dimensions, AllocatorT, Enable>>
                   &ReduBuf) {
    detail::markBufferAsInternal(getSyclObjImpl(*ReduBuf));
    addReduction(std::shared_ptr<const void>(ReduBuf));
  }

  ~handler() = default;

  // TODO: Private and unused. Remove when an ABI break is allowed.
  bool is_host() { return MIsHost; }

#ifdef __SYCL_DEVICE_ONLY__
  // In device compilation the accessor isn't inherited from the host base
  // classes, so it can't be detected through them. Since we don't expect this
  // to ever be called during device execution, just use a blind void *.
  void associateWithHandler(void *AccBase, access::target AccTarget);
  void associateWithHandler(void *AccBase, image_target AccTarget);
#else
  void associateWithHandlerCommon(detail::AccessorImplPtr AccImpl,
                                  int AccTarget);
  void associateWithHandler(detail::AccessorBaseHost *AccBase,
                            access::target AccTarget);
  void associateWithHandler(detail::UnsampledImageAccessorBaseHost *AccBase,
                            image_target AccTarget);
  void associateWithHandler(detail::SampledImageAccessorBaseHost *AccBase,
                            image_target AccTarget);
#endif

  // Recursively calls itself until the argument pack is fully processed.
  // The version for a regular (standard layout) argument.
  template <typename T, typename... Ts>
  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) {
    set_arg(ArgIndex, std::move(Arg));
    setArgsHelper(++ArgIndex, std::move(Args)...);
  }

  void setArgsHelper(int) {}

  void setLocalAccessorArgHelper(int ArgIndex,
                                 detail::LocalAccessorBaseHost &LocalAccBase) {
    detail::LocalAccessorImplPtr LocalAccImpl =
        detail::getSyclObjImpl(LocalAccBase);
    detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
    MLocalAccStorage.push_back(std::move(LocalAccImpl));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(access::target::local), ArgIndex);
  }

  // setArgHelper for local accessor arguments (legacy accessor interface).
  template <typename DataT, int Dims, access::mode AccessMode,
            access::placeholder IsPlaceholder>
  void setArgHelper(int ArgIndex,
                    accessor<DataT, Dims, AccessMode, access::target::local,
                             IsPlaceholder> &&Arg) {
    (void)ArgIndex;
    (void)Arg;
#ifndef __SYCL_DEVICE_ONLY__
    setLocalAccessorArgHelper(ArgIndex, Arg);
#endif
  }

  // setArgHelper for local accessor arguments (up-to-date accessor interface).
  template <typename DataT, int Dims>
  void setArgHelper(int ArgIndex, local_accessor<DataT, Dims> &&Arg) {
    (void)ArgIndex;
    (void)Arg;
#ifndef __SYCL_DEVICE_ONLY__
    setLocalAccessorArgHelper(ArgIndex, Arg);
#endif
  }

  // setArgHelper for non-local accessor arguments.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  typename std::enable_if_t<AccessTarget != access::target::local, void>
  setArgHelper(
      int ArgIndex,
      accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
    detail::AccessorImplHost *Req = AccImpl.get();
    // Add accessor to the list of requirements.
    CGData.MRequirements.push_back(Req);
    // Store a copy of the accessor.
    CGData.MAccStorage.push_back(std::move(AccImpl));
    // Add accessor to the list of arguments.
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(AccessTarget), ArgIndex);
  }

  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));

    if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
                         sizeof(T), ArgIndex);
    } else {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
                         StoredArg, sizeof(T), ArgIndex);
    }
  }

  void setArgHelper(int ArgIndex, sampler &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
                       sizeof(sampler), ArgIndex);
  }

  // setArgHelper for graph dynamic_parameters.
  template <typename T>
  void
  setArgHelper(int ArgIndex,
               ext::oneapi::experimental::dynamic_parameter<T> &DynamicParam) {
    // Extract and copy the arg so we can move it into setArgHelper.
    T ArgValue =
        *static_cast<T *>(detail::getValueFromDynamicParameter(DynamicParam));
    // Set the arg in the handler as normal.
    setArgHelper(ArgIndex, std::move(ArgValue));
    // Register the dynamic parameter with the handler for later association
    // with the node being added.
    registerDynamicParameter(DynamicParam, ArgIndex);
  }

  void registerDynamicParameter(
      ext::oneapi::experimental::detail::dynamic_parameter_base
          &DynamicParamBase,
      int ArgIndex);

  // TODO: Unused. Remove when an ABI break is allowed.
  void verifyKernelInvoc(const kernel &Kernel) {
    std::ignore = Kernel;
    return;
  }

  /* The kernel passed to StoreLambda can take an id, an item or an nd_item as
   * its argument. Since the ESIMD plugin directly invokes the kernel (it
   * doesn't use piKernelSetArg), the kernel argument type must be known to
   * the plugin. However, passing the kernel argument type to the plugin
   * requires changing the ABI in the HostKernel class. To overcome this
   * problem, the helpers below wrap the "original" kernel with a functor that
   * always takes an nd_item as argument. A functor is used instead of a
   * lambda because extractArgsAndReqsFromLambda needs access to the
   * "original" kernel and keeps references to its internal data, i.e. the
   * kernel passed as an argument cannot be local in scope. The functor itself
   * is again encapsulated in a std::function since the functor's type is
   * unknown to the plugin.
   */
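
  // The pattern described above boils down to type erasure through
  // std::function. A standalone sketch of the same idea, with illustrative
  // names (FakeNdItem/Normalized/erase are not the real classes):
  //
  //   #include <functional>
  //   struct FakeNdItem { int GlobalId; };
  //   template <typename KernelType> struct Normalized {
  //     KernelType MKernelFunc;  // the "original" kernel, kept addressable
  //     void operator()(const FakeNdItem &It) const {
  //       MKernelFunc(It.GlobalId);  // adapt the argument type
  //     }
  //   };
  //   template <typename KernelType>
  //   std::function<void(const FakeNdItem &)> erase(KernelType Kernel) {
  //     return Normalized<KernelType>{Kernel};  // plugin sees one signature
  //   }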

  // For 'id, item w/wo offset, nd_item' kernel arguments.
  template <class KernelType, class NormalizedKernelType, int Dims>
  KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) {
    NormalizedKernelType NormalizedKernel(KernelFunc);
    auto NormalizedKernelFunc =
        std::function<void(const sycl::nd_item<Dims> &)>(NormalizedKernel);
    auto HostKernelPtr = new detail::HostKernel<decltype(NormalizedKernelFunc),
                                                sycl::nd_item<Dims>, Dims>(
        std::move(NormalizedKernelFunc));
    MHostKernel.reset(HostKernelPtr);
    return &HostKernelPtr->MKernel.template target<NormalizedKernelType>()
                ->MKernelFunc;
  }

  // For 'sycl::id<Dims>' kernel arguments.
  template <class KernelType, typename ArgT, int Dims>
  std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        detail::runKernelWithArg(MKernelFunc, Arg.get_global_id());
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::nd_item<Dims>' kernel arguments.
  template <class KernelType, typename ArgT, int Dims>
  std::enable_if_t<std::is_same_v<ArgT, sycl::nd_item<Dims>>, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        detail::runKernelWithArg(MKernelFunc, Arg);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::item<Dims, without_offset>' kernel arguments.
  template <class KernelType, typename ArgT, int Dims>
  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, false>>, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        sycl::item<Dims, false> Item = detail::Builder::createItem<Dims, false>(
            Arg.get_global_range(), Arg.get_global_id());
        detail::runKernelWithArg(MKernelFunc, Item);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::item<Dims, with_offset>' kernel arguments.
  template <class KernelType, typename ArgT, int Dims>
  std::enable_if_t<std::is_same_v<ArgT, sycl::item<Dims, true>>, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        sycl::item<Dims, true> Item = detail::Builder::createItem<Dims, true>(
            Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset());
        detail::runKernelWithArg(MKernelFunc, Item);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'void' kernel arguments (single_task).
  template <class KernelType, typename ArgT, int Dims>
  typename std::enable_if_t<std::is_same_v<ArgT, void>, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    struct NormalizedKernelType {
      KernelType MKernelFunc;
      NormalizedKernelType(const KernelType &KernelFunc)
          : MKernelFunc(KernelFunc) {}
      void operator()(const nd_item<Dims> &Arg) {
        (void)Arg;
        detail::runKernelWithoutArg(MKernelFunc);
      }
    };
    return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
        KernelFunc);
  }

  // For 'sycl::group<Dims>' kernel arguments.
  // The 'wrapper'-based approach using the 'NormalizedKernelType' struct is
  // not used for 'void(sycl::group<Dims>)' since that signature is not
  // supported in ESIMD.
  template <class KernelType, typename ArgT, int Dims>
  std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>, KernelType *>
  ResetHostKernel(const KernelType &KernelFunc) {
    MHostKernel.reset(
        new detail::HostKernel<KernelType, ArgT, Dims>(KernelFunc));
    return (KernelType *)(MHostKernel->getPtr());
  }

  void verifyUsedKernelBundle(const std::string &KernelName) {
    verifyUsedKernelBundleInternal(detail::string_view{KernelName});
  }
  void verifyUsedKernelBundleInternal(detail::string_view KernelName);

  template <typename KernelName, typename KernelType, int Dims,
            typename LambdaArgType>
  void StoreLambda(KernelType KernelFunc) {
    using KI = detail::KernelInfo<KernelName>;
    constexpr bool IsCallableWithKernelHandler =
        detail::KernelLambdaHasKernelHandlerArgT<KernelType,
                                                 LambdaArgType>::value;

    if (IsCallableWithKernelHandler && MIsHost) {
      throw sycl::feature_not_supported(
          "kernel_handler is not yet supported by host device.",
          PI_ERROR_INVALID_OPERATION);
    }

    KernelType *KernelPtr =
        ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);

    constexpr bool KernelHasName =
        KI::getName() != nullptr && KI::getName()[0] != '\0';

    // Some host compilers may have different captures from Clang. Currently
    // there is no stable way of handling this when extracting the captures,
    // so a static assert is made to fail for incompatible kernel lambdas.
    static_assert(
        !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
        "Unexpected kernel lambda size. This can be caused by an "
        "external host compiler producing a lambda with an "
        "unexpected layout. This is a limitation of the compiler. "
        "In many cases the difference is related to capturing constexpr "
        "variables. In such cases removing the constexpr specifier aligns "
        "the captures between the host compiler and the device compiler."
        "\n"
        "In case of MSVC, passing "
        "-fsycl-host-compiler-options='/std:c++latest' "
        "might also help.");

    // An empty name indicates that the compilation happened without an
    // integration header, so don't perform things that require it.
    if (KernelHasName) {
      // TODO: support ESIMD in the no-integration-header case too.
      MArgs.clear();
      extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
                                   KI::getNumParams(), &KI::getParamDesc(0),
                                   KI::isESIMD());
      MKernelName = KI::getName();
    } else {
      // Without the integration header it is necessary to process the
      // accessors from the list (which are associated with this handler) as
      // arguments. We must copy the associated accessors as they are checked
      // later during finalize.
      MArgs = MAssociatedAccesors;
    }

    // If the kernel lambda is callable with a kernel_handler argument,
    // manifest the associated kernel handler.
    if (IsCallableWithKernelHandler) {
      getOrInsertHandlerKernelBundle(/*Insert=*/true);
    }
  }

  void verifyDeviceHasProgressGuarantee(
      sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee,
      sycl::ext::oneapi::experimental::execution_scope threadScope,
      sycl::ext::oneapi::experimental::execution_scope coordinationScope);

  template <
      typename KernelName,
      typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
  void processProperties(PropertiesT Props) {
    using KI = detail::KernelInfo<KernelName>;
    static_assert(
        ext::oneapi::experimental::is_property_list<PropertiesT>::value,
        "Template type is not a property list.");
    static_assert(
        !PropertiesT::template has_property<
            sycl::ext::intel::experimental::fp_control_key>() ||
            (PropertiesT::template has_property<
                 sycl::ext::intel::experimental::fp_control_key>() &&
             KI::isESIMD()),
        "Floating point control property is supported for ESIMD kernels only.");
    static_assert(
        !PropertiesT::template has_property<
            sycl::ext::oneapi::experimental::indirectly_callable_key>(),
        "indirectly_callable property cannot be applied to SYCL kernels");
    if constexpr (PropertiesT::template has_property<
                      sycl::ext::intel::experimental::cache_config_key>()) {
      auto Config = Props.template get_property<
          sycl::ext::intel::experimental::cache_config_key>();
      if (Config == sycl::ext::intel::experimental::large_slm) {
        setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM);
      } else if (Config == sycl::ext::intel::experimental::large_data) {
        setKernelCacheConfig(PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA);
      }
    } else {
      std::ignore = Props;
    }

    constexpr bool UsesRootSync = PropertiesT::template has_property<
        sycl::ext::oneapi::experimental::use_root_sync_key>();
    setKernelIsCooperative(UsesRootSync);
    if constexpr (PropertiesT::template has_property<
                      sycl::ext::oneapi::experimental::
                          work_group_progress_key>()) {
      auto prop = Props.template get_property<
          sycl::ext::oneapi::experimental::work_group_progress_key>();
      verifyDeviceHasProgressGuarantee(
          prop.guarantee,
          sycl::ext::oneapi::experimental::execution_scope::work_group,
          prop.coordinationScope);
    }
    if constexpr (PropertiesT::template has_property<
                      sycl::ext::oneapi::experimental::
                          sub_group_progress_key>()) {
      auto prop = Props.template get_property<
          sycl::ext::oneapi::experimental::sub_group_progress_key>();
      verifyDeviceHasProgressGuarantee(
          prop.guarantee,
          sycl::ext::oneapi::experimental::execution_scope::sub_group,
          prop.coordinationScope);
    }
    if constexpr (PropertiesT::template has_property<
                      sycl::ext::oneapi::experimental::
                          work_item_progress_key>()) {
      auto prop = Props.template get_property<
          sycl::ext::oneapi::experimental::work_item_progress_key>();
      verifyDeviceHasProgressGuarantee(
          prop.guarantee,
          sycl::ext::oneapi::experimental::execution_scope::work_item,
          prop.coordinationScope);
    }
  }

  template <int Dims_Src, int Dims_Dst>
  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
                                           const range<Dims_Dst> Dst) {
    if (Dims_Src > Dims_Dst)
      return false;
    for (size_t I = 0; I < Dims_Src; ++I)
      if (Src[I] > Dst[I])
        return false;
    return true;
  }
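
  // For example, a 2x2 source region can be copied into a 4x4 destination as
  // a rectangular region, while the opposite direction cannot:
  //
  //   IsCopyingRectRegionAvailable(sycl::range<2>{2, 2},
  //                                sycl::range<2>{4, 4});  // true
  //   IsCopyingRectRegionAvailable(sycl::range<2>{4, 4},
  //                                sycl::range<2>{2, 2});  // false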

  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost &&
        IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
      return false;

    range<1> LinearizedRange(Src.size());
    parallel_for<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
                               ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
        LinearizedRange, [=](id<1> Id) {
          size_t Index = Id[0];
          id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
          id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
          Dst[DstId] = Src[SrcId];
        });
    return true;
  }

  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  std::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost)
      return false;

    single_task<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
                              ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
        [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
    return true;
  }

#ifndef __SYCL_DEVICE_ONLY__
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  std::enable_if_t<(Dim > 0)>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    range<Dim> Range = Src.get_range();
    parallel_for<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          using TSrcNonConst = typename std::remove_const_t<TSrc>;
          (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
        });
  }

  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  std::enable_if_t<Dim == 0>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    single_task<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        [=]() {
          using TSrcNonConst = typename std::remove_const_t<TSrc>;
          *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
        });
  }

  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  std::enable_if_t<(Dim > 0)>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    range<Dim> Range = Dst.get_range();
    parallel_for<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
        });
  }

  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  std::enable_if_t<Dim == 0>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    single_task<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
        [=]() {
          *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
        });
  }
#endif // __SYCL_DEVICE_ONLY__

  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
    return AccessTarget == access::target::device ||
           AccessTarget == access::target::constant_buffer;
  }

  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
    return AccessTarget == access::target::image ||
           AccessTarget == access::target::image_array;
  }

  constexpr static bool
  isValidTargetForExplicitOp(access::target AccessTarget) {
    return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
  }

  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::read ||
           AccessMode == access::mode::read_write;
  }

  constexpr static bool
  isValidModeForDestinationAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::write ||
           AccessMode == access::mode::read_write ||
           AccessMode == access::mode::discard_write ||
           AccessMode == access::mode::discard_read_write;
  }

  // PI APIs only support select fill sizes: 1, 2, 4, 8, 16, 32, 64, 128.
  constexpr static bool isBackendSupportedFillSize(size_t Size) {
    return Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16 ||
           Size == 32 || Size == 64 || Size == 128;
  }

  template <int Dims, typename LambdaArgType> struct TransformUserItemType {
    using type = std::conditional_t<
        std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
        std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
                           item<Dims>, LambdaArgType>>;
  };

  std::optional<std::array<size_t, 3>> getMaxWorkGroups();
  // We need to use this version to support gcc 7.5.0. Remove when the minimal
  // supported gcc version is bumped.
  std::tuple<std::array<size_t, 3>, bool> getMaxWorkGroups_v2();

  template <int Dims>
  std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange) {
    range<Dims> RoundedRange = UserRange;
    // Disable the rounding-up optimizations under these conditions:
    // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
    // 2. The kernel is provided via an interoperability method (this uses a
    //    different code path).
    // 3. The range is already a multiple of the rounding factor.
    //
    // Cases 2 and 3 could be supported with extra effort.
    // As an optimization for the common case it is an
    // implementation choice to not support those scenarios.
    // Note that "this_item" is a free function, i.e. not tied to any
    // specific id or item. When concurrent parallel_fors are executing
    // on a device it is difficult to tell which parallel_for the call is
    // being made from. One could replicate portions of the
    // call-graph to make this_item calls kernel-specific but this is
    // not considered worthwhile.

    // Perform range rounding if rounding-up is enabled.
    if (this->DisableRangeRounding())
      return {range<Dims>{}, false};

    // Range should be a multiple of this for reasonable performance.
    size_t MinFactorX = 16;
    // Range should be a multiple of this for improved performance.
    size_t GoodFactor = 32;
    // Range should be at least this to make rounding worthwhile.
    size_t MinRangeX = 1024;

    // Check if rounding parameters have been set through the environment:
    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
    this->GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);

    // In SYCL, each dimension of a global range size is specified by
    // a size_t, which can be up to 64 bits. All backends should be
    // able to accept a kernel launch with a 32-bit global range size
    // (i.e. do not throw an error). The OpenCL CPU backend will
    // accept every 64-bit global range, but the GPU backends will not
    // generally accept every 64-bit global range. So, when we get a
    // non-32-bit global range, we wrap the old kernel in a new kernel
    // that has each work item perform multiple invocations of the old
    // kernel in a 32-bit global range.
    id<Dims> MaxNWGs = [&] {
      auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2();
      if (!HasMaxWGs) {
        id<Dims> Default;
        for (int i = 0; i < Dims; ++i)
          Default[i] = (std::numeric_limits<int32_t>::max)();
        return Default;
      }

      id<Dims> IdResult;
      size_t Limit = (std::numeric_limits<int>::max)();
      for (int i = 0; i < Dims; ++i)
        IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
      return IdResult;
    }();
    auto M = (std::numeric_limits<uint32_t>::max)();
    range<Dims> MaxRange;
    for (int i = 0; i < Dims; ++i) {
      auto DesiredSize = MaxNWGs[i] * GoodFactor;
      MaxRange[i] =
          DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
    }

    bool DidAdjust = false;
    auto Adjust = [&](int Dim, size_t Value) {
      if (this->RangeRoundingTrace())
        std::cout << "parallel_for range adjusted at dim " << Dim << " from "
                  << RoundedRange[Dim] << " to " << Value << std::endl;
      RoundedRange[Dim] = Value;
      DidAdjust = true;
    };

#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
    size_t GoodExpFactor = 1;
    switch (Dims) {
    case 1:
      GoodExpFactor = 32; // Make global range multiple of {32}
      break;
    case 2:
      GoodExpFactor = 16; // Make global range multiple of {16, 16}
      break;
    case 3:
      GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
      break;
    }

    // Check if rounding parameters have been set through the environment:
    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
    this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);

    for (auto i = 0; i < Dims; ++i)
      if (UserRange[i] % GoodExpFactor) {
        Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
      }
#else
    // Perform range rounding if there are sufficient work-items to
    // need rounding and the user-specified range is not a multiple of
    // a "good" value.
    if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
      // It is sufficient to round up just the first dimension.
      // Multiplying the rounded-up value of the first dimension
      // by the values of the remaining dimensions (if any)
      // will yield a rounded-up value for the total range.
      Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
    }
#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
    // If we are forcing range rounding kernels to be used, we always want the
    // rounded range kernel to be generated, even if rounding isn't needed.
    DidAdjust = true;
#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__

    for (int i = 0; i < Dims; ++i)
      if (RoundedRange[i] > MaxRange[i])
        Adjust(i, MaxRange[i]);

    if (!DidAdjust)
      return {range<Dims>{}, false};
    return {RoundedRange, true};
  }
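
  // Worked example of the default (non-experimental) rounding above: with
  // MinFactorX = 16, GoodFactor = 32 and MinRangeX = 1024, a user range
  // {1000, 7} is launched as-is (1000 < 1024), while {1030, 7} is rounded in
  // dimension 0:
  //
  //   ((1030 + 32 - 1) / 32) * 32 == 1056  // launch {1056, 7}; the wrapper
  //                                        // filters out ids >= 1030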

  template <
      typename KernelName, typename KernelType, int Dims,
      typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
  void parallel_for_lambda_impl(range<Dims> UserRange, PropertiesT Props,
                                KernelType KernelFunc) {
    throwIfActionIsCreated();
    throwOnLocalAccessorMisuse<KernelName, KernelType>();
    if (!range_size_fits_in_size_t(UserRange))
      throw sycl::exception(make_error_code(errc::runtime),
                            "The total number of work-items in "
                            "a range must fit within size_t");

    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;

    // If a 1D kernel argument is an integral type, convert it to sycl::item<1>.
    // If the user type is convertible from sycl::item/sycl::nd_item, use
    // sycl::item/sycl::nd_item to transport the item information.
    using TransformedArgType = std::conditional_t<
        std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
        typename TransformUserItemType<Dims, LambdaArgType>::type>;

    static_assert(!std::is_same_v<TransformedArgType, sycl::nd_item<Dims>>,
                  "Kernel argument cannot have a sycl::nd_item type in "
                  "sycl::parallel_for with sycl::range");

    static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
                      std::is_convertible_v<item<Dims, false>, LambdaArgType>,
                  "sycl::parallel_for(sycl::range) kernel must have the "
                  "first argument of sycl::item type, or of a type which is "
                  "implicitly convertible from sycl::item");

    using RefLambdaArgType = std::add_lvalue_reference_t<LambdaArgType>;
    static_assert(
        (std::is_invocable_v<KernelType, RefLambdaArgType> ||
         std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
        "SYCL kernel lambda/functor has an unexpected signature, it should be "
        "invocable with sycl::item and optionally sycl::kernel_handler");

    // TODO: Properties may change the kernel function, so in order to avoid
    // conflicts they should be included in the name.
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;

    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());

    // Range rounding can be disabled by the user.
    // Range rounding is not done on the host device.
    // Range rounding is supported only for newer SYCL standards.
#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) &&                  \
    !defined(DPCPP_HOST_DEVICE_OPENMP) &&                                      \
    !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
    auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange);
    if (HasRoundedRange) {
      using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
      auto Wrapper =
          getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
              KernelFunc, UserRange);

      using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
                                       decltype(Wrapper), NameWT>;

      kernel_parallel_for_wrapper<KName, TransformedArgType, decltype(Wrapper),
                                  PropertiesT>(Wrapper);
#ifndef __SYCL_DEVICE_ONLY__
      // We are executing over the rounded range, but the items/ids constructed
      // inside the range-rounded kernel use the user range, which means that
      // __SYCL_ASSUME_INT can still be violated. So check the bounds
      // of the user range, instead of the rounded range.
      detail::checkValueRange<Dims>(UserRange);
      MNDRDesc.set(RoundedRange);
      StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
          std::move(Wrapper));
      setType(detail::CG::Kernel);
      setNDRangeUsed(false);
#endif
    } else
#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&
       // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&
       // SYCL_LANGUAGE_VERSION >= 202001
    {
      (void)UserRange;
      (void)Props;
#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
      // If parallel_for range rounding is forced, then only the range-rounded
      // kernel is generated.
      kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
                                  PropertiesT>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
      processProperties<NameT, PropertiesT>(Props);
      detail::checkValueRange<Dims>(UserRange);
      MNDRDesc.set(std::move(UserRange));
      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
          std::move(KernelFunc));
      setType(detail::CG::Kernel);
      setNDRangeUsed(false);
#endif
#else
      (void)KernelFunc;
#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
    }
  }
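
  // From the user's perspective this machinery is transparent; a plain range
  // launch (sketch, assuming default range-rounding settings):
  //
  //   #include <sycl/sycl.hpp>
  //   int main() {
  //     sycl::queue q;
  //     int *Data = sycl::malloc_shared<int>(1030, q);
  //     // May execute over a rounded range {1056} under the hood; ids beyond
  //     // 1029 are filtered out by the RoundedRangeKernel wrapper.
  //     q.parallel_for(sycl::range<1>{1030},
  //                    [=](sycl::item<1> It) { Data[It.get_id(0)] = 1; });
  //     q.wait();
  //     sycl::free(Data, q);
  //   }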

  template <typename KernelName, typename KernelType, int Dims,
            typename PropertiesT>
  void parallel_for_impl(nd_range<Dims> ExecutionRange, PropertiesT Props,
                         _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    // TODO: Properties may change the kernel function, so in order to avoid
    // conflicts they should be included in the name.
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
    static_assert(
        std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
        "Kernel argument of a sycl::parallel_for with sycl::nd_range "
        "must be either sycl::nd_item or be convertible from sycl::nd_item");
    using TransformedArgType = sycl::nd_item<Dims>;

    (void)ExecutionRange;
    (void)Props;
    kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
                                PropertiesT>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    processProperties<NameT, PropertiesT>(Props);
    detail::checkValueRange<Dims>(ExecutionRange);
    MNDRDesc.set(std::move(ExecutionRange));
    StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
        std::move(KernelFunc));
    setType(detail::CG::Kernel);
    setNDRangeUsed(true);
#endif
  }

  template <int Dims>
  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
    throwIfActionIsCreated();
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    detail::checkValueRange<Dims>(NumWorkItems);
    MNDRDesc.set(std::move(NumWorkItems));
    setType(detail::CG::Kernel);
    setNDRangeUsed(false);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }

  template <
      typename KernelName, typename KernelType, int Dims,
      typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
                                           PropertiesT Props,
                                           _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    // TODO: Properties may change the kernel function, so in order to avoid
    // conflicts they should be included in the name.
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
    (void)NumWorkGroups;
    (void)Props;
    kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
                                           PropertiesT>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    processProperties<NameT, PropertiesT>(Props);
    detail::checkValueRange<Dims>(NumWorkGroups);
    MNDRDesc.setNumWorkGroups(NumWorkGroups);
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
    setNDRangeUsed(false);
#endif // __SYCL_DEVICE_ONLY__
  }

  template <
      typename KernelName, typename KernelType, int Dims,
      typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
  void parallel_for_work_group_lambda_impl(range<Dims> NumWorkGroups,
                                           range<Dims> WorkGroupSize,
                                           PropertiesT Props,
                                           _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    // TODO: Properties may change the kernel function, so in order to avoid
    // conflicts they should be included in the name.
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
    (void)NumWorkGroups;
    (void)WorkGroupSize;
    (void)Props;
    kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType, KernelType,
                                           PropertiesT>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    processProperties<NameT, PropertiesT>(Props);
    nd_range<Dims> ExecRange =
        nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
    detail::checkValueRange<Dims>(ExecRange);
    MNDRDesc.set(std::move(ExecRange));
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif // __SYCL_DEVICE_ONLY__
  }

#ifdef SYCL_LANGUAGE_VERSION
#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
#else
#define __SYCL_KERNEL_ATTR__
#endif

  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename KernelType, typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      "sycl-single-task",
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      nullptr,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc();
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename KernelType, typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      "sycl-single-task",
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      nullptr,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc),
                                               kernel_handler KH) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the names of these functions - "kernel_parallel_for" - are used by
  // the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType,
            typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the names of these functions - "kernel_parallel_for" - are used by
  // the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType,
            typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc),
                                                kernel_handler KH) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType,
            typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ void
  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc)) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType,
            typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ void
  kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc),
                                 kernel_handler KH) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

1703  template <typename... Props> struct KernelPropertiesUnpackerImpl {
1704  // Just pass extra Props... as template parameters to the underlying
1705  // Caller->* member functions. Don't have reflection so try to use
1706  // templates as much as possible to reduce the amount of boilerplate code
1707  // needed. All the type checks are expected to be done at the Caller's
1708  // methods side.
1709 
1710  template <typename... TypesToForward, typename... ArgsTy>
1711  static void kernel_single_task_unpack(handler *h, ArgsTy... Args) {
1712  h->kernel_single_task<TypesToForward..., Props...>(Args...);
1713  }
1714 
1715  template <typename... TypesToForward, typename... ArgsTy>
1716  static void kernel_parallel_for_unpack(handler *h, ArgsTy... Args) {
1717  h->kernel_parallel_for<TypesToForward..., Props...>(Args...);
1718  }
1719 
1720  template <typename... TypesToForward, typename... ArgsTy>
1721  static void kernel_parallel_for_work_group_unpack(handler *h,
1722  ArgsTy... Args) {
1723  h->kernel_parallel_for_work_group<TypesToForward..., Props...>(Args...);
1724  }
1725  };
1726 
1727  template <typename PropertiesT>
1728  struct KernelPropertiesUnpacker : public KernelPropertiesUnpackerImpl<> {
1729  // This should always fail outside the specialization below but must be
1730  // dependent to avoid failing even if not instantiated.
1731  static_assert(
1732  ext::oneapi::experimental::is_property_list<PropertiesT>::value,
1733  "Template type is not a property list.");
1734  };
1735 
1736  template <typename... Props>
1737  struct KernelPropertiesUnpacker<
1738      ext::oneapi::experimental::detail::properties_t<Props...>>
1739      : public KernelPropertiesUnpackerImpl<Props...> {};
1740 
1741  // Helper function to
1742  //
1743  // * Make use of the KernelPropertiesUnpacker above
1744  // * Decide if we need an extra kernel_handler parameter
1745  //
1746  // The interface uses a \p Lambda callback to propagate that information back
1747  // to the caller as we need the caller to communicate:
1748  //
1749  // * The name of the method to call
1750  // * The explicit template type parameters for the call
1751  //
1752  // Couldn't think of a better way to achieve both.
1753  template <typename KernelName, typename KernelType, typename PropertiesT,
1754  bool HasKernelHandlerArg, typename FuncTy>
1755  void unpack(_KERNELFUNCPARAM(KernelFunc), FuncTy Lambda) {
1756 #ifdef __SYCL_DEVICE_ONLY__
1757  detail::CheckDeviceCopyable<KernelType>();
1758 #endif // __SYCL_DEVICE_ONLY__
1759  using MergedPropertiesT =
1760  typename detail::GetMergedKernelProperties<KernelType,
1761  PropertiesT>::type;
1762  using Unpacker = KernelPropertiesUnpacker<MergedPropertiesT>;
1763 #ifndef __SYCL_DEVICE_ONLY__
1764  // If there are properties provided by get method then process them.
1765  if constexpr (ext::oneapi::experimental::detail::
1766  HasKernelPropertiesGetMethod<
1767  _KERNELFUNCPARAMTYPE>::value) {
1768  processProperties<KernelName>(
1769  KernelFunc.get(ext::oneapi::experimental::properties_tag{}));
1770  }
1771 #endif
1772  if constexpr (HasKernelHandlerArg) {
1773  kernel_handler KH;
1774  Lambda(Unpacker{}, this, KernelFunc, KH);
1775  } else {
1776  Lambda(Unpacker{}, this, KernelFunc);
1777  }
1778  }
1779 
1780  // NOTE: to support the kernel_handler argument in kernel lambdas, only
1781  // the kernel_***_wrapper functions must be called in this code.
1782 
1783  template <
1784  typename KernelName, typename KernelType,
1785  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1786  void kernel_single_task_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1787    unpack<KernelName, KernelType, PropertiesT,
1788           detail::KernelLambdaHasKernelHandlerArgT<KernelType>::value>(
1789        KernelFunc, [&](auto Unpacker, auto... args) {
1790  Unpacker.template kernel_single_task_unpack<KernelName, KernelType>(
1791  args...);
1792  });
1793  }
1794 
1795  template <
1796  typename KernelName, typename ElementType, typename KernelType,
1797  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1798  void kernel_parallel_for_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1799  unpack<KernelName, KernelType, PropertiesT,
1800  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1801  ElementType>::value>(
1802  KernelFunc, [&](auto Unpacker, auto... args) {
1803  Unpacker.template kernel_parallel_for_unpack<KernelName, ElementType,
1804  KernelType>(args...);
1805  });
1806  }
1807 
1808  template <
1809  typename KernelName, typename ElementType, typename KernelType,
1810  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1811  void kernel_parallel_for_work_group_wrapper(_KERNELFUNCPARAM(KernelFunc)) {
1812  unpack<KernelName, KernelType, PropertiesT,
1813  detail::KernelLambdaHasKernelHandlerArgT<KernelType,
1814  ElementType>::value>(
1815  KernelFunc, [&](auto Unpacker, auto... args) {
1816  Unpacker.template kernel_parallel_for_work_group_unpack<
1817  KernelName, ElementType, KernelType>(args...);
1818  });
1819  }
1820 
1828  template <
1829  typename KernelName, typename KernelType,
1830  typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
1831  void single_task_lambda_impl(PropertiesT Props,
1832  _KERNELFUNCPARAM(KernelFunc)) {
1833  (void)Props;
1834  throwIfActionIsCreated();
1835  throwOnLocalAccessorMisuse<KernelName, KernelType>();
1836  // TODO: Properties may change the kernel function, so in order to avoid
1837  // conflicts they should be included in the name.
1838    using NameT =
1839        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1840    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
1841  kernel_single_task_wrapper<NameT, KernelType, PropertiesT>(KernelFunc);
1842 #ifndef __SYCL_DEVICE_ONLY__
1843  // No need to check if range is out of INT_MAX limits as it's compile-time
1844  // known constant.
1845  MNDRDesc.set(range<1>{1});
1846  processProperties<NameT, PropertiesT>(Props);
1847  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
1848  setType(detail::CG::Kernel);
1849 #endif
1850  }
1851 
1852  void setStateExplicitKernelBundle();
1853  void setStateSpecConstSet();
1854  bool isStateExplicitKernelBundle() const;
1855 
1856  std::shared_ptr<detail::kernel_bundle_impl>
1857  getOrInsertHandlerKernelBundle(bool Insert) const;
1858 
1859  void setHandlerKernelBundle(kernel Kernel);
1860 
1861  void setHandlerKernelBundle(
1862  const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);
1863 
1864  template <typename FuncT>
1865  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
1866  void()>::value ||
1867  detail::check_fn_signature<std::remove_reference_t<FuncT>,
1868  void(interop_handle)>::value>
1869  host_task_impl(FuncT &&Func);
1870 
1874  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
1875  getCommandGraph() const;
1876 
1883  void setUserFacingNodeType(ext::oneapi::experimental::node_type Type);
1884 
1885 public:
1886  handler(const handler &) = delete;
1887  handler(handler &&) = delete;
1888  handler &operator=(const handler &) = delete;
1889  handler &operator=(handler &&) = delete;
1890 
1891  template <auto &SpecName>
1892  void set_specialization_constant(
1893      typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {
1894 
1895  setStateSpecConstSet();
1896 
1897  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1898  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1899 
1900    detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1901        KernelBundleImplPtr)
1902        .set_specialization_constant<SpecName>(Value);
1903  }
1904 
1905  template <auto &SpecName>
1906  typename std::remove_reference_t<decltype(SpecName)>::value_type
1907  get_specialization_constant() const {
1908 
1909    if (isStateExplicitKernelBundle())
1910      throw sycl::exception(make_error_code(errc::invalid),
1911                            "Specialization constants cannot be read after "
1912                            "explicitly setting the used kernel bundle");
1913 
1914  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
1915  getOrInsertHandlerKernelBundle(/*Insert=*/true);
1916 
1917    return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
1918               KernelBundleImplPtr)
1919        .get_specialization_constant<SpecName>();
1920  }
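 // Illustrative usage (not part of this header): a minimal sketch of writing a
 // specialization constant from the handler and reading it inside a kernel via
 // kernel_handler. `IntConst` and the kernel name are assumptions of this
 // sketch, e.g. `constexpr sycl::specialization_id<int> IntConst{1};` declared
 // at namespace scope:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     CGH.set_specialization_constant<IntConst>(7);
 //     CGH.single_task<class UseSpecConst>([=](sycl::kernel_handler KH) {
 //       int V = KH.get_specialization_constant<IntConst>();
 //       // ... use V ...
 //     });
 //   });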
1921 
1922  void
1923  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle);
1924 
1933  template <typename DataT, int Dims, access::mode AccMode,
1934            access::target AccTarget, access::placeholder isPlaceholder>
1935  void require(accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder> Acc) {
1936  if (Acc.is_placeholder())
1937  associateWithHandler(&Acc, AccTarget);
1938  }
1939 
1949  template <typename DataT, int Dims, access::mode AccMode,
1950            access::target AccTarget, access::placeholder isPlaceholder>
1951  void require(ext::oneapi::experimental::dynamic_parameter<
1952               accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder>>
1953                   dynamicParamAcc) {
1954    using AccT = accessor<DataT, Dims, AccMode, AccTarget, isPlaceholder>;
1955    AccT Acc = *static_cast<AccT *>(
1956  detail::getValueFromDynamicParameter(dynamicParamAcc));
1957  if (Acc.is_placeholder())
1958  associateWithHandler(&Acc, AccTarget);
1959  }
1960 
1964  void depends_on(event Event);
1965 
1969  void depends_on(const std::vector<event> &Events);
1970 
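 // Illustrative usage (not part of this header): making one command group wait
 // on the event of a previous submission:
 //
 //   sycl::event E = Q.submit([&](sycl::handler &CGH) { /* producer */ });
 //   Q.submit([&](sycl::handler &CGH) {
 //     CGH.depends_on(E); // this command group runs only after E completes
 //     /* consumer */
 //   });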
1971  template <typename T>
1972  using remove_cv_ref_t = typename std::remove_cv_t<std::remove_reference_t<T>>;
1973 
1974  template <typename U, typename T>
1975  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;
1976 
1977  template <typename T> struct ShouldEnableSetArg {
1978  static constexpr bool value =
1979  std::is_trivially_copyable_v<std::remove_reference_t<T>>
1980 #if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
1981  && std::is_standard_layout<std::remove_reference_t<T>>::value
1982 #endif
1983        || is_same_type<sampler, T>::value // Sampler
1984        || (!is_same_type<cl_mem, T>::value &&
1985            std::is_pointer_v<remove_cv_ref_t<T>>) // USM
1986        || is_same_type<cl_mem, T>::value; // Interop
1987  };
1988 
1995  template <typename T>
1996  typename std::enable_if_t<ShouldEnableSetArg<T>::value, void>
1997  set_arg(int ArgIndex, T &&Arg) {
1998  setArgHelper(ArgIndex, std::move(Arg));
1999  }
2000 
2001  template <typename DataT, int Dims, access::mode AccessMode,
2002            access::target AccessTarget, access::placeholder IsPlaceholder>
2003  void
2004  set_arg(int ArgIndex,
2005          accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
2006    setArgHelper(ArgIndex, std::move(Arg));
2007  }
2008 
2009  template <typename DataT, int Dims>
2010  void set_arg(int ArgIndex, local_accessor<DataT, Dims> Arg) {
2011  setArgHelper(ArgIndex, std::move(Arg));
2012  }
2013 
2014  // set_arg for graph dynamic_parameters
2015  template <typename T>
2016  void set_arg(int argIndex,
2017               ext::oneapi::experimental::dynamic_parameter<T> &dynamicParam) {
2018    setArgHelper(argIndex, dynamicParam);
2019  }
2020 
2026  template <typename... Ts> void set_args(Ts &&...Args) {
2027  setArgsHelper(0, std::move(Args)...);
2028  }
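 // Illustrative usage (not part of this header): setting the arguments of a
 // pre-compiled (e.g. backend-interop) kernel before launching it; `K`, `Buf`,
 // `Scalar` and the argument order are assumptions of this sketch:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     sycl::accessor Acc{Buf, CGH, sycl::read_write};
 //     CGH.set_args(Acc, Scalar); // same as set_arg(0, Acc); set_arg(1, Scalar);
 //     CGH.parallel_for(sycl::range<1>{N}, K);
 //   });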
2029 
2037  template <typename KernelName = detail::auto_name, typename KernelType>
2038  void single_task(_KERNELFUNCPARAM(KernelFunc)) {
2039    single_task_lambda_impl<KernelName>(
2040        ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2041  }
2042 
2043  template <typename KernelName = detail::auto_name, typename KernelType>
2044  void parallel_for(range<1> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2045    parallel_for_lambda_impl<KernelName>(
2046        NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2047  std::move(KernelFunc));
2048  }
2049 
2050  template <typename KernelName = detail::auto_name, typename KernelType>
2051  void parallel_for(range<2> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2052    parallel_for_lambda_impl<KernelName>(
2053        NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2054  std::move(KernelFunc));
2055  }
2056 
2057  template <typename KernelName = detail::auto_name, typename KernelType>
2058  void parallel_for(range<3> NumWorkItems, _KERNELFUNCPARAM(KernelFunc)) {
2059    parallel_for_lambda_impl<KernelName>(
2060        NumWorkItems, ext::oneapi::experimental::empty_properties_t{},
2061  std::move(KernelFunc));
2062  }
2063 
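 // Illustrative usage (not part of this header): the common lambda form of
 // parallel_for over a range; `Buf` and `N` are assumptions of this sketch:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     sycl::accessor Acc{Buf, CGH, sycl::write_only};
 //     CGH.parallel_for(sycl::range<2>{N, N},
 //                      [=](sycl::item<2> It) { Acc[It.get_id()] = 0.0f; });
 //   });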
2065  template <typename FuncT>
2066  std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
2067                                              void()>::value ||
2068                   detail::check_fn_signature<std::remove_reference_t<FuncT>,
2069                                              void(interop_handle)>::value>
2070  host_task(FuncT &&Func) {
2071  host_task_impl(Func);
2072  }
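 // Illustrative usage (not part of this header): a host_task executes on the
 // host once its dependencies are met; the interop_handle overload additionally
 // exposes native backend objects:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     CGH.host_task([=](sycl::interop_handle IH) {
 //       // e.g. retrieve a native queue/memory handle from IH for interop
 //     });
 //   });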
2073 
2087  template <typename KernelName = detail::auto_name, typename KernelType,
2088  int Dims>
2089  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
2090  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2091  _KERNELFUNCPARAM(KernelFunc)) {
2092  throwIfActionIsCreated();
2093    using NameT =
2094        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2095    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2096  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2097  using TransformedArgType = std::conditional_t<
2098  std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
2099  typename TransformUserItemType<Dims, LambdaArgType>::type>;
2100  (void)NumWorkItems;
2101  (void)WorkItemOffset;
2102  kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
2103 #ifndef __SYCL_DEVICE_ONLY__
2104  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2105  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2106  StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
2107  std::move(KernelFunc));
2108  setType(detail::CG::Kernel);
2109  setNDRangeUsed(false);
2110 #endif
2111  }
2112 
2123  template <typename KernelName = detail::auto_name, typename KernelType,
2124  int Dims>
2125  void parallel_for_work_group(range<Dims> NumWorkGroups,
2126                               _KERNELFUNCPARAM(KernelFunc)) {
2127    parallel_for_work_group_lambda_impl<KernelName>(
2128        NumWorkGroups, ext::oneapi::experimental::empty_properties_t{},
2129        KernelFunc);
2130  }
2131 
2144  template <typename KernelName = detail::auto_name, typename KernelType,
2145  int Dims>
2146  void parallel_for_work_group(range<Dims> NumWorkGroups,
2147                               range<Dims> WorkGroupSize,
2148                               _KERNELFUNCPARAM(KernelFunc)) {
2149    parallel_for_work_group_lambda_impl<KernelName>(
2150        NumWorkGroups, WorkGroupSize,
2151        ext::oneapi::experimental::empty_properties_t{}, KernelFunc);
2152  }
2153 
2160  void single_task(kernel Kernel) {
2161  throwIfActionIsCreated();
2162  // Ignore any set kernel bundles and use the one associated with the kernel
2163  setHandlerKernelBundle(Kernel);
2164  // No need to check if range is out of INT_MAX limits as it's compile-time
2165  // known constant
2166  MNDRDesc.set(range<1>{1});
2167  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2168  setType(detail::CG::Kernel);
2169  extractArgsAndReqs();
2170  MKernelName = getKernelName();
2171  }
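 // Illustrative usage (not part of this header): launching a pre-compiled
 // kernel object obtained from an executable kernel_bundle; `KbExe` and the
 // kernel name `MyKernel` are assumptions of this sketch:
 //
 //   sycl::kernel K = KbExe.get_kernel(sycl::get_kernel_id<MyKernel>());
 //   Q.submit([&](sycl::handler &CGH) { CGH.single_task(K); });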
2172 
2173  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
2174  parallel_for_impl(NumWorkItems, Kernel);
2175  }
2176 
2177  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
2178  parallel_for_impl(NumWorkItems, Kernel);
2179  }
2180 
2181  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
2182  parallel_for_impl(NumWorkItems, Kernel);
2183  }
2184 
2193  template <int Dims>
2194  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2195  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
2196  kernel Kernel) {
2197  throwIfActionIsCreated();
2198  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2199  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2200  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2201  setType(detail::CG::Kernel);
2202  setNDRangeUsed(false);
2203  extractArgsAndReqs();
2204  MKernelName = getKernelName();
2205  }
2206 
2215  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
2216  throwIfActionIsCreated();
2217  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2218  detail::checkValueRange<Dims>(NDRange);
2219  MNDRDesc.set(std::move(NDRange));
2220  setType(detail::CG::Kernel);
2221  setNDRangeUsed(true);
2222  extractArgsAndReqs();
2223  MKernelName = getKernelName();
2224  }
2225 
2232  template <typename KernelName = detail::auto_name, typename KernelType>
2233  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
2234  throwIfActionIsCreated();
2235  // Ignore any set kernel bundles and use the one associated with the kernel
2236  setHandlerKernelBundle(Kernel);
2237    using NameT =
2238        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2239    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2240  (void)Kernel;
2241  kernel_single_task<NameT>(KernelFunc);
2242 #ifndef __SYCL_DEVICE_ONLY__
2243  // No need to check if range is out of INT_MAX limits as it's compile-time
2244  // known constant
2245  MNDRDesc.set(range<1>{1});
2246  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2247  setType(detail::CG::Kernel);
2248  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2249  extractArgsAndReqs();
2250  MKernelName = getKernelName();
2251  } else
2252  StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
2253 #else
2254  detail::CheckDeviceCopyable<KernelType>();
2255 #endif
2256  }
2257 
2265  template <typename KernelName = detail::auto_name, typename KernelType,
2266  int Dims>
2267  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2268  _KERNELFUNCPARAM(KernelFunc)) {
2269  throwIfActionIsCreated();
2270  // Ignore any set kernel bundles and use the one associated with the kernel
2271  setHandlerKernelBundle(Kernel);
2272    using NameT =
2273        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2274    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2275  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2276  (void)Kernel;
2277  (void)NumWorkItems;
2278  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2279 #ifndef __SYCL_DEVICE_ONLY__
2280  detail::checkValueRange<Dims>(NumWorkItems);
2281  MNDRDesc.set(std::move(NumWorkItems));
2282  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2283  setType(detail::CG::Kernel);
2284  setNDRangeUsed(false);
2285  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2286  extractArgsAndReqs();
2287  MKernelName = getKernelName();
2288  } else
2289  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2290  std::move(KernelFunc));
2291 #endif
2292  }
2293 
2303  template <typename KernelName = detail::auto_name, typename KernelType,
2304  int Dims>
2305  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
2306  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
2307  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
2308  throwIfActionIsCreated();
2309  // Ignore any set kernel bundles and use the one associated with the kernel
2310  setHandlerKernelBundle(Kernel);
2311    using NameT =
2312        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2313    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2314  using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
2315  (void)Kernel;
2316  (void)NumWorkItems;
2317  (void)WorkItemOffset;
2318  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2319 #ifndef __SYCL_DEVICE_ONLY__
2320  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
2321  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
2322  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2323  setType(detail::CG::Kernel);
2324  setNDRangeUsed(false);
2325  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2326  extractArgsAndReqs();
2327  MKernelName = getKernelName();
2328  } else
2329  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2330  std::move(KernelFunc));
2331 #endif
2332  }
2333 
2343  template <typename KernelName = detail::auto_name, typename KernelType,
2344  int Dims>
2345  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
2346  _KERNELFUNCPARAM(KernelFunc)) {
2347  throwIfActionIsCreated();
2348  // Ignore any set kernel bundles and use the one associated with the kernel
2349  setHandlerKernelBundle(Kernel);
2350    using NameT =
2351        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2352    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2353  using LambdaArgType =
2354  sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
2355  (void)Kernel;
2356  (void)NDRange;
2357  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
2358 #ifndef __SYCL_DEVICE_ONLY__
2359  detail::checkValueRange<Dims>(NDRange);
2360  MNDRDesc.set(std::move(NDRange));
2361  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2362  setType(detail::CG::Kernel);
2363  setNDRangeUsed(true);
2364  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
2365  extractArgsAndReqs();
2366  MKernelName = getKernelName();
2367  } else
2368  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
2369  std::move(KernelFunc));
2370 #endif
2371  }
2372 
2386  template <typename KernelName = detail::auto_name, typename KernelType,
2387  int Dims>
2388  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2389  _KERNELFUNCPARAM(KernelFunc)) {
2390  throwIfActionIsCreated();
2391  // Ignore any set kernel bundles and use the one associated with the kernel
2392  setHandlerKernelBundle(Kernel);
2393    using NameT =
2394        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2395    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2396  using LambdaArgType =
2397  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2398  (void)Kernel;
2399  (void)NumWorkGroups;
2400  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2401 #ifndef __SYCL_DEVICE_ONLY__
2402  detail::checkValueRange<Dims>(NumWorkGroups);
2403  MNDRDesc.setNumWorkGroups(NumWorkGroups);
2404  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2405  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2406  setType(detail::CG::Kernel);
2407 #endif // __SYCL_DEVICE_ONLY__
2408  }
2409 
2425  template <typename KernelName = detail::auto_name, typename KernelType,
2426  int Dims>
2427  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2428                               range<Dims> WorkGroupSize,
2429                               _KERNELFUNCPARAM(KernelFunc)) {
2430  throwIfActionIsCreated();
2431  // Ignore any set kernel bundles and use the one associated with the kernel
2432  setHandlerKernelBundle(Kernel);
2433    using NameT =
2434        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2435    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
2436  using LambdaArgType =
2437  sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
2438  (void)Kernel;
2439  (void)NumWorkGroups;
2440  (void)WorkGroupSize;
2441  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2442 #ifndef __SYCL_DEVICE_ONLY__
2443  nd_range<Dims> ExecRange =
2444  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2445  detail::checkValueRange<Dims>(ExecRange);
2446  MNDRDesc.set(std::move(ExecRange));
2447  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2448  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2449  setType(detail::CG::Kernel);
2450 #endif // __SYCL_DEVICE_ONLY__
2451  }
2452 
2453  template <typename KernelName = detail::auto_name, typename KernelType,
2454  typename PropertiesT>
2455  std::enable_if_t<
2456      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2457  single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) {
2458  single_task_lambda_impl<KernelName, KernelType, PropertiesT>(Props,
2459  KernelFunc);
2460  }
2461 
2462  template <typename KernelName = detail::auto_name, typename KernelType,
2463  typename PropertiesT>
2464  std::enable_if_t<
2465      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2466  parallel_for(range<1> NumWorkItems, PropertiesT Props,
2467  _KERNELFUNCPARAM(KernelFunc)) {
2468  parallel_for_lambda_impl<KernelName, KernelType, 1, PropertiesT>(
2469  NumWorkItems, Props, std::move(KernelFunc));
2470  }
2471 
2472  template <typename KernelName = detail::auto_name, typename KernelType,
2473  typename PropertiesT>
2474  std::enable_if_t<
2475      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2476  parallel_for(range<2> NumWorkItems, PropertiesT Props,
2477  _KERNELFUNCPARAM(KernelFunc)) {
2478  parallel_for_lambda_impl<KernelName, KernelType, 2, PropertiesT>(
2479  NumWorkItems, Props, std::move(KernelFunc));
2480  }
2481 
2482  template <typename KernelName = detail::auto_name, typename KernelType,
2483  typename PropertiesT>
2484  std::enable_if_t<
2485      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2486  parallel_for(range<3> NumWorkItems, PropertiesT Props,
2487  _KERNELFUNCPARAM(KernelFunc)) {
2488  parallel_for_lambda_impl<KernelName, KernelType, 3, PropertiesT>(
2489  NumWorkItems, Props, std::move(KernelFunc));
2490  }
2491 
2492  template <typename KernelName = detail::auto_name, typename KernelType,
2493  typename PropertiesT, int Dims>
2494  std::enable_if_t<
2495      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2496  parallel_for(nd_range<Dims> Range, PropertiesT Properties,
2497  _KERNELFUNCPARAM(KernelFunc)) {
2498  parallel_for_impl<KernelName>(Range, Properties, std::move(KernelFunc));
2499  }
2500 
2502 
2503  template <typename KernelName = detail::auto_name, typename PropertiesT,
2504  typename... RestT>
2505  std::enable_if_t<
2506  (sizeof...(RestT) > 1) &&
2507      detail::AreAllButLastReductions<RestT...>::value &&
2508      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2509  parallel_for(range<1> Range, PropertiesT Properties, RestT &&...Rest) {
2510  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2511  UnsupportedGraphFeatures::sycl_reductions>();
2512  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2513  std::forward<RestT>(Rest)...);
2514  }
2515 
2516  template <typename KernelName = detail::auto_name, typename PropertiesT,
2517  typename... RestT>
2518  std::enable_if_t<
2519  (sizeof...(RestT) > 1) &&
2520      detail::AreAllButLastReductions<RestT...>::value &&
2521      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2522  parallel_for(range<2> Range, PropertiesT Properties, RestT &&...Rest) {
2523  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2524  UnsupportedGraphFeatures::sycl_reductions>();
2525  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2526  std::forward<RestT>(Rest)...);
2527  }
2528 
2529  template <typename KernelName = detail::auto_name, typename PropertiesT,
2530  typename... RestT>
2531  std::enable_if_t<
2532  (sizeof...(RestT) > 1) &&
2533      detail::AreAllButLastReductions<RestT...>::value &&
2534      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2535  parallel_for(range<3> Range, PropertiesT Properties, RestT &&...Rest) {
2536  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2537  UnsupportedGraphFeatures::sycl_reductions>();
2538  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2539  std::forward<RestT>(Rest)...);
2540  }
2541 
2542  template <typename KernelName = detail::auto_name, typename... RestT>
2543  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2544  parallel_for(range<1> Range, RestT &&...Rest) {
2545    parallel_for<KernelName>(Range,
2546                             ext::oneapi::experimental::empty_properties_t{},
2547  std::forward<RestT>(Rest)...);
2548  }
2549 
2550  template <typename KernelName = detail::auto_name, typename... RestT>
2551  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2552  parallel_for(range<2> Range, RestT &&...Rest) {
2553    parallel_for<KernelName>(Range,
2554                             ext::oneapi::experimental::empty_properties_t{},
2555  std::forward<RestT>(Rest)...);
2556  }
2557 
2558  template <typename KernelName = detail::auto_name, typename... RestT>
2559  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2560  parallel_for(range<3> Range, RestT &&...Rest) {
2561    parallel_for<KernelName>(Range,
2562                             ext::oneapi::experimental::empty_properties_t{},
2563  std::forward<RestT>(Rest)...);
2564  }
2565 
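 // Illustrative usage (not part of this header): the reduction shortcut, where
 // everything between the range and the kernel is treated as reductions; `Sum`
 // is a device-USM int pointer in this sketch:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     CGH.parallel_for(sycl::range<1>{N},
 //                      sycl::reduction(Sum, sycl::plus<int>{}),
 //                      [=](sycl::id<1>, auto &Red) { Red += 1; });
 //   });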
2566  template <typename KernelName = detail::auto_name, int Dims,
2567  typename PropertiesT, typename... RestT>
2568  std::enable_if_t<
2569  (sizeof...(RestT) > 1) &&
2570      detail::AreAllButLastReductions<RestT...>::value &&
2571      ext::oneapi::experimental::is_property_list<PropertiesT>::value>
2572  parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
2573  throwIfGraphAssociated<ext::oneapi::experimental::detail::
2574  UnsupportedGraphFeatures::sycl_reductions>();
2575  detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
2576  std::forward<RestT>(Rest)...);
2577  }
2578 
2579  template <typename KernelName = detail::auto_name, int Dims,
2580  typename... RestT>
2581  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
2582  parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
2583    parallel_for<KernelName>(Range,
2584                             ext::oneapi::experimental::empty_properties_t{},
2585  std::forward<RestT>(Rest)...);
2586  }
2587 
2589 
2590  template <typename KernelName = detail::auto_name, typename KernelType,
2591  int Dims, typename PropertiesT>
2592  void parallel_for_work_group(range<Dims> NumWorkGroups, PropertiesT Props,
2593  _KERNELFUNCPARAM(KernelFunc)) {
2594  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2595  PropertiesT>(NumWorkGroups, Props,
2596  KernelFunc);
2597  }
2598 
2599  template <typename KernelName = detail::auto_name, typename KernelType,
2600  int Dims, typename PropertiesT>
2601  void parallel_for_work_group(range<Dims> NumWorkGroups,
2602                               range<Dims> WorkGroupSize, PropertiesT Props,
2603  _KERNELFUNCPARAM(KernelFunc)) {
2604  parallel_for_work_group_lambda_impl<KernelName, KernelType, Dims,
2605  PropertiesT>(
2606  NumWorkGroups, WorkGroupSize, Props, KernelFunc);
2607  }
2608 
2609  // Clean up KERNELFUNC macro.
2610 #undef _KERNELFUNCPARAM
2611 
2612  // Explicit copy operations API
2613 
2621  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2622            access::target AccessTarget,
2623            access::placeholder IsPlaceholder = access::placeholder::false_t>
2624  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2625            std::shared_ptr<T_Dst> Dst) {
2626  if (Src.is_placeholder())
2627  checkIfPlaceholderIsBoundToHandler(Src);
2628 
2629  throwIfActionIsCreated();
2630  static_assert(isValidTargetForExplicitOp(AccessTarget),
2631  "Invalid accessor target for the copy method.");
2632  static_assert(isValidModeForSourceAccessor(AccessMode),
2633  "Invalid accessor mode for the copy method.");
2634    // Make sure the data the shared_ptr points to is not released until we
2635    // are done working with it.
2636  CGData.MSharedPtrStorage.push_back(Dst);
2637  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2638  copy(Src, RawDstPtr);
2639  }
2640 
2648  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2649            access::target AccessTarget,
2650            access::placeholder IsPlaceholder = access::placeholder::false_t>
2651  void
2652  copy(std::shared_ptr<T_Src> Src,
2653       accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2654  if (Dst.is_placeholder())
2655  checkIfPlaceholderIsBoundToHandler(Dst);
2656 
2657  throwIfActionIsCreated();
2658  static_assert(isValidTargetForExplicitOp(AccessTarget),
2659  "Invalid accessor target for the copy method.");
2660  static_assert(isValidModeForDestinationAccessor(AccessMode),
2661  "Invalid accessor mode for the copy method.");
2662  // TODO: Add static_assert with is_device_copyable when vec is
2663  // device-copyable.
2664    // Make sure the data the shared_ptr points to is not released until we
2665    // are done working with it.
2666  CGData.MSharedPtrStorage.push_back(Src);
2667  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2668  copy(RawSrcPtr, Dst);
2669  }
2670 
2678  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2679            access::target AccessTarget,
2680            access::placeholder IsPlaceholder = access::placeholder::false_t>
2681  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2682            T_Dst *Dst) {
2683  if (Src.is_placeholder())
2684  checkIfPlaceholderIsBoundToHandler(Src);
2685 
2686  throwIfActionIsCreated();
2687  static_assert(isValidTargetForExplicitOp(AccessTarget),
2688  "Invalid accessor target for the copy method.");
2689  static_assert(isValidModeForSourceAccessor(AccessMode),
2690  "Invalid accessor mode for the copy method.");
2691 #ifndef __SYCL_DEVICE_ONLY__
2692  if (MIsHost) {
2693  // TODO: Temporary implementation for host. Should be handled by memory
2694  // manager.
2695  copyAccToPtrHost(Src, Dst);
2696  return;
2697  }
2698 #endif
2699  setType(detail::CG::CopyAccToPtr);
2700 
2701    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2702    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2703 
2704  CGData.MRequirements.push_back(AccImpl.get());
2705  MSrcPtr = static_cast<void *>(AccImpl.get());
2706  MDstPtr = static_cast<void *>(Dst);
2707    // Store a copy of the accessor in local storage to make sure it stays
2708    // alive until we are done.
2709  CGData.MAccStorage.push_back(std::move(AccImpl));
2710  }
2711 
2719  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2720            access::target AccessTarget,
2721            access::placeholder IsPlaceholder = access::placeholder::false_t>
2722  void
2723  copy(const T_Src *Src,
2724       accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2725  if (Dst.is_placeholder())
2726  checkIfPlaceholderIsBoundToHandler(Dst);
2727 
2728  throwIfActionIsCreated();
2729  static_assert(isValidTargetForExplicitOp(AccessTarget),
2730  "Invalid accessor target for the copy method.");
2731  static_assert(isValidModeForDestinationAccessor(AccessMode),
2732  "Invalid accessor mode for the copy method.");
2733  // TODO: Add static_assert with is_device_copyable when vec is
2734  // device-copyable.
2735 #ifndef __SYCL_DEVICE_ONLY__
2736  if (MIsHost) {
2737  // TODO: Temporary implementation for host. Should be handled by memory
2738  // manager.
2739  copyPtrToAccHost(Src, Dst);
2740  return;
2741  }
2742 #endif
2743  setType(detail::CG::CopyPtrToAcc);
2744 
2745    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2746    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2747 
2748  CGData.MRequirements.push_back(AccImpl.get());
2749  MSrcPtr = const_cast<T_Src *>(Src);
2750  MDstPtr = static_cast<void *>(AccImpl.get());
2751    // Store a copy of the accessor in local storage to make sure it stays
2752    // alive until we are done.
2753  CGData.MAccStorage.push_back(std::move(AccImpl));
2754  }
2755 
2763  template <
2764  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2765  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2766      access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2767      access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2768      access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2769  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2770  IsPlaceholder_Src>
2771  Src,
2772  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2773  IsPlaceholder_Dst>
2774  Dst) {
2775  if (Src.is_placeholder())
2776  checkIfPlaceholderIsBoundToHandler(Src);
2777  if (Dst.is_placeholder())
2778  checkIfPlaceholderIsBoundToHandler(Dst);
2779 
2780  throwIfActionIsCreated();
2781  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2782  "Invalid source accessor target for the copy method.");
2783  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2784  "Invalid destination accessor target for the copy method.");
2785  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2786  "Invalid source accessor mode for the copy method.");
2787  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2788  "Invalid destination accessor mode for the copy method.");
2789  if (Dst.get_size() < Src.get_size())
2790  throw sycl::invalid_object_error(
2791  "The destination accessor size is too small to copy the memory into.",
2792  PI_ERROR_INVALID_OPERATION);
2793 
2794  if (copyAccToAccHelper(Src, Dst))
2795  return;
2796  setType(detail::CG::CopyAccToAcc);
2797 
2798  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2799  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2800 
2801  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2802  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2803 
2804  CGData.MRequirements.push_back(AccImplSrc.get());
2805  CGData.MRequirements.push_back(AccImplDst.get());
2806  MSrcPtr = AccImplSrc.get();
2807  MDstPtr = AccImplDst.get();
2808    // Store a copy of the accessor in local storage to make sure it stays
2809    // alive until we are done.
2810  CGData.MAccStorage.push_back(std::move(AccImplSrc));
2811  CGData.MAccStorage.push_back(std::move(AccImplDst));
2812  }
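 // Illustrative usage (not part of this header): copying between two buffers
 // through accessors; the destination must be at least as large as the source,
 // per the size check above:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     sycl::accessor Src{BufA, CGH, sycl::read_only};
 //     sycl::accessor Dst{BufB, CGH, sycl::write_only};
 //     CGH.copy(Src, Dst);
 //   });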
2813 
2818  template <typename T, int Dims, access::mode AccessMode,
2819            access::target AccessTarget,
2820            access::placeholder IsPlaceholder = access::placeholder::false_t>
2821  void
2822  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2823    if (Acc.is_placeholder())
2824  checkIfPlaceholderIsBoundToHandler(Acc);
2825 
2826  throwIfActionIsCreated();
2827  static_assert(isValidTargetForExplicitOp(AccessTarget),
2828  "Invalid accessor target for the update_host method.");
2829  setType(detail::CG::UpdateHost);
2830 
2831    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2832    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2833 
2834  MDstPtr = static_cast<void *>(AccImpl.get());
2835  CGData.MRequirements.push_back(AccImpl.get());
2836  CGData.MAccStorage.push_back(std::move(AccImpl));
2837  }
2838 
2839 public:
2848  template <typename T, int Dims, access::mode AccessMode,
2849            access::target AccessTarget,
2850            access::placeholder IsPlaceholder = access::placeholder::false_t,
2851            typename PropertyListT = property_list>
2852  void
2853  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2854           Dst,
2855       const T &Pattern) {
2856  assert(!MIsHost && "fill() should no longer be callable on a host device.");
2857 
2858  if (Dst.is_placeholder())
2859  checkIfPlaceholderIsBoundToHandler(Dst);
2860 
2861  throwIfActionIsCreated();
2862  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2863  // TODO add check:T must be an integral scalar value or a SYCL vector type
2864  static_assert(isValidTargetForExplicitOp(AccessTarget),
2865  "Invalid accessor target for the fill method.");
2866    // CG::Fill will result in piEnqueueFillBuffer/Image, which requires that
2867    // the memory data is contiguous. Thus we check range and offset when
2868    // Dims > 1. Images don't allow ranged accessors, so they are fine.
2869  if constexpr (isBackendSupportedFillSize(sizeof(T)) &&
2870  ((Dims <= 1) || isImageOrImageArray(AccessTarget))) {
2871  StageFillCG(Dst, Pattern);
2872  } else if constexpr (Dims == 0) {
2873  // Special case for zero-dim accessors.
2874  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2875  range<1>(1), [=](id<1>) { Dst = Pattern; });
2876  } else {
2877  // Dim > 1
2878  bool OffsetUsable = (Dst.get_offset() == sycl::id<Dims>{});
2879      detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2880      bool RangesUsable =
2881  (AccBase->getAccessRange() == AccBase->getMemoryRange());
2882  if (OffsetUsable && RangesUsable &&
2883  isBackendSupportedFillSize(sizeof(T))) {
2884  StageFillCG(Dst, Pattern);
2885  } else {
2886  range<Dims> Range = Dst.get_range();
2887  parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2888  Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
2889  }
2890  }
2891  }
2892 
2899  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2900  throwIfActionIsCreated();
2901  setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill);
2902  static_assert(is_device_copyable<T>::value,
2903  "Pattern must be device copyable");
2904  parallel_for<__usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2905  T *CastedPtr = static_cast<T *>(Ptr);
2906  CastedPtr[Index] = Pattern;
2907  });
2908  }
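 // Illustrative usage (not part of this header): filling a device-USM
 // allocation with a pattern; `Q` is an in-scope queue in this sketch:
 //
 //   int *Ptr = sycl::malloc_device<int>(N, Q);
 //   Q.submit([&](sycl::handler &CGH) { CGH.fill(Ptr, 42, N); });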
2909 
2913  void ext_oneapi_barrier() {
2914    throwIfActionIsCreated();
2915  setType(detail::CG::Barrier);
2916  }
2917 
2924  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2925 
2936  void memcpy(void *Dest, const void *Src, size_t Count);
2937 
2948  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2949  this->memcpy(Dest, Src, Count * sizeof(T));
2950  }
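 // Illustrative usage (not part of this header): memcpy() counts bytes while
 // copy() counts elements, so for `float *Src, *Dst` the two calls below are
 // equivalent:
 //
 //   CGH.memcpy(Dst, Src, N * sizeof(float));
 //   CGH.copy(Src, Dst, N);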
2951 
2959  void memset(void *Dest, int Value, size_t Count);
2960 
2967  void prefetch(const void *Ptr, size_t Count);
2968 
2975  void mem_advise(const void *Ptr, size_t Length, int Advice);
2976 
2993  template <typename T = unsigned char,
2994  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
2995  void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src,
2996  size_t SrcPitch, size_t Width, size_t Height);
2997 
3011  template <typename T>
3012  void ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest,
3013  size_t DestPitch, size_t Width, size_t Height);
3014 
3030  template <typename T = unsigned char,
3031  typename = std::enable_if_t<std::is_same_v<T, unsigned char>>>
3032  void ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value,
3033  size_t Width, size_t Height);
3034 
3047  template <typename T>
3048  void ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern,
3049  size_t Width, size_t Height);
3050 
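 // Illustrative usage (not part of this header): the 2D USM operations take a
 // pitch (row stride, in elements for copy2d/fill2d and in bytes for
 // memcpy2d/memset2d) in addition to the width and height of the region:
 //
 //   Q.submit([&](sycl::handler &CGH) {
 //     CGH.ext_oneapi_copy2d(Src, SrcPitch, Dst, DstPitch, Width, Height);
 //   });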
3059  template <typename T, typename PropertyListT>
3060  void memcpy(ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3061              const void *Src, size_t NumBytes = sizeof(T),
3062  size_t DestOffset = 0) {
3063  throwIfGraphAssociated<
3064  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3065  sycl_ext_oneapi_device_global>();
3066  if (sizeof(T) < DestOffset + NumBytes)
3068  "Copy to device_global is out of bounds.");
3069 
3070  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3071      ext::oneapi::experimental::device_image_scope_key>();
3072 
3073  if (!detail::isDeviceGlobalUsedInKernel(&Dest)) {
3074      // If the corresponding device_global isn't used in any kernels, we fall
3075      // back to doing the memory operation on the host only.
3076  memcpyToHostOnlyDeviceGlobal(&Dest, Src, sizeof(T), IsDeviceImageScoped,
3077  NumBytes, DestOffset);
3078  return;
3079  }
3080 
3081  memcpyToDeviceGlobal(&Dest, Src, IsDeviceImageScoped, NumBytes, DestOffset);
3082  }
3083 
3092  template <typename T, typename PropertyListT>
3093  void
3094  memcpy(void *Dest,
3095         const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3096  size_t NumBytes = sizeof(T), size_t SrcOffset = 0) {
3097  throwIfGraphAssociated<
3098  ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
3099  sycl_ext_oneapi_device_global>();
3100  if (sizeof(T) < SrcOffset + NumBytes)
3102  "Copy from device_global is out of bounds.");
3103 
3104  constexpr bool IsDeviceImageScoped = PropertyListT::template has_property<
3105      ext::oneapi::experimental::device_image_scope_key>();
3106 
3107    if (!detail::isDeviceGlobalUsedInKernel(&Src)) {
3108      // If the corresponding device_global isn't used in any kernels, we fall
3109      // back to doing the memory operation on the host only.
3110  memcpyFromHostOnlyDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3111  SrcOffset);
3112  return;
3113  }
3114 
3115  memcpyFromDeviceGlobal(Dest, &Src, IsDeviceImageScoped, NumBytes,
3116  SrcOffset);
3117  }
3118 
3128  template <typename T, typename PropertyListT>
3129  void copy(const std::remove_all_extents_t<T> *Src,
3130            ext::oneapi::experimental::device_global<T, PropertyListT> &Dest,
3131  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3132  size_t StartIndex = 0) {
3133  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3134  StartIndex * sizeof(std::remove_all_extents_t<T>));
3135  }
3136 
3147  template <typename T, typename PropertyListT>
3148  void
3149  copy(const ext::oneapi::experimental::device_global<T, PropertyListT> &Src,
3150       std::remove_all_extents_t<T> *Dest,
3151  size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t<T>),
3152  size_t StartIndex = 0) {
3153  this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t<T>),
3154  StartIndex * sizeof(std::remove_all_extents_t<T>));
3155  }
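 // Illustrative usage (not part of this header): copying to and from a
 // device_global; `DG` is a hypothetical namespace-scope
 // `sycl::ext::oneapi::experimental::device_global<int[4]>`:
 //
 //   int Host[4] = {0, 1, 2, 3};
 //   Q.submit([&](sycl::handler &CGH) { CGH.copy(Host, DG); }); // host -> DG
 //   Q.wait();
 //   Q.submit([&](sycl::handler &CGH) { CGH.copy(DG, Host); }); // DG -> host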
3159  void ext_oneapi_graph(ext::oneapi::experimental::command_graph<
3160                        ext::oneapi::experimental::graph_state::executable>
3161                            Graph);
3162 
3171  void ext_oneapi_copy(
3172      void *Src, ext::oneapi::experimental::image_mem_handle Dest,
3173      const ext::oneapi::experimental::image_descriptor &DestImgDesc);
3174 
3195  void ext_oneapi_copy(
3196      void *Src, sycl::range<3> SrcOffset, sycl::range<3> SrcExtent,
3197      ext::oneapi::experimental::image_mem_handle Dest,
3198      sycl::range<3> DestOffset,
3199      const ext::oneapi::experimental::image_descriptor &DestImgDesc,
3200      sycl::range<3> CopyExtent);
3201 
3211  void ext_oneapi_copy(
3212      ext::oneapi::experimental::image_mem_handle Src, void *Dest,
3213      const ext::oneapi::experimental::image_descriptor &SrcImgDesc);
3214 
3236  void
3237  ext_oneapi_copy(ext::oneapi::experimental::image_mem_handle Src,
3238  sycl::range<3> SrcOffset,
3239                  const ext::oneapi::experimental::image_descriptor &SrcImgDesc,
3240                  void *Dest, sycl::range<3> DestOffset,
3241  sycl::range<3> DestExtent, sycl::range<3> CopyExtent);
3242 
3253  void ext_oneapi_copy(
3254  void *Src, void *Dest,
3255  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3256  size_t DeviceRowPitch);
3257 
3265  void
3266  ext_oneapi_copy(ext::oneapi::experimental::image_mem_handle Src,
3267                  ext::oneapi::experimental::image_mem_handle Dest,
3268                  const ext::oneapi::experimental::image_descriptor &ImageDesc);
3269 
3292  void ext_oneapi_copy(
3293  void *Src, sycl::range<3> SrcOffset, void *Dest,
3294  sycl::range<3> DestOffset,
3295  const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
3296  size_t DeviceRowPitch, sycl::range<3> HostExtent,
3297  sycl::range<3> CopyExtent);
3298 
3299  // Submits a non-blocking wait on an external
3300  // semaphore to the queue.
3305  void ext_oneapi_wait_external_semaphore(
3306      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle);
3307 
3308  // Submits a non-blocking wait on an external
3309  // semaphore to the queue.
3316  void ext_oneapi_wait_external_semaphore(
3317      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
3318      uint64_t WaitValue);
3319 
3326  void ext_oneapi_signal_external_semaphore(
3327      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle);
3328 
3338  void ext_oneapi_signal_external_semaphore(
3339      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
3340      uint64_t SignalValue);
3341 
3342 private:
3343  std::shared_ptr<detail::handler_impl> MImpl;
3344  std::shared_ptr<detail::queue_impl> MQueue;
3345 
3350  mutable detail::CG::StorageInitHelper CGData;
3351  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
3352  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
3354  std::vector<detail::ArgDesc> MArgs;
3358  std::vector<detail::ArgDesc> MAssociatedAccesors;
3360  detail::NDRDescT MNDRDesc;
3361  detail::string MKernelName;
3363  std::shared_ptr<detail::kernel_impl> MKernel;
3369  void *MSrcPtr = nullptr;
3371  void *MDstPtr = nullptr;
3373  size_t MLength = 0;
3375  std::vector<char> MPattern;
3377  std::unique_ptr<detail::HostKernelBase> MHostKernel;
3379  std::unique_ptr<detail::HostTask> MHostTask;
3382  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
3383 
3385  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;
3388  std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
3389  MExecGraph;
3391  std::shared_ptr<ext::oneapi::experimental::detail::node_impl> MSubgraphNode;
3393  std::unique_ptr<detail::CG> MGraphNodeCG;
3394 
3395  bool MIsHost = false;
3396 
3397  detail::code_location MCodeLoc = {};
3398  bool MIsFinalized = false;
3399  event MLastEvent;
3400 
3401  // Make queue_impl class friend to be able to call finalize method.
3402  friend class detail::queue_impl;
3403  // Make accessor class friend to keep the list of associated accessors.
3404  template <typename DataT, int Dims, access::mode AccMode,
3406  typename PropertyListT>
3407  friend class accessor;
3409 
3410  template <typename DataT, int Dimensions, access::mode AccessMode,
3411            access::target AccessTarget, access::placeholder IsPlaceholder>
3412  friend class detail::image_accessor;
3413  // Make stream class friend to be able to keep the list of associated streams
3414  friend class stream;
3415  friend class detail::stream_impl;
3416  // Make reduction friends to store buffers and arrays created for it
3417  // in handler from reduction methods.
3418  template <typename T, class BinaryOperation, int Dims, size_t Extent,
3419  bool ExplicitIdentity, typename RedOutVar>
3420  friend class detail::reduction_impl_algo;
3421 
3423  template <class FunctorTy>
3424  friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);
3425 
3426  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3427            typename PropertiesT, typename... RestT>
3428  friend void detail::reduction_parallel_for(handler &CGH, range<Dims> Range,
3429                                             PropertiesT Properties,
3430                                             RestT... Rest);
3431 
3432  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
3433  typename PropertiesT, typename... RestT>
3434  friend void
3435  detail::reduction_parallel_for(handler &CGH, nd_range<Dims> Range,
3436                                 PropertiesT Properties, RestT... Rest);
3437 
3438 #ifndef __SYCL_DEVICE_ONLY__
3439  friend void detail::associateWithHandler(handler &,
3440                                           detail::AccessorBaseHost *,
3441                                           access::target);
3446 #endif
3447 
3448  friend class ::MockHandler;
3449  friend class detail::queue_impl;
3450 
3451  // Make pipe class friend to be able to call ext_intel_read/write_host_pipe
3452  // method.
3453  template <class _name, class _dataT, int32_t _min_capacity,
3454  class _propertiesT, class>
3455  friend class ext::intel::experimental::pipe;
3456 
3463  void ext_intel_read_host_pipe(const std::string &Name, void *Ptr, size_t Size,
3464  bool Block = false) {
3465  ext_intel_read_host_pipe(detail::string_view(Name), Ptr, Size, Block);
3466  }
3467  void ext_intel_read_host_pipe(detail::string_view Name, void *Ptr,
3468  size_t Size, bool Block = false);
3469 
3476  void ext_intel_write_host_pipe(const std::string &Name, void *Ptr,
3477  size_t Size, bool Block = false) {
3478  ext_intel_write_host_pipe(detail::string_view(Name), Ptr, Size, Block);
3479  }
3480  void ext_intel_write_host_pipe(detail::string_view Name, void *Ptr,
3481  size_t Size, bool Block = false);
3484 
3485  bool DisableRangeRounding();
3486 
3487  bool RangeRoundingTrace();
3488 
3489  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
3490  size_t &MinRange);
3491 
3492  template <typename WrapperT, typename TransformedArgType, int Dims,
3493  typename KernelType,
3494            std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
3495                KernelType, TransformedArgType>::value> * = nullptr>
3496  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3497  range<Dims> UserRange) {
3498  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
3499  KernelType>{UserRange, KernelFunc};
3500  }
3501 
3502  template <typename WrapperT, typename TransformedArgType, int Dims,
3503  typename KernelType,
3504  std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
3505  KernelType, TransformedArgType>::value> * = nullptr>
3506  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
3507  range<Dims> UserRange) {
3508  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
3509  UserRange, KernelFunc};
3510  }
3511 
3512  const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;
3513 
3514  // Checks if 2D memory operations are supported by the underlying platform.
3515  bool supportsUSMMemcpy2D();
3516  bool supportsUSMFill2D();
3517  bool supportsUSMMemset2D();
3518 
3519  // Helper function for getting a loose bound on work-items.
3520  id<2> computeFallbackKernelBounds(size_t Width, size_t Height);
3521 
3522  // Common function for launching a 2D USM memcpy kernel to avoid redefinitions
3523  // of the kernel from copy and memcpy.
3524  template <typename T>
3525  void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
3526  void *Dest, size_t DestPitch, size_t Width,
3527  size_t Height) {
3528    // Otherwise, the data is accessible on the device, so we do the operation
3529    // there instead.
3530    // Limit the number of work-items to stay resilient to large copies.
3531  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3532  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3533  parallel_for<__usmmemcpy2d<T>>(
3534  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3535  T *CastedDest = static_cast<T *>(Dest);
3536  const T *CastedSrc = static_cast<const T *>(Src);
3537  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3538  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3539  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3540  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3541  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3542  CastedSrc[adjustedIndex[0] * SrcPitch + adjustedIndex[1]];
3543  }
3544  }
3545  }
3546  });
3547  }
3548 
3549  // Common function for launching a 2D USM memcpy host-task to avoid
3550  // redefinitions of the kernel from copy and memcpy.
3551  template <typename T>
3552  void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
3553  void *Dest, size_t DestPitch,
3554  size_t Width, size_t Height) {
3555  // If both pointers are host USM or unknown (assumed non-USM) we use a
3556  // host-task to satisfy dependencies.
3557  host_task([=] {
3558  const T *CastedSrc = static_cast<const T *>(Src);
3559  T *CastedDest = static_cast<T *>(Dest);
3560  for (size_t I = 0; I < Height; ++I) {
3561  const T *SrcItBegin = CastedSrc + SrcPitch * I;
3562  T *DestItBegin = CastedDest + DestPitch * I;
3563  std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
3564  }
3565  });
3566  }
3567 
3568  // StageFillCG(): supporting function for fill().
3569  template <typename T, int Dims, access::mode AccessMode,
3570            access::target AccessTarget,
3571            access::placeholder IsPlaceholder = access::placeholder::false_t,
3572            typename PropertyListT = property_list>
3573  void StageFillCG(
3574  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3575  Dst,
3576  const T &Pattern) {
3577  setType(detail::CG::Fill);
3578  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
3579  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3580 
3581  MDstPtr = static_cast<void *>(AccImpl.get());
3582  CGData.MRequirements.push_back(AccImpl.get());
3583  CGData.MAccStorage.push_back(std::move(AccImpl));
3584 
3585  MPattern.resize(sizeof(T));
3586  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
3587  *PatternPtr = Pattern;
3588  }
3589 
3590  // Common function for launching a 2D USM fill kernel to avoid redefinitions
3591  // of the kernel from memset and fill.
3592  template <typename T>
3593  void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
3594  const T &Pattern, size_t Width,
3595  size_t Height) {
3596    // Otherwise, the data is accessible on the device, so we do the operation
3597    // there instead.
3598    // Limit the number of work-items to stay resilient to large fill operations.
3599  id<2> Chunk = computeFallbackKernelBounds(Height, Width);
3600  id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3601  parallel_for<__usmfill2d<T>>(
3602  range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
3603  T *CastedDest = static_cast<T *>(Dest);
3604  for (uint32_t I = 0; I < Iterations[0]; ++I) {
3605  for (uint32_t J = 0; J < Iterations[1]; ++J) {
3606  id<2> adjustedIndex = Index + Chunk * id<2>{I, J};
3607  if (adjustedIndex[0] < Height && adjustedIndex[1] < Width) {
3608  CastedDest[adjustedIndex[0] * DestPitch + adjustedIndex[1]] =
3609  Pattern;
3610  }
3611  }
3612  }
3613  });
3614  }
3615 
3616  // Common function for launching a 2D USM fill kernel or host_task to avoid
3617  // redefinitions of the kernel from memset and fill.
3618  template <typename T>
3619  void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
3620  const T &Pattern, size_t Width,
3621  size_t Height) {
3622  // If the pointer is host USM or unknown (assumed non-USM) we use a
3623  // host-task to satisfy dependencies.
3624  host_task([=] {
3625  T *CastedDest = static_cast<T *>(Dest);
3626  for (size_t I = 0; I < Height; ++I) {
3627  T *ItBegin = CastedDest + DestPitch * I;
3628  std::fill(ItBegin, ItBegin + Width, Pattern);
3629  }
3630  });
3631  }
3632 
3633  // Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
3634  void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
3635  size_t SrcPitch, size_t Width, size_t Height);
3636 
3637  // Untemplated version of ext_oneapi_fill2d using command for native 2D fill.
3638  void ext_oneapi_fill2d_impl(void *Dest, size_t DestPitch, const void *Value,
3639  size_t ValueSize, size_t Width, size_t Height);
3640 
3641  // Implementation of ext_oneapi_memset2d using command for native 2D memset.
3642  void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value,
3643  size_t Width, size_t Height);
3644 
3645  // Implementation of memcpy to device_global.
3646  void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src,
3647  bool IsDeviceImageScoped, size_t NumBytes,
3648  size_t Offset);
3649 
3650  // Implementation of memcpy from device_global.
3651  void memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3652  bool IsDeviceImageScoped, size_t NumBytes,
3653  size_t Offset);
3654 
3655  // Implementation of memcpy to an unregistered device_global.
3656  void memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
3657  const void *Src, size_t DeviceGlobalTSize,
3658  bool IsDeviceImageScoped, size_t NumBytes,
3659  size_t Offset);
3660 
3661  // Implementation of memcpy from an unregistered device_global.
3662  void memcpyFromHostOnlyDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
3663  bool IsDeviceImageScoped, size_t NumBytes,
3664  size_t Offset);
3665 
3666  template <typename T, int Dims, access::mode AccessMode,
3667            access::target AccessTarget,
3668            access::placeholder IsPlaceholder = access::placeholder::false_t,
3669            typename PropertyListT = property_list>
3670  void checkIfPlaceholderIsBoundToHandler(
3671  accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
3672  Acc) {
3673  auto *AccBase = reinterpret_cast<detail::AccessorBaseHost *>(&Acc);
3674  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
3675  detail::AccessorImplHost *Req = AccImpl.get();
3676  if (std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
3677  [&](const detail::ArgDesc &AD) {
3678  return AD.MType ==
3679  detail::kernel_param_kind_t::kind_accessor &&
3680  AD.MPtr == Req &&
3681  AD.MSize == static_cast<int>(AccessTarget);
3682  }) == MAssociatedAccesors.end())
3684  "placeholder accessor must be bound by calling "
3685  "handler::require() before it can be used.");
3686  }
3687 
3688  // Set the value of the GPU cache configuration for the kernel.
3689  void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig);
3690  // Set the value of the kernel-is-cooperative flag.
3691  void setKernelIsCooperative(bool);
3692 
3693  template <
3694      ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
3695  void throwIfGraphAssociated() const {
3696 
3697  if (getCommandGraph()) {
3698      std::string FeatureString =
3699          ext::oneapi::experimental::detail::UnsupportedFeatureToString(
3700              FeatureT);
3701      throw sycl::exception(sycl::make_error_code(errc::invalid),
3702                            "The " + FeatureString +
3703  " feature is not yet available "
3704  "for use with the SYCL Graph extension.");
3705  }
3706  }
3707 
3708  // Set that an ND Range was used during a call to parallel_for
3709  void setNDRangeUsed(bool Value);
3710 
3711  inline void internalProfilingTagImpl() {
3712  throwIfActionIsCreated();
3713  setType(detail::CG::ProfilingTag);
3714  }
3715 
3717 
3718 protected:
3720  void depends_on(const detail::EventImplPtr &Event);
3722  void depends_on(const std::vector<detail::EventImplPtr> &Events);
3723 };
3724 
3725 namespace detail {
3726 class HandlerAccess {
3727 public:
3728  static void internalProfilingTagImpl(handler &Handler) {
3729  Handler.internalProfilingTagImpl();
3730  }
3731 };
3732 } // namespace detail
3733 
3734 } // namespace _V1
3735 } // namespace sycl
3736 
3737 #ifdef __SYCL_BUILD_SYCL_DLL
3738 // The following fails (somewhat expectedly) when compiled with MSVC:
3739 //
3740 // #include <memory>
3741 // struct __declspec(dllexport) handler {
3742 // std::unique_ptr<struct Incomplete> Member;
3743 // };
3744 //
3745 // We do __SYCL_EXPORT sycl::handler class and it has an
3746 // std::unique_ptr<detail::HostTask> member. As such, ensure the type is
3747 // complete if we're building the SYCL shared library.
3748 #include <sycl/detail/host_task_impl.hpp>
3749 #endif
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:167
CGTYPE
Type of the command group.
Definition: cg.hpp:56
static void internalProfilingTagImpl(handler &Handler)
Definition: handler.hpp:3728
RoundedRangeIDGenerator(const id< Dims > &Id, const range< Dims > &UserRange, const range< Dims > &RoundedRange)
Definition: handler.hpp:334
void operator()(item< Dims > It, kernel_handler KH) const
Definition: handler.hpp:396
void operator()(item< Dims > It) const
Definition: handler.hpp:381
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:44
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
Implementation details of command_graph<modifiable>.
Definition: graph_impl.hpp:849
Command group handler class.
Definition: handler.hpp:462
void fill(void *Ptr, const T &Pattern, size_t Count)
Fills the specified memory with the specified pattern.
Definition: handler.hpp:2899
void parallel_for(range< 2 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2177
void parallel_for(kernel Kernel, range< Dims > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range.
Definition: handler.hpp:2267
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:2038
void copy(const std::remove_all_extents_t< T > *Src, ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a USM memory region to a device_global.
Definition: handler.hpp:3129
void parallel_for(nd_range< Dims > NDRange, kernel Kernel)
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:2215
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2044
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2601
std::enable_if_t<(sizeof...(RestT) > 1) &&detail::AreAllButLastReductions< RestT... >::value &&ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2522
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2624
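A sketch of the accessor-to-shared_ptr copy; the runtime keeps the shared_ptr alive until the copy completes (names are illustrative):

#include <sycl/sycl.hpp>
#include <memory>

int main() {
  constexpr size_t N = 64;
  sycl::queue q;
  sycl::buffer<int, 1> buf(sycl::range<1>(N));

  // Host destination managed by shared_ptr; the handler copies buf's
  // contents into it once the command group executes.
  std::shared_ptr<int> host(new int[N], std::default_delete<int[]>());

  q.submit([&](sycl::handler &cgh) {
    sycl::accessor src(buf, cgh, sycl::read_only);
    cgh.copy(src, host);
  });
  q.wait();
}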
std::enable_if_t<(sizeof...(RestT) > 1) && detail::AreAllButLastReductions< RestT... >::value && ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2572
void copy(const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, std::remove_all_extents_t< T > *Dest, size_t Count=sizeof(T)/sizeof(std::remove_all_extents_t< T >), size_t StartIndex=0)
Copies elements of type std::remove_all_extents_t<T> from a device_global to a USM memory region.
Definition: handler.hpp:3149
void parallel_for(range< 3 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2058
handler & operator=(handler &&)=delete
std::enable_if_t<(sizeof...(RestT) > 1) && detail::AreAllButLastReductions< RestT... >::value && ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > Range, PropertiesT Properties, RestT &&...Rest)
Definition: handler.hpp:2535
handler(handler &&)=delete
void copy(accessor< T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src, IsPlaceholder_Src > Src, accessor< T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst, IsPlaceholder_Dst > Dst)
Copies the content of memory object accessed by Src to the memory object accessed by Dst.
Definition: handler.hpp:2769
void copy(const T *Src, T *Dest, size_t Count)
Copies data from one memory region to another, where each region is either a host pointer or a pointer within a USM allocation.
Definition: handler.hpp:2948
void parallel_for_work_group(range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group.
Definition: handler.hpp:2146
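A sketch of the hierarchical form: the outer lambda runs once per work-group, and parallel_for_work_item fans out across the work-items of that group (sizes are illustrative):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.submit([&](sycl::handler &cgh) {
    cgh.parallel_for_work_group(
        sycl::range<1>(4),  // number of work-groups
        sycl::range<1>(64), // work-items per group
        [=](sycl::group<1> g) {
          // Work-group scope: executed once per group.
          g.parallel_for_work_item([&](sycl::h_item<1> item) {
            // Work-item scope: executed once per work-item.
            (void)item.get_global_id();
          });
        });
  });
  q.wait();
}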
void parallel_for(range< 3 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2181
void parallel_for(range< 1 > NumWorkItems, kernel Kernel)
Definition: handler.hpp:2173
void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function.
Definition: handler.hpp:2233
handler & operator=(const handler &)=delete
void parallel_for(kernel Kernel, nd_range< Dims > NDRange, _KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function for the specified range and offsets.
Definition: handler.hpp:2345
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, range< Dims > WorkGroupSize, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2427
void require(accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder > Acc)
Requires access to the memory object associated with the placeholder accessor.
Definition: handler.hpp:1935
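A sketch of binding a placeholder accessor via require (names are illustrative):

#include <sycl/sycl.hpp>

int main() {
  constexpr size_t N = 32;
  sycl::queue q;
  sycl::buffer<int, 1> buf(sycl::range<1>(N));

  // Constructed without a handler, so this accessor is a placeholder.
  sycl::accessor acc(buf);

  q.submit([&](sycl::handler &cgh) {
    cgh.require(acc); // register the placeholder with this command group
    cgh.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { acc[i] = 0; });
  });
  q.wait();
}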
void fill(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT > Dst, const T &Pattern)
Fills the memory pointed to by the accessor with the given pattern.
Definition: handler.hpp:2853
void update_host(accessor< T, Dims, AccessMode, AccessTarget, IsPlaceholder > Acc)
Provides guarantees that the memory object accessed via Acc is updated on the host after command group execution.
Definition: handler.hpp:2822
void set_arg(int argIndex, ext::oneapi::experimental::dynamic_parameter< T > &dynamicParam)
Definition: handler.hpp:2016
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 2 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2476
std::is_same< remove_cv_ref_t< U >, remove_cv_ref_t< T > > is_same_type
Definition: handler.hpp:1975
std::enable_if_t< ShouldEnableSetArg< T >::value, void > set_arg(int ArgIndex, T &&Arg)
Sets argument for OpenCL interoperability kernels.
Definition: handler.hpp:1997
void parallel_for_work_group(range< Dims > NumWorkGroups, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2592
void require(ext::oneapi::experimental::dynamic_parameter< accessor< DataT, Dims, AccMode, AccTarget, isPlaceholder >> dynamicParamAcc)
Requires access to the memory object associated with the placeholder accessor contained in a dynamic_parameter.
Definition: handler.hpp:1951
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2457
void single_task(kernel Kernel)
Invokes a SYCL kernel.
Definition: handler.hpp:2160
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(nd_range< Dims > Range, RestT &&...Rest)
Definition: handler.hpp:2582
void copy(const T_Src *Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of memory pointed by Src into the memory object accessed by Dst.
Definition: handler.hpp:2723
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 3 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2486
void copy(std::shared_ptr< T_Src > Src, accessor< T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder > Dst)
Copies the content of memory pointed by Src into the memory object accessed by Dst.
Definition: handler.hpp:2652
handler(const handler &)=delete
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, T_Dst *Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2681
typename std::remove_cv_t< std::remove_reference_t< T > > remove_cv_ref_t
Definition: handler.hpp:1972
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(nd_range< Dims > Range, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2496
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 1 > Range, RestT &&...Rest)
Definition: handler.hpp:2544
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 2 > Range, RestT &&...Rest)
Definition: handler.hpp:2552
std::enable_if_t< detail::AreAllButLastReductions< RestT... >::value > parallel_for(range< 3 > Range, RestT &&...Rest)
Definition: handler.hpp:2560
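The overloads above accept one or more reduction objects ahead of the kernel functor. A minimal sum-reduction sketch (names are illustrative):

#include <sycl/sycl.hpp>

int main() {
  constexpr size_t N = 1024;
  sycl::queue q;
  int *sum = sycl::malloc_shared<int>(1, q);
  *sum = 0;

  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(sum, sycl::plus<int>());
    // The reducer reference arrives as the trailing kernel argument.
    cgh.parallel_for(sycl::range<1>(N), red,
                     [=](sycl::id<1>, auto &acc) { acc += 1; });
  });
  q.wait(); // *sum == N

  sycl::free(sum, q);
}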
void parallel_for_work_group(kernel Kernel, range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel.
Definition: handler.hpp:2388
void ext_oneapi_barrier()
Prevents any commands submitted afterward to this queue from executing until all commands previously submitted to this queue have entered the complete state.
Definition: handler.hpp:2913
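ext_oneapi_barrier is a oneAPI extension; on an out-of-order queue it orders later submissions after everything already submitted. A short sketch (kernel bodies elided):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q; // out-of-order by default
  q.submit([&](sycl::handler &cgh) { cgh.single_task([] {}); });
  q.submit([&](sycl::handler &cgh) { cgh.single_task([] {}); });

  // All later commands wait for the two kernels above to complete.
  q.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); });

  q.submit([&](sycl::handler &cgh) { cgh.single_task([] {}); });
  q.wait();
}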
std::enable_if_t< ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2466
void memcpy(void *Dest, const ext::oneapi::experimental::device_global< T, PropertyListT > &Src, size_t NumBytes=sizeof(T), size_t SrcOffset=0)
Copies data from a device_global to USM memory.
Definition: handler.hpp:3094
void parallel_for_work_group(range< Dims > NumWorkGroups, _KERNELFUNCPARAM(KernelFunc))
Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group.
Definition: handler.hpp:2125
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value || detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:2070
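host_task accepts either a void() callable or one taking an interop_handle. A minimal void() sketch, chained after a device kernel (names are illustrative):

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  sycl::event e = q.submit([&](sycl::handler &cgh) {
    cgh.single_task([] { /* device work */ });
  });

  q.submit([&](sycl::handler &cgh) {
    cgh.depends_on(e);
    // Runs on the host, scheduled like any other command group.
    cgh.host_task([] { std::cout << "device kernel finished\n"; });
  });
  q.wait();
}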
void set_specialization_constant(typename std::remove_reference_t< decltype(SpecName)>::value_type Value)
Definition: handler.hpp:1892
void set_args(Ts &&...Args)
Sets arguments for OpenCL interoperability kernels.
Definition: handler.hpp:2026
void memcpy(ext::oneapi::experimental::device_global< T, PropertyListT > &Dest, const void *Src, size_t NumBytes=sizeof(T), size_t DestOffset=0)
Copies data from a USM memory region to a device_global.
Definition: handler.hpp:3060
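device_global belongs to the experimental sycl_ext_oneapi_device_global extension. A sketch of round-tripping a value through the two memcpy overloads above (names are illustrative):

#include <sycl/sycl.hpp>

namespace exp = sycl::ext::oneapi::experimental;

// Namespace-scope device allocation holding one int per device image.
exp::device_global<int> DevValue;

int main() {
  sycl::queue q;
  int in = 7, out = 0;

  // Host -> device_global; NumBytes defaults to sizeof(int).
  q.submit([&](sycl::handler &cgh) { cgh.memcpy(DevValue, &in); }).wait();

  // device_global -> host.
  q.submit([&](sycl::handler &cgh) { cgh.memcpy(&out, DevValue); }).wait();
}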
void parallel_for(range< 2 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2051
void set_arg(int ArgIndex, accessor< DataT, Dims, AccessMode, AccessTarget, IsPlaceholder > Arg)
Definition: handler.hpp:2004
std::enable_if_t<(sizeof...(RestT) > 1) && detail::AreAllButLastReductions< RestT... >::value && ext::oneapi::experimental::is_property_list< PropertiesT >::value > parallel_for(range< 1 > Range, PropertiesT Properties, RestT &&...Rest)
Reductions.
Definition: handler.hpp:2509
std::remove_reference_t< decltype(SpecName)>::value_type get_specialization_constant() const
Definition: handler.hpp:1907
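The setter/getter pair above is used with a specialization_id declared at namespace scope; a kernel reads the value through a kernel_handler. A minimal sketch (names are illustrative):

#include <sycl/sycl.hpp>

constexpr sycl::specialization_id<int> WidthId(16); // default value 16

int main() {
  sycl::queue q;
  q.submit([&](sycl::handler &cgh) {
    cgh.set_specialization_constant<WidthId>(32); // override the default
    cgh.single_task([=](sycl::kernel_handler kh) {
      int width = kh.get_specialization_constant<WidthId>();
      (void)width; // specialized at JIT time where supported
    });
  });
  q.wait();
}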
void set_arg(int ArgIndex, local_accessor< DataT, Dims > Arg)
Definition: handler.hpp:2010
A unique identifier of an item in an index space.
Definition: id.hpp:36
Identifies an instance of the function object executing at each point in a range.
Definition: item.hpp:37
id< Dimensions > get_id() const
Definition: item.hpp:55
range< Dimensions > get_range() const
Definition: item.hpp:69
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:76
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
id< Dimensions > get_global_id() const
Definition: nd_item.hpp:52
id< Dimensions > get_offset() const
Definition: nd_item.hpp:197
range< Dimensions > get_global_range() const
Definition: nd_item.hpp:164
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
Objects of the property_list class are containers for the SYCL properties.
Defines the iteration domain of either a single work-group in a parallel dispatch, or the overall dimensions of the dispatch.
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
#define _KERNELFUNCPARAMTYPE
Definition: handler.hpp:76
#define __SYCL_KERNEL_ATTR__
Definition: handler.hpp:1588
#define _KERNELFUNCPARAM(a)
Definition: handler.hpp:80
__SYCL_EXTERN_STREAM_ATTRS ostream cout
Linked to standard output.
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1179
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1178
void * getValueFromDynamicParameter(ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase)
Definition: handler.cpp:75
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:17
size_t getLinearIndex(const T< Dims > &Index, const U< Dims > &Range)
Definition: common.hpp:292
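getLinearIndex flattens a multi-dimensional id into a row-major offset. An illustrative 2-D equivalent (not the in-tree implementation):

#include <cstddef>

// Row-major flattening of a 2-D index: the last dimension varies fastest.
inline size_t linear_index_2d(size_t id0, size_t id1, size_t range1) {
  return id0 * range1 + id1;
}
// Example: id {3, 5} inside range {8, 16} -> 3 * 16 + 5 = 53.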
decltype(member_ptr_helper(&F::operator())) argument_helper(int)
Definition: handler.hpp:199
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:313
static Arg member_ptr_helper(RetType(Func::*)(Arg) const)
bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr)
Definition: handler.cpp:40
static std::enable_if_t< std::is_unsigned_v< T >, bool > multiply_with_overflow_check(T &dst, T x, T y)
Definition: handler.hpp:413
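An illustrative way to implement such a check in portable C++ (the in-tree helper may differ in detail):

#include <limits>
#include <type_traits>

template <typename T>
static std::enable_if_t<std::is_unsigned_v<T>, bool>
multiply_with_overflow_check(T &dst, T x, T y) {
  dst = x * y;                   // unsigned wrap-around is well defined
  return y == 0 || dst / y == x; // true when the product did not overflow
}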
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::enable_if_t< std::is_same_v< T, range< Dims > > || std::is_same_v< T, id< Dims > > > checkValueRange(const T &V)
Definition: handler.hpp:277
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType, ArgType >::value > runKernelWithArg(KernelType KernelName, ArgType Arg)
Definition: cg_types.hpp:212
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:43
void markBufferAsInternal(const std::shared_ptr< buffer_impl > &BufImpl)
Definition: helpers.cpp:70
std::enable_if_t< KernelLambdaHasKernelHandlerArgT< KernelType >::value > runKernelWithoutArg(KernelType KernelName)
Definition: cg_types.hpp:198
std::shared_ptr< LocalAccessorImplHost > LocalAccessorImplPtr
Definition: accessor.hpp:583
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
bool range_size_fits_in_size_t(const range< Dims > &r)
Definition: handler.hpp:418
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2724
decltype(argument_helper< F, SuggestedArgType >(0)) lambda_arg_type
Definition: handler.hpp:205
std::shared_ptr< AccessorImplHost > AccessorImplPtr
Definition: accessor.hpp:514
const char * UnsupportedFeatureToString(UnsupportedGraphFeatures Feature)
Definition: graph.hpp:60
typename merged_properties< LHSPropertiesT, RHSPropertiesT >::type merged_properties_t
Definition: properties.hpp:267
properties< std::tuple< PropertyValueTs... > > properties_t
Definition: properties.hpp:254
void mem_advise(handler &CGH, void *Ptr, size_t NumBytes, int Advice)
@ executable
In executable state, the graph is ready to execute.
void copy(handler &CGH, const T *Src, T *Dest, size_t Count)
static constexpr bool has_property()
static constexpr auto get_property()
void single_task(handler &CGH, const KernelType &KernelObj)
properties< std::tuple<> > empty_properties_t
Definition: properties.hpp:234
void fill(sycl::handler &CGH, T *Ptr, const T &Pattern, size_t Count)
void parallel_for(handler &CGH, range< Dimensions > Range, const KernelType &KernelObj, ReductionsT &&...Reductions)
image_target
Definition: access.hpp:74
ext::intel::pipe< name, dataT, min_capacity > pipe
Definition: pipes.hpp:18
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:93
using value_type = const void
Definition: multi_ptr.hpp:457
_pi_kernel_cache_config
Definition: pi.h:850
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
Definition: pi.h:856
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
Definition: pi.h:854
C++ wrapper of extern "C" PI interfaces.
Predicate returning true if all template type parameters except the last one are reductions.
std::vector< detail::AccessorImplPtr > MAccStorage
Storage for accessors.
Definition: cg.hpp:102
std::vector< AccessorImplHost * > MRequirements
List of requirements that specify which memory is needed for the command group to be executed.
Definition: cg.hpp:108
static constexpr const char * getName()
Definition: kernel_desc.hpp:84
A struct to describe the properties of an image.
is_device_copyable is a user specializable class template to indicate that a type T is device copyable.