DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
handler.hpp
//==-------- handler.hpp --- SYCL command group handler --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <CL/sycl/accessor.hpp>
#include <CL/sycl/context.hpp>
#include <CL/sycl/detail/cg.hpp>
#include <CL/sycl/event.hpp>
#include <CL/sycl/id.hpp>
#include <CL/sycl/item.hpp>
#include <CL/sycl/kernel.hpp>
#include <CL/sycl/nd_item.hpp>
#include <CL/sycl/nd_range.hpp>
#include <CL/sycl/sampler.hpp>
#include <CL/sycl/stl.hpp>

#include <functional>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>

// SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision
#if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001
#define __SYCL_NONCONST_FUNCTOR__
#endif

template <typename DataT, int Dimensions, cl::sycl::access::mode AccessMode,
          cl::sycl::access::target AccessTarget,
          cl::sycl::access::placeholder IsPlaceholder>
class __fill;

template <typename T> class __usmfill;

template <typename T_Src, typename T_Dst, int Dims,
          cl::sycl::access::mode AccessMode,
          cl::sycl::access::target AccessTarget,
          cl::sycl::access::placeholder IsPlaceholder>
class __copyAcc2Ptr;

template <typename T_Src, typename T_Dst, int Dims,
          cl::sycl::access::mode AccessMode,
          cl::sycl::access::target AccessTarget,
          cl::sycl::access::placeholder IsPlaceholder>
class __copyPtr2Acc;

template <typename T_Src, int Dims_Src, cl::sycl::access::mode AccessMode_Src,
          cl::sycl::access::target AccessTarget_Src, typename T_Dst,
          int Dims_Dst, cl::sycl::access::mode AccessMode_Dst,
          cl::sycl::access::target AccessTarget_Dst,
          cl::sycl::access::placeholder IsPlaceholder_Src,
          cl::sycl::access::placeholder IsPlaceholder_Dst>
class __copyAcc2Acc;

// For unit testing purposes
class MockHandler;

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {

// Forward declarations
class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;

namespace detail {

class kernel_impl;
class queue_impl;
class stream_impl;
template <typename DataT, int Dimensions, access::mode AccessMode,
          access::target AccessTarget, access::placeholder IsPlaceholder>
class image_accessor;

template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg) const);

// Non-const version of the above template to match functors whose 'operator()'
// is declared without the 'const' qualifier.
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg));

// template <typename RetType, typename Func>
// static void member_ptr_helper(RetType (Func::*)() const);

// template <typename RetType, typename Func>
// static void member_ptr_helper(RetType (Func::*)());

template <typename F, typename SuggestedArgType>
decltype(member_ptr_helper(&F::operator())) argument_helper(int);

template <typename F, typename SuggestedArgType>
SuggestedArgType argument_helper(...);

// Deduces the kernel argument type from F::operator() when it exists;
// otherwise falls back to SuggestedArgType.
template <typename F, typename SuggestedArgType>
using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));

// Used when parallel_for range is rounded up.
template <typename Name> class __pf_kernel_wrapper;

template <typename Type> struct get_kernel_wrapper_name_t {
  using name = __pf_kernel_wrapper<Type>;
};

__SYCL_EXPORT device getDeviceFromHandler(handler &);

#if __SYCL_ID_QUERIES_FIT_IN_INT__
template <typename T> struct NotIntMsg;

template <int Dims> struct NotIntMsg<range<Dims>> {
  constexpr static const char *Msg =
      "Provided range is out of integer limits. Pass "
      "`-fno-sycl-id-queries-fit-in-int' to disable range check.";
};

template <int Dims> struct NotIntMsg<id<Dims>> {
  constexpr static const char *Msg =
      "Provided offset is out of integer limits. Pass "
      "`-fno-sycl-id-queries-fit-in-int' to disable offset check.";
};
#endif

#if __SYCL_ID_QUERIES_FIT_IN_INT__
template <typename T, typename ValT>
typename detail::enable_if_t<std::is_same<ValT, size_t>::value ||
                             std::is_same<ValT, unsigned long long>::value>
checkValueRangeImpl(ValT V) {
  static constexpr size_t Limit =
      static_cast<size_t>((std::numeric_limits<int>::max)());
  if (V > Limit)
    throw runtime_error(NotIntMsg<T>::Msg, PI_INVALID_VALUE);
}
#endif

template <int Dims, typename T>
typename detail::enable_if_t<std::is_same<T, range<Dims>>::value ||
                             std::is_same<T, id<Dims>>::value>
checkValueRange(const T &V) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  for (size_t Dim = 0; Dim < Dims; ++Dim)
    checkValueRangeImpl<T>(V[Dim]);

  {
    unsigned long long Product = 1;
    for (size_t Dim = 0; Dim < Dims; ++Dim) {
      Product *= V[Dim];
      // Check the running product now to prevent overflow at the end.
      checkValueRangeImpl<T>(Product);
    }
  }
#else
  (void)V;
#endif
}

template <int Dims>
void checkValueRange(const range<Dims> &R, const id<Dims> &O) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(R);
  checkValueRange<Dims>(O);

  for (size_t Dim = 0; Dim < Dims; ++Dim) {
    unsigned long long Sum = R[Dim] + O[Dim];

    checkValueRangeImpl<range<Dims>>(Sum);
  }
#else
  (void)R;
  (void)O;
#endif
}

template <int Dims, typename T>
typename detail::enable_if_t<std::is_same<T, nd_range<Dims>>::value>
checkValueRange(const T &V) {
#if __SYCL_ID_QUERIES_FIT_IN_INT__
  checkValueRange<Dims>(V.get_global_range());
  checkValueRange<Dims>(V.get_local_range());
  checkValueRange<Dims>(V.get_offset());

  checkValueRange<Dims>(V.get_global_range(), V.get_offset());
#else
  (void)V;
#endif
}
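
// Worked example (illustrative, not from the original header): with
// __SYCL_ID_QUERIES_FIT_IN_INT__ enabled, range<2>{100000, 100000} passes the
// per-dimension check above (each value is below INT_MAX = 2147483647), but
// the running-product check throws, since 100000 * 100000 = 10^10 > INT_MAX.
// Checking the product after each dimension catches the overflow before the
// full product itself can wrap around.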

// Used when a parallel_for range is rounded up: checks the index against the
// original range and calls the user's kernel only for in-range work-items.
template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernel {
public:
  RoundedRangeKernel(range<Dims> NumWorkItems, KernelType KernelFunc)
      : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}

  void operator()(TransformedArgType Arg) const {
    if (Arg[0] >= NumWorkItems[0])
      return;
    Arg.set_allowed_range(NumWorkItems);
    KernelFunc(Arg);
  }

private:
  range<Dims> NumWorkItems;
  KernelType KernelFunc;
};

// Same as RoundedRangeKernel, but for kernels that additionally take a
// kernel_handler argument.
template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernelWithKH {
public:
  RoundedRangeKernelWithKH(range<Dims> NumWorkItems, KernelType KernelFunc)
      : NumWorkItems(NumWorkItems), KernelFunc(KernelFunc) {}

  void operator()(TransformedArgType Arg, kernel_handler KH) const {
    if (Arg[0] >= NumWorkItems[0])
      return;
    Arg.set_allowed_range(NumWorkItems);
    KernelFunc(Arg, KH);
  }

private:
  range<Dims> NumWorkItems;
  KernelType KernelFunc;
};

} // namespace detail

namespace ext {
namespace oneapi {
namespace detail {
template <typename T, class BinaryOperation, int Dims, bool IsUSM,
          access::placeholder IsPlaceholder>
class reduction_impl;

using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
                size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
                Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                   const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu);

template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
              Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims,
          typename... Reductions, size_t... Is>
void reduCGFunc(handler &CGH, KernelType KernelFunc,
                const nd_range<Dims> &Range,
                std::tuple<Reductions...> &ReduTuple,
                std::index_sequence<Is...>);

template <typename KernelName, typename KernelType, typename... Reductions,
          size_t... Is>
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...>);

template <typename KernelName, class Reduction>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);

template <typename KernelName, class Reduction>
std::enable_if_t<Reduction::is_usm>
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu);

template <typename... Reduction, size_t... Is>
std::shared_ptr<event>
reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
                             bool IsHost, std::tuple<Reduction...> &ReduTuple,
                             std::index_sequence<Is...>);

template <typename Reduction, typename... RestT>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
                                   std::shared_ptr<detail::queue_impl> Queue,
                                   bool IsHost, Reduction &Redu, RestT... Rest);

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);

__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem);

template <typename... ReductionT, size_t... Is>
size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
                             std::index_sequence<Is...>);

template <typename TupleT, std::size_t... Is>
std::tuple<std::tuple_element_t<Is, TupleT>...>
tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);

template <typename FirstT, typename... RestT> struct AreAllButLastReductions;

} // namespace detail
} // namespace oneapi
} // namespace ext

namespace __SYCL2020_DEPRECATED("use 'ext::oneapi' instead") ONEAPI {
using namespace ext::oneapi;
}

/// Command group handler class.
///
/// Objects of the handler class collect information about the command group,
/// such as the kernel, memory requirements, and kernel arguments. A handler
/// is created and passed to the user's command-group function by
/// queue::submit(); it cannot be constructed directly by the user.
class __SYCL_EXPORT handler {
private:
  /// Constructs a SYCL handler from a queue.
  ///
  /// \param Queue is a SYCL queue.
  /// \param IsHost indicates whether the queue is a SYCL host queue.
  handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);

  /// Stores a copy of \p Arg in MArgsStorage and returns a pointer to it.
  template <typename T, typename F = typename detail::remove_const_t<
                            typename detail::remove_reference_t<T>>::type>
  F *storePlainArg(T &&Arg) {
    MArgsStorage.emplace_back(sizeof(T));
    auto Storage = reinterpret_cast<F *>(MArgsStorage.back().data());
    *Storage = Arg;
    return Storage;
  }

  void setType(detail::CG::CGTYPE Type) {
    constexpr detail::CG::CG_VERSION Version = detail::CG::CG_VERSION::V1;
    MCGType = static_cast<detail::CG::CGTYPE>(
        getVersionedCGType(Type, static_cast<int>(Version)));
  }

  detail::CG::CGTYPE getType() {
    return static_cast<detail::CG::CGTYPE>(getUnversionedCGType(MCGType));
  }

  void throwIfActionIsCreated() {
    if (detail::CG::None != getType())
      throw sycl::runtime_error("Attempt to set multiple actions for the "
                                "command group. Command group must consist of "
                                "a single kernel or explicit memory operation.",
                                CL_INVALID_OPERATION);
  }
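
  // Illustrative sketch (not part of this header): the single-action rule
  // enforced above means a command group may contain at most one kernel or
  // explicit memory operation. Assuming a queue 'Q':
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.single_task<class FirstAction>([=] {});
  //     // The next call would throw sycl::runtime_error, because an action
  //     // has already been created for this command group:
  //     // CGH.single_task<class SecondAction>([=] {});
  //   });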

  /// Extracts and prepares kernel arguments from the lambda using its
  /// integration header description.
  void
  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                               const detail::kernel_param_desc_t *KernelArgs);

  /// Extracts and prepares kernel arguments from the lambda using its
  /// integration header description, with ESIMD support.
  void
  extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                               const detail::kernel_param_desc_t *KernelArgs,
                               bool IsESIMD);

  void extractArgsAndReqs();

  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                  const int Size, const size_t Index, size_t &IndexShift,
                  bool IsKernelCreatedFromSource);

  void processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
                  const int Size, const size_t Index, size_t &IndexShift,
                  bool IsKernelCreatedFromSource, bool IsESIMD);

  /// \return a string containing the name of the SYCL kernel.
  std::string getKernelName();

  template <typename LambdaNameT> bool lambdaAndKernelHaveEqualName() {
    // TODO It is unclear whether the names of a kernel and a lambda/functor
    // must be equal when parallel_for is called with a sycl::kernel and a
    // lambda/functor together. For now, if they are equal, we extract the
    // arguments from the lambda/functor for the kernel; otherwise set_arg(s)
    // must be used to resolve the order and values of the kernel arguments.
    assert(MKernel && "MKernel is not initialized");
    const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
    const std::string KernelName = getKernelName();
    return LambdaName == KernelName;
  }

  /// Saves the location of user's code passed in \p CodeLoc for future usage.
  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }

  /// Constructs a CG object of a specific type, passes it to the scheduler,
  /// and returns a SYCL event representing the command group.
  event finalize();

  /// Saves a stream associated with this handler.
  ///
  /// Streams are then forwarded to the scheduler and flushed when the command
  /// group finishes.
  void addStream(const std::shared_ptr<detail::stream_impl> &Stream) {
    MStreamStorage.push_back(Stream);
  }

  /// Saves a reduction object so that it lives at least as long as this
  /// handler.
  void addReduction(const std::shared_ptr<const void> &ReduObj) {
    MSharedPtrStorage.push_back(ReduObj);
  }

  ~handler() = default;

  bool is_host() { return MIsHost; }

  void associateWithHandler(detail::AccessorBaseHost *AccBase,
                            access::target AccTarget);

  // Recursively calls itself until the argument pack is fully processed.
  // The version for a regular (standard layout) argument.
  template <typename T, typename... Ts>
  void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... Args) {
    set_arg(ArgIndex, std::move(Arg));
    setArgsHelper(++ArgIndex, std::move(Args)...);
  }

  void setArgsHelper(int) {}

  // setArgHelper for local accessor argument.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::placeholder IsPlaceholder>
  void setArgHelper(int ArgIndex,
                    accessor<DataT, Dims, AccessMode, access::target::local,
                             IsPlaceholder> &&Arg) {
    detail::LocalAccessorBaseHost *LocalAccBase =
        (detail::LocalAccessorBaseHost *)&Arg;
    detail::LocalAccessorImplPtr LocalAccImpl =
        detail::getSyclObjImpl(*LocalAccBase);
    detail::LocalAccessorImplHost *Req = LocalAccImpl.get();
    MLocalAccStorage.push_back(std::move(LocalAccImpl));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(access::target::local), ArgIndex);
  }

  // setArgHelper for non-local accessor argument.
  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  typename detail::enable_if_t<AccessTarget != access::target::local, void>
  setArgHelper(
      int ArgIndex,
      accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> &&Arg) {
    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg;
    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
    detail::Requirement *Req = AccImpl.get();
    // Add accessor to the list of requirements.
    MRequirements.push_back(Req);
    // Store copy of the accessor.
    MAccStorage.push_back(std::move(AccImpl));
    // Add accessor to the list of arguments.
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, Req,
                       static_cast<int>(AccessTarget), ArgIndex);
  }

  template <typename T> void setArgHelper(int ArgIndex, T &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));

    if (!std::is_same<cl_mem, T>::value && std::is_pointer<T>::value) {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_pointer, StoredArg,
                         sizeof(T), ArgIndex);
    } else {
      MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout,
                         StoredArg, sizeof(T), ArgIndex);
    }
  }

  void setArgHelper(int ArgIndex, sampler &&Arg) {
    auto StoredArg = static_cast<void *>(storePlainArg(Arg));
    MArgs.emplace_back(detail::kernel_param_kind_t::kind_sampler, StoredArg,
                       sizeof(sampler), ArgIndex);
  }

  void verifyKernelInvoc(const kernel &Kernel) {
    if (is_host()) {
      throw invalid_object_error(
          "This kernel invocation method cannot be used on the host",
          PI_INVALID_DEVICE);
    }
    if (Kernel.is_host()) {
      throw invalid_object_error("Invalid kernel type, OpenCL expected",
                                 PI_INVALID_KERNEL);
    }
  }

  /// Stores the given lambda in a template-free wrapper.
  ///
  /// Also initializes the kernel name, argument list, and requirements using
  /// information from the integration header.
  ///
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName, typename KernelType, int Dims,
            typename LambdaArgType>
  void StoreLambda(KernelType KernelFunc) {
    constexpr bool IsCallableWithKernelHandler =
        detail::isKernelLambdaCallableWithKernelHandler<KernelType,
                                                        LambdaArgType>();
    if (IsCallableWithKernelHandler && MIsHost) {
      throw cl::sycl::feature_not_supported(
          "kernel_handler is not yet supported by host device.",
          PI_INVALID_OPERATION);
    }
    MHostKernel.reset(
        new detail::HostKernel<KernelType, LambdaArgType, Dims, KernelName>(
            KernelFunc));

    using KI = sycl::detail::KernelInfo<KernelName>;
    // An empty name indicates that the compilation happens without the
    // integration header, so don't perform things that require it.
    if (KI::getName() != nullptr && KI::getName()[0] != '\0') {
      // TODO support ESIMD in no-integration-header case too.
      MArgs.clear();
      extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(),
                                   &KI::getParamDesc(0), KI::isESIMD());
      MKernelName = KI::getName();
      MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName());
    } else {
      // Without the integration header, the accessors from the list (which
      // are associated with this handler) must be processed as arguments.
      MArgs = std::move(MAssociatedAccesors);
    }

    // If the kernel lambda is callable with a kernel_handler argument,
    // manifest the associated kernel handler.
    if constexpr (IsCallableWithKernelHandler) {
      getOrInsertHandlerKernelBundle(/*Insert=*/true);
    }
  }

  /// Checks whether it is possible to copy the source shape to the
  /// destination shape (the shapes are described by the assigned ranges) as
  /// rectangular regions.
  template <int Dims_Src, int Dims_Dst>
  static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
                                           const range<Dims_Dst> Dst) {
    if (Dims_Src > Dims_Dst)
      return false;
    for (size_t I = 0; I < Dims_Src; ++I)
      if (Src[I] > Dst[I])
        return false;
    return true;
  }

  // TODO: Delete these functions when ABI breaking changes are allowed.
  // Currently these functions are unused, but they are static members of the
  // exported class 'handler'; they made it into the SYCL library some time
  // ago and must stay there for a while.
  static id<1> getDelinearizedIndex(const range<1> Range, const size_t Index) {
    return detail::getDelinearizedId(Range, Index);
  }
  static id<2> getDelinearizedIndex(const range<2> Range, const size_t Index) {
    return detail::getDelinearizedId(Range, Index);
  }
  static id<3> getDelinearizedIndex(const range<3> Range, const size_t Index) {
    return detail::getDelinearizedId(Range, Index);
  }

  /// Handles some special cases of the copy operation from one accessor to
  /// another accessor. Returns true if the copy is handled here.
  ///
  /// \param Src is a source SYCL accessor.
  /// \param Dst is a destination SYCL accessor.
  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  detail::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost &&
        IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range()))
      return false;

    range<1> LinearizedRange(Src.size());
    parallel_for<class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc,
                                     TDst, DimDst, ModeDst, TargetDst,
                                     IsPHSrc, IsPHDst>>
        (LinearizedRange, [=](id<1> Id) {
          size_t Index = Id[0];
          id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
          id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
          Dst[DstId] = Src[SrcId];
        });
    return true;
  }

  /// Handles some special cases of the copy operation from one accessor to
  /// another accessor, where at least one accessor is zero-dimensional.
  /// Returns true if the copy is handled here.
  template <typename TSrc, int DimSrc, access::mode ModeSrc,
            access::target TargetSrc, typename TDst, int DimDst,
            access::mode ModeDst, access::target TargetDst,
            access::placeholder IsPHSrc, access::placeholder IsPHDst>
  detail::enable_if_t<DimSrc == 0 || DimDst == 0, bool>
  copyAccToAccHelper(accessor<TSrc, DimSrc, ModeSrc, TargetSrc, IsPHSrc> Src,
                     accessor<TDst, DimDst, ModeDst, TargetDst, IsPHDst> Dst) {
    if (!MIsHost)
      return false;

    single_task<class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc,
                                    TDst, DimDst, ModeDst, TargetDst,
                                    IsPHSrc, IsPHDst>>([=]() {
      *(Dst.get_pointer()) = *(Src.get_pointer());
    });
    return true;
  }

#ifndef __SYCL_DEVICE_ONLY__
  /// Copies the content of the memory object accessed by \p Src into the
  /// memory pointed to by \p Dst.
  ///
  /// \param Src is a source SYCL accessor.
  /// \param Dst is a pointer to destination memory.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<(Dim > 0)>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    range<Dim> Range = Src.get_range();
    parallel_for<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        (Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          using TSrcNonConst = typename detail::remove_const_t<TSrc>;
          (reinterpret_cast<TSrcNonConst *>(Dst))[LinearIndex] = Src[Index];
        });
  }

  /// Copies the content of a zero-dimensional accessor \p Src into the memory
  /// pointed to by \p Dst.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<Dim == 0>
  copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
                   TDst *Dst) {
    single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        ([=]() {
          using TSrcNonConst = typename detail::remove_const_t<TSrc>;
          *(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
        });
  }

  /// Copies the memory pointed to by \p Src into the memory object accessed
  /// by \p Dst.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<(Dim > 0)>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    range<Dim> Range = Dst.get_range();
    parallel_for<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        (Range, [=](id<Dim> Index) {
          const size_t LinearIndex = detail::getLinearIndex(Index, Range);
          Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
        });
  }

  /// Copies the memory pointed to by \p Src into a zero-dimensional accessor
  /// \p Dst.
  template <typename TSrc, typename TDst, int Dim, access::mode AccMode,
            access::target AccTarget, access::placeholder IsPH>
  detail::enable_if_t<Dim == 0>
  copyPtrToAccHost(TSrc *Src,
                   accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
    single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>
        ([=]() {
          *(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
        });
  }
#endif // __SYCL_DEVICE_ONLY__

  constexpr static bool isConstOrGlobal(access::target AccessTarget) {
    return AccessTarget == access::target::global_buffer ||
           AccessTarget == access::target::constant_buffer;
  }

  constexpr static bool isImageOrImageArray(access::target AccessTarget) {
    return AccessTarget == access::target::image ||
           AccessTarget == access::target::image_array;
  }

  constexpr static bool
  isValidTargetForExplicitOp(access::target AccessTarget) {
    return isConstOrGlobal(AccessTarget) || isImageOrImageArray(AccessTarget);
  }

  constexpr static bool isValidModeForSourceAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::read ||
           AccessMode == access::mode::read_write;
  }

  constexpr static bool
  isValidModeForDestinationAccessor(access::mode AccessMode) {
    return AccessMode == access::mode::write ||
           AccessMode == access::mode::read_write ||
           AccessMode == access::mode::discard_write ||
           AccessMode == access::mode::discard_read_write;
  }

  /// Defines and invokes a SYCL kernel function for the specified range.
  ///
  /// \param NumWorkItems is a range defining the index space.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName, typename KernelType, int Dims>
  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
                                KernelType KernelFunc) {
    throwIfActionIsCreated();
    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;

    // If a 1D kernel argument is an integral type, convert it to sycl::item<1>.
    using TransformedArgType =
        typename std::conditional<std::is_integral<LambdaArgType>::value &&
                                      Dims == 1,
                                  item<Dims>, LambdaArgType>::type;
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;

    // Range rounding can be disabled by the user.
    // Range rounding is not done on the host device.
    // Range rounding is supported only for newer SYCL standards.
#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) &&                  \
    !defined(DPCPP_HOST_DEVICE_OPENMP) &&                                      \
    !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
    // Range should be a multiple of this for reasonable performance.
    size_t MinFactorX = 16;
    // Range should be a multiple of this for improved performance.
    size_t GoodFactorX = 32;
    // Range should be at least this to make rounding worthwhile.
    size_t MinRangeX = 1024;

    // Check if rounding parameters have been set through the environment:
    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
    this->GetRangeRoundingSettings(MinFactorX, GoodFactorX, MinRangeX);

    // Disable the rounding-up optimizations under these conditions:
    // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
    // 2. The kernel is provided via an interoperability method.
    // 3. The API "this_item" is used inside the kernel.
    // 4. The range is already a multiple of the rounding factor.
    //
    // Cases 2 and 3 could be supported with extra effort.
    // As an optimization for the common case it is an
    // implementation choice to not support those scenarios.
    // Note that "this_item" is a free function, i.e. not tied to any
    // specific id or item. When concurrent parallel_fors are executing
    // on a device it is difficult to tell which parallel_for the call is
    // being made from. One could replicate portions of the
    // call-graph to make this_item calls kernel-specific, but this is
    // not considered worthwhile.

    // Get the kernel name to check condition 2.
    std::string KName = typeid(NameT *).name();
    using KI = detail::KernelInfo<KernelName>;
    bool DisableRounding =
        this->DisableRangeRounding() ||
        (KI::getName() == nullptr || KI::getName()[0] == '\0') ||
        (KI::callsThisItem());

    // Perform range rounding if rounding-up is enabled,
    // there are sufficient work-items to need rounding,
    // and the user-specified range is not a multiple of a "good" value.
    if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
        (NumWorkItems[0] % MinFactorX != 0)) {
      // It is sufficient to round up just the first dimension.
      // Multiplying the rounded-up value of the first dimension
      // by the values of the remaining dimensions (if any)
      // will yield a rounded-up value for the total range.
      size_t NewValX =
          ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
      if (this->RangeRoundingTrace())
        std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
                  << " to " << NewValX << std::endl;

      using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
      auto Wrapper =
          getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
              KernelFunc, NumWorkItems);

      using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
                                       decltype(Wrapper), NameWT>;

      range<Dims> AdjustedRange = NumWorkItems;
      AdjustedRange.set_range_dim0(NewValX);
      kernel_parallel_for_wrapper<KName, TransformedArgType>(Wrapper);
#ifndef __SYCL_DEVICE_ONLY__
      detail::checkValueRange<Dims>(AdjustedRange);
      MNDRDesc.set(std::move(AdjustedRange));
      StoreLambda<KName, decltype(Wrapper), Dims, TransformedArgType>(
          std::move(Wrapper));
      setType(detail::CG::Kernel);
#endif
    } else
#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ &&                     \
       // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE &&       \
       // SYCL_LANGUAGE_VERSION >= 202001
    {
      (void)NumWorkItems;
      kernel_parallel_for_wrapper<NameT, TransformedArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
      detail::checkValueRange<Dims>(NumWorkItems);
      MNDRDesc.set(std::move(NumWorkItems));
      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
          std::move(KernelFunc));
      setType(detail::CG::Kernel);
#endif
    }
  }
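
  // Worked example (illustrative, not part of this header): with the default
  // factors above (MinFactorX = 16, GoodFactorX = 32, MinRangeX = 1024), a 1D
  // range of 1000 is left untouched (1000 < MinRangeX), while a range of 1030
  // is rounded up to 1056, the next multiple of 32; the RoundedRangeKernel
  // wrapper then makes the extra 26 work-items return early. The factors can
  // be tuned per run, e.g.:
  //
  //   SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:1024 ./app
  //
  // or rounding can be disabled entirely by setting the environment variable
  // SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING.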

  /// Defines and invokes a SYCL kernel for the specified range.
  ///
  /// \param NumWorkItems is a range defining the index space.
  /// \param Kernel is a SYCL kernel object.
  template <int Dims>
  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
    throwIfActionIsCreated();
    verifyKernelInvoc(Kernel);
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    detail::checkValueRange<Dims>(NumWorkItems);
    MNDRDesc.set(std::move(NumWorkItems));
    setType(detail::CG::Kernel);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }

#ifdef SYCL_LANGUAGE_VERSION
#define __SYCL_KERNEL_ATTR__ __attribute__((sycl_kernel))
#else
#define __SYCL_KERNEL_ATTR__
#endif
  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task(KernelType KernelFunc) {
#else
  kernel_single_task(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc();
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine the kernel invocation kind.
  template <typename KernelName, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_single_task(const KernelType &KernelFunc, kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the names of these functions - "kernel_parallel_for" - are used by
  // the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for(KernelType KernelFunc) {
#else
  kernel_parallel_for(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the names of these functions - "kernel_parallel_for" - are used by
  // the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_parallel_for(const KernelType &KernelFunc, kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of this function - "kernel_parallel_for_work_group" - is
  // used by the Front End to determine the kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType>
  __SYCL_KERNEL_ATTR__ void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group(KernelType KernelFunc, kernel_handler KH) {
#else
  kernel_parallel_for_work_group(const KernelType &KernelFunc,
                                 kernel_handler KH) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
    (void)KernelFunc;
    (void)KH;
#endif
  }

  // Wrappers for the kernel_*** functions above, with and without support for
  // an additional kernel_handler argument.

  // NOTE: to support a kernel_handler argument in kernel lambdas, only the
  // kernel_***_wrapper functions must be called in this code.

  // Wrappers for kernel_single_task(...)

  template <typename KernelName, typename KernelType>
  void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_single_task_wrapper(KernelType KernelFunc) {
#else
  kernel_single_task_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    if constexpr (detail::isKernelLambdaCallableWithKernelHandler<
                      KernelType>()) {
      kernel_handler KH;
      kernel_single_task<KernelName>(KernelFunc, KH);
    } else {
      kernel_single_task<KernelName>(KernelFunc);
    }
  }

  // Wrappers for kernel_parallel_for(...)

  template <typename KernelName, typename ElementType, typename KernelType>
  void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    if constexpr (detail::isKernelLambdaCallableWithKernelHandler<
                      KernelType, ElementType>()) {
      kernel_handler KH;
      kernel_parallel_for<KernelName, ElementType>(KernelFunc, KH);
    } else {
      kernel_parallel_for<KernelName, ElementType>(KernelFunc);
    }
  }

  // Wrappers for kernel_parallel_for_work_group(...)

  template <typename KernelName, typename ElementType, typename KernelType>
  void
#ifdef __SYCL_NONCONST_FUNCTOR__
  kernel_parallel_for_work_group_wrapper(KernelType KernelFunc) {
#else
  kernel_parallel_for_work_group_wrapper(const KernelType &KernelFunc) {
#endif
#ifdef __SYCL_DEVICE_ONLY__
    detail::CheckDeviceCopyable<KernelType>();
#endif // __SYCL_DEVICE_ONLY__
    if constexpr (detail::isKernelLambdaCallableWithKernelHandler<
                      KernelType, ElementType>()) {
      kernel_handler KH;
      kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc, KH);
    } else {
      kernel_parallel_for_work_group<KernelName, ElementType>(KernelFunc);
    }
  }

  std::shared_ptr<detail::kernel_bundle_impl>
  getOrInsertHandlerKernelBundle(bool Insert) const;

  void setHandlerKernelBundle(
      const std::shared_ptr<detail::kernel_bundle_impl> &NewKernelBundleImpPtr);

  template <typename FuncT>
  detail::enable_if_t<
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void()>::value ||
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void(interop_handle)>::value>
  host_task_impl(FuncT &&Func) {
    throwIfActionIsCreated();

    MNDRDesc.set(range<1>(1));
    MArgs = std::move(MAssociatedAccesors);

    MHostTask.reset(new detail::HostTask(std::move(Func)));

    setType(detail::CG::CodeplayHostTask);
  }

public:
  handler(const handler &) = delete;
  handler(handler &&) = delete;
  handler &operator=(const handler &) = delete;
  handler &operator=(handler &&) = delete;

#if __cplusplus > 201402L
  template <auto &SpecName>
  void set_specialization_constant(
      typename std::remove_reference_t<decltype(SpecName)>::value_type Value) {

    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/true);

    detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
        KernelBundleImplPtr)
        .set_specialization_constant<SpecName>(Value);
  }

  template <auto &SpecName>
  typename std::remove_reference_t<decltype(SpecName)>::value_type
  get_specialization_constant() const {

    std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
        getOrInsertHandlerKernelBundle(/*Insert=*/true);

    return detail::createSyclObjFromImpl<kernel_bundle<bundle_state::input>>(
               KernelBundleImplPtr)
        .get_specialization_constant<SpecName>();
  }

#endif
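
  // Illustrative sketch (not part of this header): a specialization constant
  // is declared at namespace scope and set per command group; the kernel
  // reads it through the kernel_handler argument. Assuming a queue 'Q' and a
  // hypothetical constant 'MyConst':
  //
  //   constexpr sycl::specialization_id<int> MyConst{42};
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.set_specialization_constant<MyConst>(7);
  //     CGH.single_task<class SpecKernel>([=](sycl::kernel_handler KH) {
  //       int V = KH.get_specialization_constant<MyConst>(); // V == 7
  //       (void)V;
  //     });
  //   });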

  void
  use_kernel_bundle(const kernel_bundle<bundle_state::executable> &ExecBundle) {
    setHandlerKernelBundle(detail::getSyclObjImpl(ExecBundle));
  }

  /// Requires access to the memory object associated with the placeholder
  /// accessor.
  ///
  /// Placeholder accessors are typically used when the binding of an accessor
  /// to a command group handler is delayed.
  ///
  /// \param Acc is a SYCL accessor describing the required memory region.
  template <typename DataT, int Dims, access::mode AccMode,
            access::target AccTarget>
  void
  require(accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t>
              Acc) {
#ifndef __SYCL_DEVICE_ONLY__
    associateWithHandler(&Acc, AccTarget);
#else
    (void)Acc;
#endif
  }
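
  // Illustrative sketch (not part of this header): a placeholder accessor is
  // constructed outside any command group and bound later with require().
  // Assuming a queue 'Q' and a buffer 'Buf':
  //
  //   sycl::accessor<int, 1, sycl::access::mode::read_write,
  //                  sycl::access::target::global_buffer,
  //                  sycl::access::placeholder::true_t>
  //       PAcc(Buf);
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.require(PAcc); // registers the memory requirement
  //     CGH.single_task<class UsePlaceholder>([=] { PAcc[0] += 1; });
  //   });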

  /// Registers an event dependency on this command group.
  ///
  /// \param Event is a valid SYCL event on which this command group depends.
  void depends_on(event Event) {
    MEvents.push_back(detail::getSyclObjImpl(Event));
  }

  /// Registers event dependencies on this command group.
  ///
  /// \param Events is a vector of valid SYCL events on which this command
  /// group depends.
  void depends_on(const std::vector<event> &Events) {
    for (const event &Event : Events) {
      MEvents.push_back(detail::getSyclObjImpl(Event));
    }
  }
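
  // Illustrative sketch (not part of this header): chaining command groups
  // through explicit event dependencies. Assuming a queue 'Q', USM pointers
  // 'A' and 'B', and a size 'N':
  //
  //   sycl::event E1 = Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class Produce>(sycl::range<1>{N},
  //                                     [=](sycl::id<1> I) { A[I[0]] = 1; });
  //   });
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.depends_on(E1); // runs only after 'Produce' completes
  //     CGH.parallel_for<class Consume>(
  //         sycl::range<1>{N}, [=](sycl::id<1> I) { B[I[0]] = A[I[0]]; });
  //   });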

  template <typename T>
  using remove_cv_ref_t =
      typename detail::remove_cv_t<detail::remove_reference_t<T>>;

  template <typename U, typename T>
  using is_same_type = std::is_same<remove_cv_ref_t<U>, remove_cv_ref_t<T>>;

  template <typename T> struct ShouldEnableSetArg {
    static constexpr bool value =
        std::is_trivially_copyable<detail::remove_reference_t<T>>::value
#if SYCL_LANGUAGE_VERSION && SYCL_LANGUAGE_VERSION <= 201707
            && std::is_standard_layout<detail::remove_reference_t<T>>::value
#endif
        || is_same_type<sampler, T>::value // Sampler
        || (!is_same_type<cl_mem, T>::value &&
            std::is_pointer<remove_cv_ref_t<T>>::value) // USM
        || is_same_type<cl_mem, T>::value;              // Interop
  };

  /// Sets an argument for an OpenCL interoperability kernel.
  ///
  /// \param ArgIndex is the positional number of the argument to be set.
  /// \param Arg is the argument value to be set.
  template <typename T>
  typename detail::enable_if_t<ShouldEnableSetArg<T>::value, void>
  set_arg(int ArgIndex, T &&Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  template <typename DataT, int Dims, access::mode AccessMode,
            access::target AccessTarget, access::placeholder IsPlaceholder>
  void
  set_arg(int ArgIndex,
          accessor<DataT, Dims, AccessMode, AccessTarget, IsPlaceholder> Arg) {
    setArgHelper(ArgIndex, std::move(Arg));
  }

  /// Sets arguments for an OpenCL interoperability kernel.
  ///
  /// Registers the given arguments, in order, as the kernel's arguments.
  ///
  /// \param Args are the arguments to be set.
  template <typename... Ts> void set_args(Ts &&... Args) {
    setArgsHelper(0, std::move(Args)...);
  }
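
  // Illustrative sketch (not part of this header): set_args() forwards each
  // argument, in order, to a kernel created via backend interop, where the
  // runtime cannot deduce arguments from a lambda. Assuming a queue 'Q', an
  // interop sycl::kernel 'K' taking a buffer argument and a float, a buffer
  // 'Buf', and a size 'N':
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
  //     CGH.set_args(Acc, 2.0f); // same as set_arg(0, Acc); set_arg(1, 2.0f)
  //     CGH.parallel_for(sycl::range<1>{N}, K);
  //   });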

  /// Defines and invokes a SYCL kernel function as a single work-item.
  ///
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void single_task(KernelType KernelFunc) {
#else
  void single_task(const KernelType &KernelFunc) {
#endif
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    kernel_single_task_wrapper<NameT>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    // No need to check if range is out of INT_MAX limits as it's compile-time
    // known constant.
    MNDRDesc.set(range<1>{1});

    StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(KernelFunc);
    setType(detail::CG::Kernel);
#endif
  }
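
  // Illustrative sketch (not part of this header): single_task() enqueues
  // exactly one work-item. Assuming a queue 'Q' and a buffer 'Buf' holding at
  // least one int:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::write>(CGH);
  //     CGH.single_task<class InitFirst>([=] { Acc[0] = 42; });
  //   });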

  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) {
#else
  void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) {
#endif
    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
  }

  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) {
#else
  void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) {
#endif
    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
  }

  template <typename KernelName = detail::auto_name, typename KernelType>
#ifdef __SYCL_NONCONST_FUNCTOR__
  void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) {
#else
  void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) {
#endif
    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
  }
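
  // Illustrative sketch (not part of this header): the overloads above accept
  // 1-, 2- and 3-dimensional ranges; the kernel argument type is deduced (for
  // 1D, an integral parameter is converted to item<1> internally). Assuming a
  // queue 'Q', a buffer 'Buf', and a size 'N':
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::write>(CGH);
  //     CGH.parallel_for<class Square>(sycl::range<1>{N},
  //                                    [=](sycl::item<1> It) {
  //                                      size_t I = It.get_id(0);
  //                                      Acc[I] = I * I;
  //                                    });
  //   });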

  /// Defines and invokes a SYCL kernel on the host device.
  ///
  /// \param Func is a SYCL kernel function defined by a lambda or a named
  /// function object type.
  template <typename FuncT>
  __SYCL_DEPRECATED(
      "run_on_host_intel() is deprecated, use host_task() instead")
  void run_on_host_intel(FuncT Func) {
    throwIfActionIsCreated();
    // No need to check if range is out of INT_MAX limits as it's compile-time
    // known constant
    MNDRDesc.set(range<1>{1});

    MArgs = std::move(MAssociatedAccesors);
    MHostKernel.reset(
        new detail::HostKernel<FuncT, void, 1, void>(std::move(Func)));
    setType(detail::CG::RunOnHostIntel);
  }

  /// Enqueues a command to the SYCL runtime to invoke \p Func once.
  template <typename FuncT>
  detail::enable_if_t<
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void()>::value ||
      detail::check_fn_signature<detail::remove_reference_t<FuncT>,
                                 void(interop_handle)>::value>
  host_task(FuncT &&Func) {
    host_task_impl(Func);
  }
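
  // Illustrative sketch (not part of this header): host_task() runs a
  // callable on the host once its dependencies are satisfied; with an
  // interop_handle parameter the callable can reach backend-native objects.
  // Assuming a queue 'Q' and a buffer 'Buf':
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     auto Acc = Buf.get_access<sycl::access::mode::read>(CGH);
  //     CGH.host_task([=](sycl::interop_handle IH) {
  //       // e.g. obtain the backend-native memory for Acc via IH and hand it
  //       // to a native library here.
  //       (void)IH;
  //     });
  //   });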

// Replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
// or const KernelType &KernelFunc
#ifdef __SYCL_NONCONST_FUNCTOR__
#define _KERNELFUNCPARAM(a) KernelType a
#else
#define _KERNELFUNCPARAM(a) const KernelType &a
#endif

  /// Defines and invokes a SYCL kernel function for the specified range and
  /// offset.
  ///
  /// \param NumWorkItems is a range defining the index space.
  /// \param WorkItemOffset is an offset to be applied to each work-item id.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL2020")
  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                    _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
    (void)NumWorkItems;
    (void)WorkItemOffset;
    kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
    MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif
  }

  /// Defines and invokes a SYCL kernel function for the specified nd_range.
  ///
  /// \param ExecutionRange is a range defining both the global and local
  /// index spaces.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  void parallel_for(nd_range<Dims> ExecutionRange,
                    _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
    (void)ExecutionRange;
    kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(ExecutionRange);
    MNDRDesc.set(std::move(ExecutionRange));
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif
  }
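
  // Illustrative sketch (not part of this header): an nd_range specifies both
  // the global size and the work-group (local) size, and the kernel receives
  // an nd_item. Assuming a queue 'Q' and a global size divisible by the local
  // size:
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class Tiled>(
  //         sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}},
  //         [=](sycl::nd_item<1> It) {
  //           size_t G = It.get_global_id(0); // 0..1023
  //           size_t L = It.get_local_id(0);  // 0..63 within each work-group
  //           (void)G; (void)L;
  //         });
  //   });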

  /// Defines and invokes a SYCL kernel function for the specified range, with
  /// one reduction object.
  ///
  /// \param Range is a range defining the index space.
  /// \param Redu is a reduction operation.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  void parallel_for(range<Dims> Range, Reduction Redu,
                    _KERNELFUNCPARAM(KernelFunc)) {
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;

    // Before running the kernels, check that the device has enough local
    // memory to hold the local arrays required for the tree-reduction
    // algorithm.
    constexpr bool IsTreeReduction =
        !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
    size_t OneElemSize =
        IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
    uint32_t NumConcurrentWorkGroups =
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
        __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
#else
        ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
#endif
    // TODO: currently the maximal work group size is determined for the given
    // queue/device, while it is safer to use queries to the kernel
    // pre-compiled for the device.
    size_t MaxWGSize =
        ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
    ext::oneapi::detail::reduCGFunc<KernelName>(
        *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
    if (Reduction::is_usm ||
        (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
        (!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
      this->finalize();
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                                    Redu);
      MLastEvent = CopyHandler.finalize();
    }
  }
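
  // Illustrative sketch (not part of this header): a reduction over a plain
  // range using this overload. Assuming a queue 'Q', USM pointers 'Data'
  // (input) and 'Sum' (one-element result), and a size 'N':
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for<class SumKernel>(
  //         sycl::range<1>{N},
  //         sycl::ext::oneapi::reduction(Sum, sycl::ext::oneapi::plus<int>()),
  //         [=](sycl::item<1> It, auto &Red) {
  //           Red += Data[It.get_id(0)];
  //         });
  //   });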

  /// Implements parallel_for() accepting an nd_range \p Range and one
  /// reduction object. This version uses fast sycl::atomic operations to
  /// update the user's reduction variable at the end of each work-group's
  /// work.
  //
  // If the reduction variable must be initialized with the identity value
  // before the kernel run, then an additional working accessor is created,
  // initialized with the identity value and used in the kernel. That working
  // accessor is then copied to user's accessor or USM pointer after
  // the kernel run.
  // For USM pointers without initialize_to_identity properties the same
  // scheme with a working accessor is used, as re-using the user's USM
  // pointer in the kernel would require creating another variant of the
  // user's kernel, which does not seem efficient.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  detail::enable_if_t<Reduction::has_fast_atomics>
  parallel_for(nd_range<Dims> Range, Reduction Redu,
               _KERNELFUNCPARAM(KernelFunc)) {
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);

    if (Reduction::is_usm || Redu.initializeToIdentity()) {
      this->finalize();
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                                    Redu);
      MLastEvent = CopyHandler.finalize();
    }
  }

  /// Implements parallel_for() accepting an nd_range \p Range and one
  /// reduction object, for reductions that need 64-bit floating point atomic
  /// additions (requires the atomic64 device aspect).
  //
  // If the reduction variable must be initialized with the identity value
  // before the kernel run, then an additional working accessor is created,
  // initialized with the identity value and used in the kernel. That working
  // accessor is then copied to user's accessor or USM pointer after
  // the kernel run.
  // For USM pointers without initialize_to_identity properties the same
  // scheme with a working accessor is used, as re-using the user's USM
  // pointer in the kernel would require creating another variant of the
  // user's kernel, which does not seem efficient.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  detail::enable_if_t<Reduction::has_atomic_add_float64>
  parallel_for(nd_range<Dims> Range, Reduction Redu,
               _KERNELFUNCPARAM(KernelFunc)) {

    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
    device D = detail::getDeviceFromHandler(*this);

    if (D.has(aspect::atomic64)) {

      ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
                                                          Range, Redu);

      if (Reduction::is_usm || Redu.initializeToIdentity()) {
        this->finalize();
        handler CopyHandler(QueueCopy, MIsHost);
        CopyHandler.saveCodeLoc(MCodeLoc);
        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
            CopyHandler, Redu);
        MLastEvent = CopyHandler.finalize();
      }
    } else {
      parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
    }
  }

  /// Defines and invokes a SYCL kernel function for the specified nd_range,
  /// with one reduction object. This is the generic version, used when the
  /// reduction cannot be done with fast atomics; it runs the tree-reduction
  /// algorithm implemented in parallel_for_Impl() below.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims, typename Reduction>
  detail::enable_if_t<!Reduction::has_fast_atomics &&
                      !Reduction::has_atomic_add_float64>
  parallel_for(nd_range<Dims> Range, Reduction Redu,
               _KERNELFUNCPARAM(KernelFunc)) {

    parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
  }

  template <typename KernelName, typename KernelType, int Dims,
            typename Reduction>
  detail::enable_if_t<!Reduction::has_fast_atomics>
  parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
                    KernelType KernelFunc) {
    // This parallel_for() is lowered to the following sequence:
    // 1) Call a kernel that a) calls the user's lambda function and
    //    b) performs one iteration of reduction, storing the partial
    //    reductions/sums to either a newly created global buffer or to the
    //    user's reduction accessor. So, if the original 'Range' has totally
    //    N1 elements and the work-group size is W, then after the first
    //    iteration there will be N2 partial sums, where N2 = N1 / W.
    //    If (N2 == 1) then the partial sum is written to the user's accessor.
    //    Otherwise, a new global buffer is created and the partial sums are
    //    written to it.
    // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
    //    necessary to reduce all partial sums into one final sum.

    // Before running the kernels, check that the device has enough local
    // memory to hold the local arrays that may be required for the reduction
    // algorithm.
    // TODO: If the work-group size is limited by the local memory, then a
    // special version of the main kernel may be created. One that would not
    // use local accessors, which means it would not do the reduction in the
    // main kernel, but simply generate Range.get_global_range().size()
    // partial sums, leaving the reduction work to the additional/aux kernels.
    constexpr bool HFR = Reduction::has_fast_reduce;
    size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
    // TODO: currently the maximal work group size is determined for the given
    // queue/device, while it may be safer to use queries to the kernel
    // compiled for the device.
    size_t MaxWGSize =
        ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
    if (Range.get_local_range().size() > MaxWGSize)
      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                " than " +
                                    std::to_string(MaxWGSize),
                                PI_INVALID_WORK_GROUP_SIZE);

    // 1. Call the kernel that includes the user's lambda function.
    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
    this->finalize();

    // 2. Run the additional kernel as many times as needed to reduce
    // all partial sums into one scalar.

    // TODO: Create a special slow/sequential version of the kernel that would
    // handle the reduction instead of reporting an assert below.
    if (MaxWGSize <= 1)
      throw sycl::runtime_error("The implementation handling parallel_for with "
                                "reduction requires the maximal work group "
                                "size to be greater than 1 to converge. "
                                "The maximal work group size depends on the "
                                "device and the size of the objects passed to "
                                "the reduction.",
                                PI_INVALID_WORK_GROUP_SIZE);
    size_t NWorkItems = Range.get_group_range().size();
    while (NWorkItems > 1) {
      handler AuxHandler(QueueCopy, MIsHost);
      AuxHandler.saveCodeLoc(MCodeLoc);

      NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
          AuxHandler, NWorkItems, MaxWGSize, Redu);
      MLastEvent = AuxHandler.finalize();
    } // end while (NWorkItems > 1)

    if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                                    Redu);
      MLastEvent = CopyHandler.finalize();
    }
  }
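
  // Worked example (illustrative, not part of this header): for an nd_range
  // with N1 = 4096 work-items and work-group size W = 64, the main kernel
  // above leaves N2 = 4096 / 64 = 64 partial sums; a single aux-kernel pass
  // then reduces those 64 values in one group of 64 (64 / 64 = 1), producing
  // the final sum.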

  // This version of parallel_for may handle one or more reductions packed in
  // the \p Rest argument. Note though that the last element in the \p Rest
  // pack is the kernel function.
  // TODO: this variant is currently enabled for 2+ reductions only, as the
  // versions handling 1 reduction variable are more efficient right now.
  //
  // Algorithm:
  // 1) discard_write accessor (DWAcc), InitializeToIdentity = true:
  //    a) Create uninitialized buffer and read_write accessor (RWAcc).
  //    b) discard-write partial sums to RWAcc.
  //    c) Repeat the steps (a) and (b) to get one final sum.
  //    d) Copy RWAcc to DWAcc.
  // 2) read_write accessor (RWAcc), InitializeToIdentity = false:
  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
  //       re-use user's RWAcc (if #work-groups is 1).
  //    b) discard-write to RWAcc (#WG > 1), or update-write (#WG == 1).
  //    c) Repeat the steps (a) and (b) to get one final sum.
  // 3) read_write accessor (RWAcc), InitializeToIdentity = true:
  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
  //       re-use user's RWAcc (if #work-groups is 1).
  //    b) discard-write to RWAcc.
  //    c) Repeat the steps (a) and (b) to get one final sum.
  // 4) USM pointer, InitializeToIdentity = false:
  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
  //       re-use user's USM pointer (if #work-groups is 1).
  //    b) discard-write to RWAcc (#WG > 1) or
  //       update-write to USM pointer (#WG == 1).
  //    c) Repeat the steps (a) and (b) to get one final sum.
  // 5) USM pointer, InitializeToIdentity = true:
  //    a) Create new uninitialized buffer (if #work-groups > 1) and RWAcc or
  //       re-use user's USM pointer (if #work-groups is 1).
  //    b) discard-write to RWAcc (#WG > 1) or
  //       discard-write to USM pointer (#WG == 1).
  //    c) Repeat the steps (a) and (b) to get one final sum.
  template <typename KernelName = detail::auto_name, int Dims,
            typename... RestT>
  std::enable_if_t<
      (sizeof...(RestT) >= 3 &&
       ext::oneapi::detail::AreAllButLastReductions<RestT...>::value)>
  parallel_for(nd_range<Dims> Range, RestT... Rest) {
    std::tuple<RestT...> ArgsTuple(Rest...);
    constexpr size_t NumArgs = sizeof...(RestT);
    auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
    auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
    auto ReduTuple =
        ext::oneapi::detail::tuple_select_elements(ArgsTuple, ReduIndices);

    size_t LocalMemPerWorkItem =
        ext::oneapi::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
    // TODO: currently the maximal work group size is determined for the given
    // queue/device, while it is safer to use queries to the kernel compiled
    // for the device.
    size_t MaxWGSize =
        ext::oneapi::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
    if (Range.get_local_range().size() > MaxWGSize)
      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                " than " +
                                    std::to_string(MaxWGSize),
                                PI_INVALID_WORK_GROUP_SIZE);

    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
                                                ReduTuple, ReduIndices);
    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
    this->finalize();

    size_t NWorkItems = Range.get_group_range().size();
    while (NWorkItems > 1) {
      handler AuxHandler(QueueCopy, MIsHost);
      AuxHandler.saveCodeLoc(MCodeLoc);

      NWorkItems =
          ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
              AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
      MLastEvent = AuxHandler.finalize();
    } // end while (NWorkItems > 1)

    auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
        QueueCopy, MIsHost, ReduTuple, ReduIndices);
    if (CopyEvent)
      MLastEvent = *CopyEvent;
  }

  /// Hierarchical kernel invocation method of a kernel defined as a lambda
  /// encoding the body of each work-group to launch.
  ///
  /// The lambda may contain multiple calls to parallel_for_work_item()
  /// representing the execution on each work-item. Launches NumWorkGroups
  /// work-groups of runtime-defined size.
  ///
  /// \param NumWorkGroups is a range describing the number of work-groups in
  /// each dimension.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  void parallel_for_work_group(range<Dims> NumWorkGroups,
                               _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
    (void)NumWorkGroups;
    kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    detail::checkValueRange<Dims>(NumWorkGroups);
    MNDRDesc.setNumWorkGroups(NumWorkGroups);
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif // __SYCL_DEVICE_ONLY__
  }

  /// Hierarchical kernel invocation method of a kernel defined as a lambda
  /// encoding the body of each work-group to launch.
  ///
  /// The lambda may contain multiple calls to parallel_for_work_item()
  /// representing the execution on each work-item. Launches NumWorkGroups
  /// work-groups of WorkGroupSize size.
  ///
  /// \param NumWorkGroups is a range describing the number of work-groups in
  /// each dimension.
  /// \param WorkGroupSize is a range describing the size of work-groups in
  /// each dimension.
  /// \param KernelFunc is a SYCL kernel function.
  template <typename KernelName = detail::auto_name, typename KernelType,
            int Dims>
  void parallel_for_work_group(range<Dims> NumWorkGroups,
                               range<Dims> WorkGroupSize,
                               _KERNELFUNCPARAM(KernelFunc)) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    using LambdaArgType =
        sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
    (void)NumWorkGroups;
    (void)WorkGroupSize;
    kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
#ifndef __SYCL_DEVICE_ONLY__
    nd_range<Dims> ExecRange =
        nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
    detail::checkValueRange<Dims>(ExecRange);
    MNDRDesc.set(std::move(ExecRange));
    StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
    setType(detail::CG::Kernel);
#endif // __SYCL_DEVICE_ONLY__
  }
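
  // Illustrative sketch (not part of this header): hierarchical parallelism
  // with 8 work-groups of 64 work-items each; the outer lambda runs once per
  // work-group and parallel_for_work_item() fans out across its work-items.
  // Assuming a queue 'Q':
  //
  //   Q.submit([&](sycl::handler &CGH) {
  //     CGH.parallel_for_work_group<class Hier>(
  //         sycl::range<1>{8}, sycl::range<1>{64}, [=](sycl::group<1> G) {
  //           // runs once per work-group
  //           G.parallel_for_work_item([&](sycl::h_item<1> It) {
  //             // runs for each of the 64 work-items in group G
  //             (void)It;
  //           });
  //         });
  //   });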

  /// Invokes a SYCL kernel.
  ///
  /// Executes exactly once.
  ///
  /// \param Kernel is a SYCL kernel object.
  void single_task(kernel Kernel) {
    throwIfActionIsCreated();
    verifyKernelInvoc(Kernel);
    // No need to check if range is out of INT_MAX limits as it's compile-time
    // known constant
    MNDRDesc.set(range<1>{1});
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    setType(detail::CG::Kernel);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }

  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
    parallel_for_impl(NumWorkItems, Kernel);
  }

  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
    parallel_for_impl(NumWorkItems, Kernel);
  }

  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
    parallel_for_impl(NumWorkItems, Kernel);
  }

  /// Defines and invokes a SYCL kernel for the specified range and offset.
  ///
  /// \param NumWorkItems is a range defining the index space.
  /// \param WorkItemOffset is an offset to be applied to each work-item id.
  /// \param Kernel is a SYCL kernel object.
  template <int Dims>
  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                    kernel Kernel) {
    throwIfActionIsCreated();
    verifyKernelInvoc(Kernel);
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
    MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
    setType(detail::CG::Kernel);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }

  /// Defines and invokes a SYCL kernel for the specified nd_range.
  ///
  /// \param NDRange is a range defining the global and local index spaces.
  /// \param Kernel is a SYCL kernel object.
  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
    throwIfActionIsCreated();
    verifyKernelInvoc(Kernel);
    MKernel = detail::getSyclObjImpl(std::move(Kernel));
    detail::checkValueRange<Dims>(NDRange);
    MNDRDesc.set(std::move(NDRange));
    setType(detail::CG::Kernel);
    extractArgsAndReqs();
    MKernelName = getKernelName();
  }
1827 
1834  template <typename KernelName = detail::auto_name, typename KernelType>
1835  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
1836  throwIfActionIsCreated();
1837  using NameT =
1838  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1839  (void)Kernel;
1840  kernel_single_task<NameT>(KernelFunc);
1841 #ifndef __SYCL_DEVICE_ONLY__
1842  // No need to check if the range is out of INT_MAX limits as it's a
1843  // compile-time known constant.
1844  MNDRDesc.set(range<1>{1});
1845  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1846  setType(detail::CG::Kernel);
1847  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1848  extractArgsAndReqs();
1849  MKernelName = getKernelName();
1850  } else
1851  StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(std::move(KernelFunc));
1852 #else
1853  detail::CheckDeviceCopyable<KernelType>();
1854 #endif
1855  }
1856 
1860  template <typename FuncT>
1861  __SYCL_DEPRECATED("interop_task() is deprecated, use host_task() instead")
1862  void interop_task(FuncT Func) {
1863 
1864  MInteropTask.reset(new detail::InteropTask(std::move(Func)));
1865  setType(detail::CG::CodeplayInteropTask);
1866  }
1867 
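 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Shows the host_task() replacement suggested by the deprecation
 //  message above; `Q` and `Buf` are assumed.]
 //
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     auto Acc = Buf.get_access<cl::sycl::access::mode::read>(CGH);
 //     CGH.host_task([=] {
 //       // Runs once on the host after the dependencies are satisfied.
 //     });
 //   });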
1875  template <typename KernelName = detail::auto_name, typename KernelType,
1876  int Dims>
1877  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
1878  _KERNELFUNCPARAM(KernelFunc)) {
1879  throwIfActionIsCreated();
1880  using NameT =
1881  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1882  using LambdaArgType = detail::lambda_arg_type<KernelType, item<Dims>>;
1883  (void)Kernel;
1884  (void)NumWorkItems;
1885  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1886 #ifndef __SYCL_DEVICE_ONLY__
1887  detail::checkValueRange<Dims>(NumWorkItems);
1888  MNDRDesc.set(std::move(NumWorkItems));
1889  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1890  setType(detail::CG::Kernel);
1891  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1892  extractArgsAndReqs();
1893  MKernelName = getKernelName();
1894  } else
1895  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
1896  std::move(KernelFunc));
1897 #endif
1898  }
1899 
1909  template <typename KernelName = detail::auto_name, typename KernelType,
1910  int Dims>
1911  __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020")
1912  void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
1913  id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
1914  throwIfActionIsCreated();
1915  using NameT =
1916  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1917  using LambdaArgType = detail::lambda_arg_type<KernelType, item<Dims>>;
1918  (void)Kernel;
1919  (void)NumWorkItems;
1920  (void)WorkItemOffset;
1921  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1922 #ifndef __SYCL_DEVICE_ONLY__
1923  detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
1924  MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
1925  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1926  setType(detail::CG::Kernel);
1927  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1928  extractArgsAndReqs();
1929  MKernelName = getKernelName();
1930  } else
1931  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
1932  std::move(KernelFunc));
1933 #endif
1934  }
1935 
1945  template <typename KernelName = detail::auto_name, typename KernelType,
1946  int Dims>
1947  void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
1948  _KERNELFUNCPARAM(KernelFunc)) {
1949  throwIfActionIsCreated();
1950  using NameT =
1951  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1952  using LambdaArgType =
1953  detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1954  (void)Kernel;
1955  (void)NDRange;
1956  kernel_parallel_for_wrapper<NameT, LambdaArgType>(KernelFunc);
1957 #ifndef __SYCL_DEVICE_ONLY__
1958  detail::checkValueRange<Dims>(NDRange);
1959  MNDRDesc.set(std::move(NDRange));
1960  MKernel = detail::getSyclObjImpl(std::move(Kernel));
1961  setType(detail::CG::Kernel);
1962  if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
1963  extractArgsAndReqs();
1964  MKernelName = getKernelName();
1965  } else
1966  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(
1967  std::move(KernelFunc));
1968 #endif
1969  }
1970 
1984  template <typename KernelName = detail::auto_name, typename KernelType,
1985  int Dims>
1986  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
1987  _KERNELFUNCPARAM(KernelFunc)) {
1988  throwIfActionIsCreated();
1989  using NameT =
1990  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
1991  using LambdaArgType =
1992  detail::lambda_arg_type<KernelType, group<Dims>>;
1993  (void)Kernel;
1994  (void)NumWorkGroups;
1995  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
1996 #ifndef __SYCL_DEVICE_ONLY__
1997  detail::checkValueRange<Dims>(NumWorkGroups);
1998  MNDRDesc.setNumWorkGroups(NumWorkGroups);
1999  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2000  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2001  setType(detail::CG::Kernel);
2002 #endif // __SYCL_DEVICE_ONLY__
2003  }
2004 
2020  template <typename KernelName = detail::auto_name, typename KernelType,
2021  int Dims>
2022  void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
2023  range<Dims> WorkGroupSize,
2024  _KERNELFUNCPARAM(KernelFunc)) {
2025  throwIfActionIsCreated();
2026  using NameT =
2027  typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2028  using LambdaArgType =
2029  detail::lambda_arg_type<KernelType, group<Dims>>;
2030  (void)Kernel;
2031  (void)NumWorkGroups;
2032  (void)WorkGroupSize;
2033  kernel_parallel_for_work_group_wrapper<NameT, LambdaArgType>(KernelFunc);
2034 #ifndef __SYCL_DEVICE_ONLY__
2035  nd_range<Dims> ExecRange =
2036  nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
2037  detail::checkValueRange<Dims>(ExecRange);
2038  MNDRDesc.set(std::move(ExecRange));
2039  MKernel = detail::getSyclObjImpl(std::move(Kernel));
2040  StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
2041  setType(detail::CG::Kernel);
2042 #endif // __SYCL_DEVICE_ONLY__
2043  }
2044 
2045  // Clean up KERNELFUNC macro.
2046 #undef _KERNELFUNCPARAM
2047 
2048  // Explicit copy operations API
2049 
2057  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2058  access::target AccessTarget,
2059  access::placeholder IsPlaceholder = access::placeholder::false_t>
2060  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2061  std::shared_ptr<T_Dst> Dst) {
2062  throwIfActionIsCreated();
2063  static_assert(isValidTargetForExplicitOp(AccessTarget),
2064  "Invalid accessor target for the copy method.");
2065  static_assert(isValidModeForSourceAccessor(AccessMode),
2066  "Invalid accessor mode for the copy method.");
2067  // Make sure the data the shared_ptr points to is not released until we
2068  // finish working with it.
2069  MSharedPtrStorage.push_back(Dst);
2070  typename std::shared_ptr<T_Dst>::element_type *RawDstPtr = Dst.get();
2071  copy(Src, RawDstPtr);
2072  }
2073 
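 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Assumes a cl::sycl::queue `Q`, a buffer `Buf` of `N` ints, and
 //  host storage owned by a std::shared_ptr.]
 //
 //   std::shared_ptr<int> Host(new int[N], std::default_delete<int[]>());
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     auto Src = Buf.get_access<cl::sycl::access::mode::read>(CGH);
 //     CGH.copy(Src, Host);  // device-to-host explicit copy
 //   });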
2081  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2082  access::target AccessTarget,
2083  access::placeholder IsPlaceholder = access::placeholder::false_t>
2084  void
2085  copy(std::shared_ptr<T_Src> Src,
2086  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2087  throwIfActionIsCreated();
2088  static_assert(isValidTargetForExplicitOp(AccessTarget),
2089  "Invalid accessor target for the copy method.");
2090  static_assert(isValidModeForDestinationAccessor(AccessMode),
2091  "Invalid accessor mode for the copy method.");
2092  // Make sure the data the shared_ptr points to is not released until we
2093  // finish working with it.
2094  MSharedPtrStorage.push_back(Src);
2095  typename std::shared_ptr<T_Src>::element_type *RawSrcPtr = Src.get();
2096  copy(RawSrcPtr, Dst);
2097  }
2098 
2106  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2107  access::target AccessTarget,
2108  access::placeholder IsPlaceholder = access::placeholder::false_t>
2109  void copy(accessor<T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder> Src,
2110  T_Dst *Dst) {
2111  throwIfActionIsCreated();
2112  static_assert(isValidTargetForExplicitOp(AccessTarget),
2113  "Invalid accessor target for the copy method.");
2114  static_assert(isValidModeForSourceAccessor(AccessMode),
2115  "Invalid accessor mode for the copy method.");
2116 #ifndef __SYCL_DEVICE_ONLY__
2117  if (MIsHost) {
2118  // TODO: Temporary implementation for host. Should be handled by memory
2119  // manager.
2120  copyAccToPtrHost(Src, Dst);
2121  return;
2122  }
2123 #endif
2124  setType(detail::CG::CopyAccToPtr);
2125 
2126  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src;
2127  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2128 
2129  MRequirements.push_back(AccImpl.get());
2130  MSrcPtr = static_cast<void *>(AccImpl.get());
2131  MDstPtr = static_cast<void *>(Dst);
2132  // Store a copy of the accessor in local storage to make sure it stays
2133  // alive until we finish.
2134  MAccStorage.push_back(std::move(AccImpl));
2135  }
2136 
2144  template <typename T_Src, typename T_Dst, int Dims, access::mode AccessMode,
2145  access::target AccessTarget,
2146  access::placeholder IsPlaceholder = access::placeholder::false_t>
2147  void
2148  copy(const T_Src *Src,
2149  accessor<T_Dst, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst) {
2150  throwIfActionIsCreated();
2151  static_assert(isValidTargetForExplicitOp(AccessTarget),
2152  "Invalid accessor target for the copy method.");
2153  static_assert(isValidModeForDestinationAccessor(AccessMode),
2154  "Invalid accessor mode for the copy method.");
2155 #ifndef __SYCL_DEVICE_ONLY__
2156  if (MIsHost) {
2157  // TODO: Temporary implementation for host. Should be handled by memory
2158  // manager.
2159  copyPtrToAccHost(Src, Dst);
2160  return;
2161  }
2162 #endif
2163  setType(detail::CG::CopyPtrToAcc);
2164 
2165  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2166  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2167 
2168  MRequirements.push_back(AccImpl.get());
2169  MSrcPtr = const_cast<T_Src *>(Src);
2170  MDstPtr = static_cast<void *>(AccImpl.get());
2171  // Store a copy of the accessor in local storage to make sure it stays
2172  // alive until we finish.
2173  MAccStorage.push_back(std::move(AccImpl));
2174  }
2175 
2183  template <
2184  typename T_Src, int Dims_Src, access::mode AccessMode_Src,
2185  access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
2186  access::mode AccessMode_Dst, access::target AccessTarget_Dst,
2187  access::placeholder IsPlaceholder_Src = access::placeholder::false_t,
2188  access::placeholder IsPlaceholder_Dst = access::placeholder::false_t>
2189  void copy(accessor<T_Src, Dims_Src, AccessMode_Src, AccessTarget_Src,
2190  IsPlaceholder_Src>
2191  Src,
2192  accessor<T_Dst, Dims_Dst, AccessMode_Dst, AccessTarget_Dst,
2193  IsPlaceholder_Dst>
2194  Dst) {
2195  throwIfActionIsCreated();
2196  static_assert(isValidTargetForExplicitOp(AccessTarget_Src),
2197  "Invalid source accessor target for the copy method.");
2198  static_assert(isValidTargetForExplicitOp(AccessTarget_Dst),
2199  "Invalid destination accessor target for the copy method.");
2200  static_assert(isValidModeForSourceAccessor(AccessMode_Src),
2201  "Invalid source accessor mode for the copy method.");
2202  static_assert(isValidModeForDestinationAccessor(AccessMode_Dst),
2203  "Invalid destination accessor mode for the copy method.");
2204  assert(Dst.get_size() >= Src.get_size() &&
2205  "The destination accessor does not fit the copied memory.");
2206  if (copyAccToAccHelper(Src, Dst))
2207  return;
2208  setType(detail::CG::CopyAccToAcc);
2209 
2210  detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src;
2211  detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc);
2212 
2213  detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst;
2214  detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst);
2215 
2216  MRequirements.push_back(AccImplSrc.get());
2217  MRequirements.push_back(AccImplDst.get());
2218  MSrcPtr = AccImplSrc.get();
2219  MDstPtr = AccImplDst.get();
2220  // Store copies of the accessors in local storage to make sure they stay
2221  // alive until we finish.
2222  MAccStorage.push_back(std::move(AccImplSrc));
2223  MAccStorage.push_back(std::move(AccImplDst));
2224  }
2225 
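 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Assumes buffers `BufA` and `BufB` of equal size and a queue `Q`.]
 //
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     auto Src = BufA.get_access<cl::sycl::access::mode::read>(CGH);
 //     auto Dst = BufB.get_access<cl::sycl::access::mode::write>(CGH);
 //     CGH.copy(Src, Dst);  // accessor-to-accessor explicit copy
 //   });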
2230  template <typename T, int Dims, access::mode AccessMode,
2231  access::target AccessTarget,
2232  access::placeholder IsPlaceholder = access::placeholder::false_t>
2233  void
2234  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
2235  throwIfActionIsCreated();
2236  static_assert(isValidTargetForExplicitOp(AccessTarget),
2237  "Invalid accessor target for the update_host method.");
2238  setType(detail::CG::UpdateHost);
2239 
2240  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
2241  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2242 
2243  MDstPtr = static_cast<void *>(AccImpl.get());
2244  MRequirements.push_back(AccImpl.get());
2245  MAccStorage.push_back(std::move(AccImpl));
2246  }
2247 
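 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Assumes a buffer `Buf` and a queue `Q`; once the command
 //  completes, the host copy of `Buf` is up to date.]
 //
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     auto Acc = Buf.get_access<cl::sycl::access::mode::read>(CGH);
 //     CGH.update_host(Acc);
 //   });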
2256  template <typename T, int Dims, access::mode AccessMode,
2257  access::target AccessTarget,
2258  access::placeholder IsPlaceholder = access::placeholder::false_t,
2259  typename PropertyListT = property_list>
2260  void
2261  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder, PropertyListT>
2262  Dst,
2263  const T &Pattern) {
2264  throwIfActionIsCreated();
2265  // TODO add check: T must be an integral scalar value or a SYCL vector type
2266  static_assert(isValidTargetForExplicitOp(AccessTarget),
2267  "Invalid accessor target for the fill method.");
2268  if (!MIsHost && (((Dims == 1) && isConstOrGlobal(AccessTarget)) ||
2269  isImageOrImageArray(AccessTarget))) {
2270  setType(detail::CG::Fill);
2271 
2272  detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
2273  detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
2274 
2275  MDstPtr = static_cast<void *>(AccImpl.get());
2276  MRequirements.push_back(AccImpl.get());
2277  MAccStorage.push_back(std::move(AccImpl));
2278 
2279  MPattern.resize(sizeof(T));
2280  auto PatternPtr = reinterpret_cast<T *>(MPattern.data());
2281  *PatternPtr = Pattern;
2282  } else {
2283 
2284  // TODO: Temporary implementation for host. Should be handled by memory
2285  // manager.
2286  range<Dims> Range = Dst.get_range();
2287  parallel_for<class __fill<T, Dims, AccessMode, AccessTarget,
2288  IsPlaceholder>>(Range, [=](id<Dims> Index) {
2289  Dst[Index] = Pattern;
2290  });
2291  }
2292  }
2293 
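 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Assumes an int buffer `Buf` and a queue `Q`.]
 //
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     auto Dst = Buf.get_access<cl::sycl::access::mode::write>(CGH);
 //     CGH.fill(Dst, 42);  // every element of Buf becomes 42
 //   });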
2300  template <typename T> void fill(void *Ptr, const T &Pattern, size_t Count) {
2301  throwIfActionIsCreated();
2302  static_assert(std::is_trivially_copyable<T>::value,
2303  "Pattern must be trivially copyable");
2304  parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2305  T *CastedPtr = static_cast<T *>(Ptr);
2306  CastedPtr[Index] = Pattern;
2307  });
2308  }
2309 
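 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Assumes a queue `Q` and a size `N`; uses USM device memory.]
 //
 //   int *Ptr = cl::sycl::malloc_device<int>(N, Q);
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     CGH.fill(Ptr, 7, N);  // write the pattern 7 into N ints
 //   }).wait();
 //   cl::sycl::free(Ptr, Q);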
2313  void ext_oneapi_barrier() {
2314  throwIfActionIsCreated();
2315  setType(detail::CG::Barrier);
2316  }
2317 
2321  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2322  void barrier() { ext_oneapi_barrier(); }
2323 
2330  void ext_oneapi_barrier(const std::vector<event> &WaitList);
2331 
2338  __SYCL2020_DEPRECATED("use 'ext_oneapi_barrier' instead")
2339  void barrier(const std::vector<event> &WaitList);
2340 
2350  void memcpy(void *Dest, const void *Src, size_t Count);
2351 
2361  template <typename T> void copy(const T *Src, T *Dest, size_t Count) {
2362  this->memcpy(Dest, Src, Count * sizeof(T));
2363  }
2364 
2373  void memset(void *Dest, int Value, size_t Count);
2374 
2381  void prefetch(const void *Ptr, size_t Count);
2382 
2389  void mem_advise(const void *Ptr, size_t Length, int Advice);
2390 
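 // [Illustrative usage sketch, added for exposition; not part of the original
 //  header. Assumes a queue `Q` and USM allocations `Src`/`Dst` of `N` ints;
 //  each submit is waited on to keep the operations ordered.]
 //
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     CGH.memset(Dst, 0, N * sizeof(int));      // byte-wise fill with 0
 //   }).wait();
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     CGH.memcpy(Dst, Src, N * sizeof(int));    // USM-to-USM copy
 //   }).wait();
 //   Q.submit([&](cl::sycl::handler &CGH) {
 //     CGH.prefetch(Dst, N * sizeof(int));       // hint: migrate to device
 //   }).wait();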
2391 private:
2392  std::shared_ptr<detail::queue_impl> MQueue;
2397  std::vector<std::vector<char>> MArgsStorage;
2398  std::vector<detail::AccessorImplPtr> MAccStorage;
2399  std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
2400  std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
2401  mutable std::vector<std::shared_ptr<const void>> MSharedPtrStorage;
2403  std::vector<detail::ArgDesc> MArgs;
2407  std::vector<detail::ArgDesc> MAssociatedAccesors;
2409  std::vector<detail::Requirement *> MRequirements;
2411  detail::NDRDescT MNDRDesc;
2412  std::string MKernelName;
2414  std::shared_ptr<detail::kernel_impl> MKernel;
2418  detail::CG::CGTYPE MCGType = detail::CG::None;
2420  void *MSrcPtr = nullptr;
2422  void *MDstPtr = nullptr;
2424  size_t MLength = 0;
2426  std::vector<char> MPattern;
2428  std::unique_ptr<detail::HostKernelBase> MHostKernel;
2430  std::unique_ptr<detail::HostTask> MHostTask;
2431  detail::OSModuleHandle MOSModuleHandle = detail::OSUtil::ExeModuleHandle;
2432  // Storage for a lambda or function when using InteropTasks
2433  std::unique_ptr<detail::InteropTask> MInteropTask;
2435  std::vector<detail::EventImplPtr> MEvents;
2438  std::vector<detail::EventImplPtr> MEventsWaitWithBarrier;
2439 
2440  bool MIsHost = false;
2441 
2442  detail::code_location MCodeLoc = {};
2443  bool MIsFinalized = false;
2444  event MLastEvent;
2445 
2446  // Make queue_impl class friend to be able to call finalize method.
2447  friend class detail::queue_impl;
2448  // Make accessor class friend to keep the list of associated accessors.
2449  template <typename DataT, int Dims, access::mode AccMode,
2450  access::target AccTarget, access::placeholder isPlaceholder,
2451  typename PropertyListT>
2452  friend class accessor;
2453  friend device detail::getDeviceFromHandler(handler &);
2454 
2455  template <typename DataT, int Dimensions, access::mode AccessMode,
2456  access::target AccessTarget, access::placeholder IsPlaceholder>
2457  friend class detail::image_accessor;
2458  // Make stream class friend to be able to keep the list of associated streams
2459  friend class stream;
2460  friend class detail::stream_impl;
2461  // Make reduction_impl friend to store buffers and arrays created for it
2462  // in handler from reduction_impl methods.
2463  template <typename T, class BinaryOperation, int Dims, bool IsUSM,
2464  access::placeholder IsPlaceholder>
2465  friend class ext::oneapi::detail::reduction_impl;
2466 
2467  // This method needs to call the method finalize().
2468  template <typename Reduction, typename... RestT>
2469  std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
2470  reduSaveFinalResultToUserMemHelper(
2471  std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2472  bool IsHost, Reduction &, RestT...);
2473 
2474  friend void detail::associateWithHandler(handler &,
2475  detail::AccessorBaseHost *,
2476  access::target);
2477 
2478  friend class ::MockHandler;
2479  friend class detail::queue_impl;
2480 
2481  bool DisableRangeRounding();
2482 
2483  bool RangeRoundingTrace();
2484 
2485  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
2486  size_t &MinRange);
2487 
2488  template <typename WrapperT, typename TransformedArgType, int Dims,
2489  typename KernelType>
2490  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
2491  range<Dims> NumWorkItems) {
2492  if constexpr (detail::isKernelLambdaCallableWithKernelHandler<
2493  KernelType, TransformedArgType>()) {
2494  return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
2495  KernelType>(NumWorkItems,
2496  KernelFunc);
2497  } else {
2498  return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>(
2499  NumWorkItems, KernelFunc);
2500  }
2501  }
2502 };
2503 } // namespace sycl
2504 } // __SYCL_INLINE_NAMESPACE(cl)