DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
queue.hpp
1 //==-------------------- queue.hpp - SYCL queue ----------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <CL/sycl/detail/assert_happened.hpp>
12 #include <CL/sycl/detail/backend_traits.hpp>
13 #include <CL/sycl/detail/common.hpp>
14 #include <CL/sycl/detail/export.hpp>
15 #include <CL/sycl/detail/service_kernel_names.hpp>
16 #include <CL/sycl/device.hpp>
17 #include <CL/sycl/device_selector.hpp>
18 #include <CL/sycl/event.hpp>
19 #include <CL/sycl/exception_list.hpp>
20 #include <CL/sycl/handler.hpp>
21 #include <CL/sycl/info/info_desc.hpp>
22 #include <CL/sycl/property_list.hpp>
23 #include <CL/sycl/stl.hpp>
24 
25 // Explicitly request format macros
26 #ifndef __STDC_FORMAT_MACROS
27 #define __STDC_FORMAT_MACROS 1
28 #endif
29 #include <cinttypes>
30 #include <utility>
31 
32 // Having _TWO_ mid-parameter #ifdefs makes the functions very difficult to
33 // read. Here we simplify the &CodeLoc declaration to be _CODELOCPARAM(&CodeLoc)
34 // and _CODELOCARG(&CodeLoc). Similarly, the KernelFunc parameter is simplified
35 // to be _KERNELFUNCPARAM(KernelFunc). Once the queue kernel functions are
36 // defined, these macros are #undef'd immediately.
37 
38 // replace _CODELOCPARAM(&CodeLoc) with nothing
39 // or : , const detail::code_location &CodeLoc =
40 // detail::code_location::current()
41 // replace _CODELOCARG(&CodeLoc) with nothing
42 // or : const detail::code_location &CodeLoc = {}
43 
44 #ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
45 #define _CODELOCONLYPARAM(a) \
46  const detail::code_location a = detail::code_location::current()
47 #define _CODELOCPARAM(a) \
48  , const detail::code_location a = detail::code_location::current()
49 
50 #define _CODELOCARG(a)
51 #define _CODELOCFW(a) , a
52 #else
53 #define _CODELOCONLYPARAM(a)
54 #define _CODELOCPARAM(a)
55 
56 #define _CODELOCARG(a) const detail::code_location a = {}
57 #define _CODELOCFW(a)
58 #endif
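// For example, with instrumentation metadata enabled (the first branch above),
// a declaration such as
//
//   template <typename T> event submit(T CGF _CODELOCPARAM(&CodeLoc));
//
// expands to
//
//   template <typename T>
//   event submit(T CGF, const detail::code_location &CodeLoc =
//                           detail::code_location::current());
//
// and _CODELOCARG(&CodeLoc) expands to nothing. With the metadata disabled,
// the extra parameter disappears and _CODELOCARG(&CodeLoc) instead declares a
// local `const detail::code_location &CodeLoc = {}` inside the function body.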
59 
60 // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
61 // or const KernelType &KernelFunc
62 #ifdef __SYCL_NONCONST_FUNCTOR__
63 #define _KERNELFUNCPARAM(a) KernelType a
64 #else
65 #define _KERNELFUNCPARAM(a) const KernelType &a
66 #endif
67 
68 // Helper macro to identify if fallback assert is needed
69 // FIXME remove __NVPTX__ condition once devicelib supports CUDA
70 #if !defined(SYCL_DISABLE_FALLBACK_ASSERT)
71 #define __SYCL_USE_FALLBACK_ASSERT 1
72 #else
73 #define __SYCL_USE_FALLBACK_ASSERT 0
74 #endif
75 
76 __SYCL_INLINE_NAMESPACE(cl) {
77 namespace sycl {
78 
79 // Forward declaration
80 class context;
81 class device;
82 class queue;
83 
84 namespace detail {
85 class queue_impl;
86 #if __SYCL_USE_FALLBACK_ASSERT
87 static event submitAssertCapture(queue &, event &, queue *,
88  const detail::code_location &);
89 #endif
90 } // namespace detail
91 
103 class __SYCL_EXPORT queue {
104 public:
109  explicit queue(const property_list &PropList = {})
110  : queue(default_selector(), async_handler{}, PropList) {}
111 
117  queue(const async_handler &AsyncHandler, const property_list &PropList = {})
118  : queue(default_selector(), AsyncHandler, PropList) {}
119 
125  queue(const device_selector &DeviceSelector,
126  const property_list &PropList = {})
127  : queue(DeviceSelector.select_device(), async_handler{}, PropList) {}
128 
135  queue(const device_selector &DeviceSelector,
136  const async_handler &AsyncHandler, const property_list &PropList = {})
137  : queue(DeviceSelector.select_device(), AsyncHandler, PropList) {}
138 
143  explicit queue(const device &SyclDevice, const property_list &PropList = {})
144  : queue(SyclDevice, async_handler{}, PropList) {}
145 
152  explicit queue(const device &SyclDevice, const async_handler &AsyncHandler,
153  const property_list &PropList = {});
154 
161  queue(const context &SyclContext, const device_selector &DeviceSelector,
162  const property_list &PropList = {});
163 
172  queue(const context &SyclContext, const device_selector &DeviceSelector,
173  const async_handler &AsyncHandler, const property_list &PropList = {});
174 
181  queue(const context &SyclContext, const device &SyclDevice,
182  const property_list &PropList = {});
183 
191  queue(const context &SyclContext, const device &SyclDevice,
192  const async_handler &AsyncHandler, const property_list &PropList = {});
193 
202 #ifdef __SYCL_INTERNAL_API
203  queue(cl_command_queue ClQueue, const context &SyclContext,
204  const async_handler &AsyncHandler = {});
205 #endif
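  /// Illustrative construction sketch (assumes <CL/sycl.hpp> is included; the
  /// gpu_selector, the rethrowing async handler and the in_order property are
  /// placeholders, not requirements):
  /// \code
  ///   sycl::queue Plain; // default_selector, no async_handler
  ///   sycl::queue Gpu{sycl::gpu_selector{},
  ///                   [](sycl::exception_list Errs) {
  ///                     for (auto &E : Errs)
  ///                       std::rethrow_exception(E);
  ///                   },
  ///                   {sycl::property::queue::in_order{}}};
  /// \endcode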
206 
207  queue(const queue &RHS) = default;
208 
209  queue(queue &&RHS) = default;
210 
211  queue &operator=(const queue &RHS) = default;
212 
213  queue &operator=(queue &&RHS) = default;
214 
215  bool operator==(const queue &RHS) const { return impl == RHS.impl; }
216 
217  bool operator!=(const queue &RHS) const { return !(*this == RHS); }
218 
221 #ifdef __SYCL_INTERNAL_API
222  cl_command_queue get() const;
223 #endif
224 
226  context get_context() const;
227 
229  device get_device() const;
230 
232  bool is_host() const;
233 
237  template <info::queue param>
238  typename info::param_traits<info::queue, param>::return_type get_info() const;
239 
240  // A shorthand for `get_device().has()' which is expected to be a bit quicker
241  // than the long version
242  bool device_has(aspect Aspect) const;
243 
244 public:
251  template <typename T> event submit(T CGF _CODELOCPARAM(&CodeLoc)) {
252  _CODELOCARG(&CodeLoc);
253 
254 #if __SYCL_USE_FALLBACK_ASSERT
255  if (!is_host()) {
256  auto PostProcess = [this, &CodeLoc](bool IsKernel, bool KernelUsesAssert,
257  event &E) {
258  if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) &&
259  KernelUsesAssert && !device_has(aspect::accelerator)) {
260  // __devicelib_assert_fail isn't supported by Device-side Runtime
261  // Linking against fallback impl of __devicelib_assert_fail is
262  // performed by program manager class
263  // Fallback assert isn't supported for FPGA
264  submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr,
265  CodeLoc);
266  }
267  };
268 
269  auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess);
270  return discard_or_return(Event);
271  } else
272 #endif // __SYCL_USE_FALLBACK_ASSERT
273  {
274  auto Event = submit_impl(CGF, CodeLoc);
275  return discard_or_return(Event);
276  }
277  }
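  /// Illustrative submit() sketch (Data and N are hypothetical; Data is
  /// assumed to be a USM device pointer obtained via sycl::malloc_device):
  /// \code
  ///   sycl::queue Q;
  ///   sycl::event E = Q.submit([&](sycl::handler &CGH) {
  ///     CGH.parallel_for(sycl::range<1>{N},
  ///                      [=](sycl::id<1> I) { Data[I] += 1; });
  ///   });
  ///   E.wait();
  /// \endcode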
278 
290  template <typename T>
291  event submit(T CGF, queue &SecondaryQueue _CODELOCPARAM(&CodeLoc)) {
292  _CODELOCARG(&CodeLoc);
293 
294 #if __SYCL_USE_FALLBACK_ASSERT
295  if (!is_host()) {
296  auto PostProcess = [this, &SecondaryQueue, &CodeLoc](
297  bool IsKernel, bool KernelUsesAssert, event &E) {
298  if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) &&
299  KernelUsesAssert && !device_has(aspect::accelerator)) {
300  // Only secondary queues on devices need to be added to the assert
301  // capture.
302  // TODO: Handle case where primary queue is host but the secondary
303  // queue is not.
304  queue *DeviceSecondaryQueue =
305  SecondaryQueue.is_host() ? nullptr : &SecondaryQueue;
306  // __devicelib_assert_fail isn't supported by Device-side Runtime
307  // Linking against fallback impl of __devicelib_assert_fail is
308  // performed by program manager class
309  // Fallback assert isn't supported for FPGA
310  submitAssertCapture(*this, E, DeviceSecondaryQueue, CodeLoc);
311  }
312  };
313 
314  auto Event = submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc,
315  PostProcess);
316  return discard_or_return(Event);
317  } else
318 #endif // __SYCL_USE_FALLBACK_ASSERT
319  {
320  auto Event = submit_impl(CGF, SecondaryQueue, CodeLoc);
321  return discard_or_return(Event);
322  }
323  }
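  /// Illustrative sketch of the secondary-queue overload: if the command group
  /// cannot be enqueued to this queue, the runtime may fall back to the
  /// secondary queue (both queues here are hypothetical):
  /// \code
  ///   sycl::queue Primary{sycl::gpu_selector{}};
  ///   sycl::queue Fallback{sycl::cpu_selector{}};
  ///   Primary.submit([&](sycl::handler &CGH) { /* commands */ }, Fallback);
  /// \endcode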
324 
332  event ext_oneapi_submit_barrier(_CODELOCONLYPARAM(&CodeLoc)) {
333  return submit(
334  [=](handler &CGH) { CGH.ext_oneapi_barrier(); } _CODELOCFW(CodeLoc));
335  }
336 
344  __SYCL2020_DEPRECATED("use 'ext_oneapi_submit_barrier' instead")
345  event submit_barrier(_CODELOCONLYPARAM(&CodeLoc)) {
346  _CODELOCARG(&CodeLoc);
347  return ext_oneapi_submit_barrier(CodeLoc);
348  }
349 
359  event ext_oneapi_submit_barrier(
360  const std::vector<event> &WaitList _CODELOCPARAM(&CodeLoc)) {
361  return submit([=](handler &CGH) {
362  CGH.ext_oneapi_barrier(WaitList);
363  } _CODELOCFW(CodeLoc));
364  }
365 
375  __SYCL2020_DEPRECATED("use 'ext_oneapi_submit_barrier' instead")
376  event
377  submit_barrier(const std::vector<event> &WaitList _CODELOCPARAM(&CodeLoc)) {
378  _CODELOCARG(&CodeLoc);
379  return ext_oneapi_submit_barrier(WaitList, CodeLoc);
380  }
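  /// Illustrative barrier sketch: block subsequently submitted commands until
  /// two previously obtained events complete (Q, E1 and E2 are hypothetical):
  /// \code
  ///   sycl::event Barrier = Q.ext_oneapi_submit_barrier({E1, E2});
  ///   Barrier.wait();
  /// \endcode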
381 
387  void wait(_CODELOCONLYPARAM(&CodeLoc)) {
388  _CODELOCARG(&CodeLoc);
389 
390  wait_proxy(CodeLoc);
391  }
392 
401  void wait_and_throw(_CODELOCONLYPARAM(&CodeLoc)) {
402  _CODELOCARG(&CodeLoc);
403 
404  wait_and_throw_proxy(CodeLoc);
405  }
406 
409  void wait_proxy(const detail::code_location &CodeLoc);
412  void wait_and_throw_proxy(const detail::code_location &CodeLoc);
413 
419  void throw_asynchronous();
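  /// Illustrative sketch: asynchronous errors are reported through the queue's
  /// async_handler when wait_and_throw() or throw_asynchronous() is called
  /// (the rethrowing handler below is just one possible choice):
  /// \code
  ///   sycl::queue Q{[](sycl::exception_list Errs) {
  ///     for (auto &E : Errs)
  ///       std::rethrow_exception(E);
  ///   }};
  ///   Q.submit([&](sycl::handler &CGH) { /* commands */ });
  ///   Q.wait_and_throw(); // blocks, then delivers any collected errors
  /// \endcode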
420 
423  template <typename PropertyT> bool has_property() const;
424 
428  template <typename PropertyT> PropertyT get_property() const;
429 
437  template <typename T> event fill(void *Ptr, const T &Pattern, size_t Count) {
438  return submit([&](handler &CGH) { CGH.fill<T>(Ptr, Pattern, Count); });
439  }
440 
449  template <typename T>
450  event fill(void *Ptr, const T &Pattern, size_t Count, event DepEvent) {
451  return submit([&](handler &CGH) {
452  CGH.depends_on(DepEvent);
453  CGH.fill<T>(Ptr, Pattern, Count);
454  });
455  }
456 
466  template <typename T>
467  event fill(void *Ptr, const T &Pattern, size_t Count,
468  const std::vector<event> &DepEvents) {
469  return submit([&](handler &CGH) {
470  CGH.depends_on(DepEvents);
471  CGH.fill<T>(Ptr, Pattern, Count);
472  });
473  }
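  /// Illustrative fill sketch over a USM device allocation (Q is a previously
  /// constructed queue; N is a hypothetical element count):
  /// \code
  ///   float *Buf = sycl::malloc_device<float>(N, Q);
  ///   Q.fill(Buf, 1.0f, N).wait(); // T deduced as float
  ///   sycl::free(Buf, Q);
  /// \endcode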
474 
484  event memset(void *Ptr, int Value, size_t Count);
485 
496  event memset(void *Ptr, int Value, size_t Count, event DepEvent);
497 
509  event memset(void *Ptr, int Value, size_t Count,
510  const std::vector<event> &DepEvents);
511 
522  event memcpy(void *Dest, const void *Src, size_t Count);
523 
535  event memcpy(void *Dest, const void *Src, size_t Count, event DepEvent);
536 
549  event memcpy(void *Dest, const void *Src, size_t Count,
550  const std::vector<event> &DepEvents);
551 
562  template <typename T> event copy(const T *Src, T *Dest, size_t Count) {
563  return this->memcpy(Dest, Src, Count * sizeof(T));
564  }
565 
577  template <typename T>
578  event copy(const T *Src, T *Dest, size_t Count, event DepEvent) {
579  return this->memcpy(Dest, Src, Count * sizeof(T), DepEvent);
580  }
581 
593  template <typename T>
594  event copy(const T *Src, T *Dest, size_t Count,
595  const std::vector<event> &DepEvents) {
596  return this->memcpy(Dest, Src, Count * sizeof(T), DepEvents);
597  }
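  /// Illustrative sketch contrasting byte-wise memcpy() with the typed copy()
  /// wrapper (Q is a previously constructed queue; Src and Dst are
  /// hypothetical USM pointers to N ints):
  /// \code
  ///   Q.memcpy(Dst, Src, N * sizeof(int)).wait(); // size in bytes
  ///   Q.copy(Src, Dst, N).wait();                 // count in elements
  /// \endcode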
598 
606  __SYCL2020_DEPRECATED("use the overload with int Advice instead")
607  event mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice);
608 
616  event mem_advise(const void *Ptr, size_t Length, int Advice);
617 
626  event mem_advise(const void *Ptr, size_t Length, int Advice, event DepEvent);
627 
637  event mem_advise(const void *Ptr, size_t Length, int Advice,
638  const std::vector<event> &DepEvents);
639 
647  event prefetch(const void *Ptr, size_t Count) {
648  return submit([=](handler &CGH) { CGH.prefetch(Ptr, Count); });
649  }
650 
659  event prefetch(const void *Ptr, size_t Count, event DepEvent) {
660  return submit([=](handler &CGH) {
661  CGH.depends_on(DepEvent);
662  CGH.prefetch(Ptr, Count);
663  });
664  }
665 
675  event prefetch(const void *Ptr, size_t Count,
676  const std::vector<event> &DepEvents) {
677  return submit([=](handler &CGH) {
678  CGH.depends_on(DepEvents);
679  CGH.prefetch(Ptr, Count);
680  });
681  }
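  /// Illustrative prefetch sketch for shared USM memory (Q is a previously
  /// constructed queue); mem_advise() advice values are backend-specific, so 0
  /// is used below purely as a placeholder:
  /// \code
  ///   int *Shared = sycl::malloc_shared<int>(N, Q);
  ///   Q.prefetch(Shared, N * sizeof(int));
  ///   Q.mem_advise(Shared, N * sizeof(int), 0);
  /// \endcode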
682 
687  template <typename KernelName = detail::auto_name, typename KernelType>
688  event single_task(_KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
689  static_assert(
690  (detail::check_fn_signature<detail::remove_reference_t<KernelType>,
691  void()>::value ||
692  detail::check_fn_signature<detail::remove_reference_t<KernelType>,
693  void(kernel_handler)>::value),
694  "sycl::queue.single_task() requires a kernel instead of command group. "
695  "Use queue.submit() instead");
696  _CODELOCARG(&CodeLoc);
697  return submit(
698  [&](handler &CGH) {
699  CGH.template single_task<KernelName, KernelType>(KernelFunc);
700  },
701  CodeLoc);
702  }
703 
709  template <typename KernelName = detail::auto_name, typename KernelType>
710  event single_task(event DepEvent,
711  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
712  static_assert(
713  (detail::check_fn_signature<detail::remove_reference_t<KernelType>,
714  void()>::value ||
715  detail::check_fn_signature<detail::remove_reference_t<KernelType>,
716  void(kernel_handler)>::value),
717  "sycl::queue.single_task() requires a kernel instead of command group. "
718  "Use queue.submit() instead");
719  _CODELOCARG(&CodeLoc);
720  return submit(
721  [&](handler &CGH) {
722  CGH.depends_on(DepEvent);
723  CGH.template single_task<KernelName, KernelType>(KernelFunc);
724  },
725  CodeLoc);
726  }
727 
734  template <typename KernelName = detail::auto_name, typename KernelType>
735  event single_task(const std::vector<event> &DepEvents,
736  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
737  static_assert(
738  (detail::check_fn_signature<detail::remove_reference_t<KernelType>,
739  void()>::value ||
740  detail::check_fn_signature<detail::remove_reference_t<KernelType>,
741  void(kernel_handler)>::value),
742  "sycl::queue.single_task() requires a kernel instead of command group. "
743  "Use queue.submit() instead");
744  _CODELOCARG(&CodeLoc);
745  return submit(
746  [&](handler &CGH) {
747  CGH.depends_on(DepEvents);
748  CGH.template single_task<KernelName, KernelType>(KernelFunc);
749  },
750  CodeLoc);
751  }
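  /// Illustrative single_task sketch: the kernel must be invocable as void()
  /// or void(kernel_handler), as the static_asserts above require (Q is a
  /// previously constructed queue; Result is a hypothetical USM pointer):
  /// \code
  ///   Q.single_task([=] { *Result = 42; }).wait();
  /// \endcode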
752 
759  template <typename KernelName = detail::auto_name, typename KernelType>
760  event parallel_for(range<1> NumWorkItems,
761  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
762  _CODELOCARG(&CodeLoc);
763  return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
764  }
765 
772  template <typename KernelName = detail::auto_name, typename KernelType>
773  event parallel_for(range<2> NumWorkItems,
774  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
775  _CODELOCARG(&CodeLoc);
776  return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
777  }
778 
785  template <typename KernelName = detail::auto_name, typename KernelType>
786  event parallel_for(range<3> NumWorkItems,
787  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
788  _CODELOCARG(&CodeLoc);
789  return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
790  }
791 
799  template <typename KernelName = detail::auto_name, typename KernelType>
800  event parallel_for(range<1> NumWorkItems, event DepEvent,
801  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
802  _CODELOCARG(&CodeLoc);
803  return parallel_for_impl<KernelName>(NumWorkItems, DepEvent, KernelFunc,
804  CodeLoc);
805  }
806 
814  template <typename KernelName = detail::auto_name, typename KernelType>
815  event parallel_for(range<2> NumWorkItems, event DepEvent,
816  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
817  _CODELOCARG(&CodeLoc);
818  return parallel_for_impl<KernelName>(NumWorkItems, DepEvent, KernelFunc,
819  CodeLoc);
820  }
821 
829  template <typename KernelName = detail::auto_name, typename KernelType>
830  event parallel_for(range<3> NumWorkItems, event DepEvent,
831  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
832  _CODELOCARG(&CodeLoc);
833  return parallel_for_impl<KernelName>(NumWorkItems, DepEvent, KernelFunc,
834  CodeLoc);
835  }
836 
845  template <typename KernelName = detail::auto_name, typename KernelType>
846  event parallel_for(range<1> NumWorkItems, const std::vector<event> &DepEvents,
847  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
848  _CODELOCARG(&CodeLoc);
849  return parallel_for_impl<KernelName>(NumWorkItems, DepEvents, KernelFunc,
850  CodeLoc);
851  }
852 
861  template <typename KernelName = detail::auto_name, typename KernelType>
862  event parallel_for(range<2> NumWorkItems, const std::vector<event> &DepEvents,
863  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
864  _CODELOCARG(&CodeLoc);
865  return parallel_for_impl<KernelName>(NumWorkItems, DepEvents, KernelFunc,
866  CodeLoc);
867  }
868 
877  template <typename KernelName = detail::auto_name, typename KernelType>
878  event parallel_for(range<3> NumWorkItems, const std::vector<event> &DepEvents,
879  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
880  _CODELOCARG(&CodeLoc);
881  return parallel_for_impl<KernelName>(NumWorkItems, DepEvents, KernelFunc,
882  CodeLoc);
883  }
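  /// Illustrative range-based parallel_for sketch chained on a dependent event
  /// (Q is a previously constructed queue; A and B are hypothetical USM int
  /// pointers of length N):
  /// \code
  ///   sycl::event Init = Q.fill(A, 1, N);
  ///   Q.parallel_for(sycl::range<1>{N}, Init,
  ///                  [=](sycl::id<1> I) { B[I] = A[I] * 2; })
  ///       .wait();
  /// \endcode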
884 
892  template <typename KernelName = detail::auto_name, typename KernelType,
893  int Dims>
894  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
895  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
896  _CODELOCARG(&CodeLoc);
897  return submit(
898  [&](handler &CGH) {
899  CGH.template parallel_for<KernelName, KernelType>(
900  NumWorkItems, WorkItemOffset, KernelFunc);
901  },
902  CodeLoc);
903  }
904 
913  template <typename KernelName = detail::auto_name, typename KernelType,
914  int Dims>
915  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
916  event DepEvent,
917  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
918  _CODELOCARG(&CodeLoc);
919  return submit(
920  [&](handler &CGH) {
921  CGH.depends_on(DepEvent);
922  CGH.template parallel_for<KernelName, KernelType>(
923  NumWorkItems, WorkItemOffset, KernelFunc);
924  },
925  CodeLoc);
926  }
927 
937  template <typename KernelName = detail::auto_name, typename KernelType,
938  int Dims>
939  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
940  const std::vector<event> &DepEvents,
941  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
942  _CODELOCARG(&CodeLoc);
943  return submit(
944  [&](handler &CGH) {
945  CGH.depends_on(DepEvents);
946  CGH.template parallel_for<KernelName, KernelType>(
947  NumWorkItems, WorkItemOffset, KernelFunc);
948  },
949  CodeLoc);
950  }
951 
959  template <typename KernelName = detail::auto_name, typename KernelType,
960  int Dims>
961  event parallel_for(nd_range<Dims> ExecutionRange,
962  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
963  _CODELOCARG(&CodeLoc);
964  return submit(
965  [&](handler &CGH) {
966  CGH.template parallel_for<KernelName, KernelType>(ExecutionRange,
967  KernelFunc);
968  },
969  CodeLoc);
970  }
971 
980  template <typename KernelName = detail::auto_name, typename KernelType,
981  int Dims>
982  event parallel_for(nd_range<Dims> ExecutionRange, event DepEvent,
983  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
984  _CODELOCARG(&CodeLoc);
985  return submit(
986  [&](handler &CGH) {
987  CGH.depends_on(DepEvent);
988  CGH.template parallel_for<KernelName, KernelType>(ExecutionRange,
989  KernelFunc);
990  },
991  CodeLoc);
992  }
993 
1003  template <typename KernelName = detail::auto_name, typename KernelType,
1004  int Dims>
1005  event parallel_for(nd_range<Dims> ExecutionRange,
1006  const std::vector<event> &DepEvents,
1007  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
1008  _CODELOCARG(&CodeLoc);
1009  return submit(
1010  [&](handler &CGH) {
1011  CGH.depends_on(DepEvents);
1012  CGH.template parallel_for<KernelName, KernelType>(ExecutionRange,
1013  KernelFunc);
1014  },
1015  CodeLoc);
1016  }
1017 
1026  template <typename KernelName = detail::auto_name, typename KernelType,
1027  int Dims, typename Reduction>
1028  event parallel_for(nd_range<Dims> ExecutionRange, Reduction Redu,
1029  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
1030  _CODELOCARG(&CodeLoc);
1031  return submit(
1032  [&](handler &CGH) {
1033  CGH.template parallel_for<KernelName, KernelType, Dims, Reduction>(
1034  ExecutionRange, Redu, KernelFunc);
1035  },
1036  CodeLoc);
1037  }
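  /// Illustrative nd_range sketch: 1024 work-items in work-groups of 256, with
  /// both global and local ids available through the nd_item (Q is a
  /// previously constructed queue; Data is a hypothetical USM pointer):
  /// \code
  ///   Q.parallel_for(sycl::nd_range<1>{sycl::range<1>{1024},
  ///                                    sycl::range<1>{256}},
  ///                  [=](sycl::nd_item<1> It) {
  ///                    Data[It.get_global_id(0)] = It.get_local_id(0);
  ///                  })
  ///       .wait();
  /// \endcode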
1038 
1039 // Clean up CODELOC and KERNELFUNC macros.
1040 #undef _CODELOCPARAM
1041 #undef _CODELOCONLYPARAM
1042 #undef _CODELOCARG
1043 #undef _CODELOCFW
1044 #undef _KERNELFUNCPARAM
1045 
1049  bool is_in_order() const;
1050 
1054  backend get_backend() const noexcept;
1055 
1059  template <backend Backend>
1060  __SYCL_DEPRECATED("Use SYCL 2020 sycl::get_native free function")
1061  backend_return_t<Backend, queue> get_native() const {
1062  return reinterpret_cast<backend_return_t<Backend, queue>>(getNative());
1063  }
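  /// Illustrative interop sketch using the sycl::get_native free function that
  /// the deprecation message above recommends (assumes Q is a previously
  /// constructed queue running on the OpenCL backend):
  /// \code
  ///   cl_command_queue Native = sycl::get_native<sycl::backend::opencl>(Q);
  /// \endcode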
1064 
1065 private:
1066  pi_native_handle getNative() const;
1067 
1068  std::shared_ptr<detail::queue_impl> impl;
1069  queue(std::shared_ptr<detail::queue_impl> impl) : impl(impl) {}
1070 
1071  template <class Obj>
1072  friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject);
1073  template <class T>
1074  friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj);
1075 
1076 #if __SYCL_USE_FALLBACK_ASSERT
1077  friend event detail::submitAssertCapture(queue &, event &, queue *,
1078  const detail::code_location &);
1079 #endif
1080 
1082  event submit_impl(std::function<void(handler &)> CGH,
1083  const detail::code_location &CodeLoc);
1085  event submit_impl(std::function<void(handler &)> CGH, queue secondQueue,
1086  const detail::code_location &CodeLoc);
1087 
1090  event discard_or_return(const event &Event);
1091 
1092  // Function to postprocess submitted command
1093  // Arguments:
1094  // bool IsKernel - true if the submitted command was kernel, false otherwise
1095  // bool KernelUsesAssert - true if submitted kernel uses assert, only
1096  // meaningful when IsKernel is true
1097  // event &Event - event after which post processing should be executed
1098  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
1099 
1105  event submit_impl_and_postprocess(std::function<void(handler &)> CGH,
1106  const detail::code_location &CodeLoc,
1107  const SubmitPostProcessF &PostProcess);
1114  event submit_impl_and_postprocess(std::function<void(handler &)> CGH,
1115  queue secondQueue,
1116  const detail::code_location &CodeLoc,
1117  const SubmitPostProcessF &PostProcess);
1118 
1125  template <typename KernelName = detail::auto_name, typename KernelType,
1126  int Dims>
1127  event parallel_for_impl(
1128  range<Dims> NumWorkItems, KernelType KernelFunc,
1129  const detail::code_location &CodeLoc = detail::code_location::current()) {
1130  return submit(
1131  [&](handler &CGH) {
1132  CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
1133  KernelFunc);
1134  },
1135  CodeLoc);
1136  }
1137 
1145  template <typename KernelName = detail::auto_name, typename KernelType,
1146  int Dims>
1147  event parallel_for_impl(range<Dims> NumWorkItems, event DepEvent,
1148  KernelType KernelFunc,
1149  const detail::code_location &CodeLoc) {
1150  return submit(
1151  [&](handler &CGH) {
1152  CGH.depends_on(DepEvent);
1153  CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
1154  KernelFunc);
1155  },
1156  CodeLoc);
1157  }
1158 
1167  template <typename KernelName = detail::auto_name, typename KernelType,
1168  int Dims>
1169  event parallel_for_impl(range<Dims> NumWorkItems,
1170  const std::vector<event> &DepEvents,
1171  KernelType KernelFunc,
1172  const detail::code_location &CodeLoc) {
1173  return submit(
1174  [&](handler &CGH) {
1175  CGH.depends_on(DepEvents);
1176  CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
1177  KernelFunc);
1178  },
1179  CodeLoc);
1180  }
1181 
1182  buffer<detail::AssertHappened, 1> &getAssertHappenedBuffer();
1183 };
1184 
1185 namespace detail {
1186 #if __SYCL_USE_FALLBACK_ASSERT
1187 #define __SYCL_ASSERT_START 1
1188 
1199 event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
1200  const detail::code_location &CodeLoc) {
1201  using AHBufT = buffer<detail::AssertHappened, 1>;
1202 
1203  AHBufT &Buffer = Self.getAssertHappenedBuffer();
1204 
1205  event CopierEv, CheckerEv, PostCheckerEv;
1206  auto CopierCGF = [&](handler &CGH) {
1207  CGH.depends_on(Event);
1208 
1209  auto Acc = Buffer.get_access<access::mode::write>(CGH);
1210 
1211  CGH.single_task<__sycl_service_kernel__::AssertInfoCopier>([Acc] {
1212 #if defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
1213  __devicelib_assert_read(&Acc[0]);
1214 #else
1215  (void)Acc;
1216 #endif // defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
1217  });
1218  };
1219  auto CheckerCGF = [&CopierEv, &Buffer](handler &CGH) {
1220  CGH.depends_on(CopierEv);
1221  using mode = access::mode;
1222  using target = access::target;
1223 
1224  auto Acc = Buffer.get_access<mode::read, target::host_buffer>(CGH);
1225 
1226  CGH.host_task([=] {
1227  const detail::AssertHappened *AH = &Acc[0];
1228 
1229  // Don't use assert here as msvc will insert reference to __imp__wassert
1230  // which won't be properly resolved in separate compile use-case
1231 #ifndef NDEBUG
1232  if (AH->Flag == __SYCL_ASSERT_START)
1233  throw sycl::runtime_error(
1234  "Internal Error. Invalid value in assert description.",
1235  PI_INVALID_VALUE);
1236 #endif
1237 
1238  if (AH->Flag) {
1239  const char *Expr = AH->Expr[0] ? AH->Expr : "<unknown expr>";
1240  const char *File = AH->File[0] ? AH->File : "<unknown file>";
1241  const char *Func = AH->Func[0] ? AH->Func : "<unknown func>";
1242 
1243  fprintf(stderr,
1244  "%s:%d: %s: global id: [%" PRIu64 ",%" PRIu64 ",%" PRIu64
1245  "], local id: [%" PRIu64 ",%" PRIu64 ",%" PRIu64 "] "
1246  "Assertion `%s` failed.\n",
1247  File, AH->Line, Func, AH->GID0, AH->GID1, AH->GID2, AH->LID0,
1248  AH->LID1, AH->LID2, Expr);
1249  abort(); // no need to release memory as it's abort anyway
1250  }
1251  });
1252  };
1253 
1254  if (SecondaryQueue) {
1255  CopierEv = Self.submit_impl(CopierCGF, *SecondaryQueue, CodeLoc);
1256  CheckerEv = Self.submit_impl(CheckerCGF, *SecondaryQueue, CodeLoc);
1257  } else {
1258  CopierEv = Self.submit_impl(CopierCGF, CodeLoc);
1259  CheckerEv = Self.submit_impl(CheckerCGF, CodeLoc);
1260  }
1261 
1262  return CheckerEv;
1263 }
1264 #undef __SYCL_ASSERT_START
1265 #endif // __SYCL_USE_FALLBACK_ASSERT
1266 } // namespace detail
1267 
1268 } // namespace sycl
1269 } // __SYCL_INLINE_NAMESPACE(cl)
1270 
1271 namespace std {
1272 template <> struct hash<cl::sycl::queue> {
1273  size_t operator()(const cl::sycl::queue &Q) const {
1274  return std::hash<std::shared_ptr<cl::sycl::detail::queue_impl>>()(
1275  cl::sycl::detail::getSyclObjImpl(Q));
1276  }
1277 };
1278 } // namespace std
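// Illustrative sketch: the std::hash specialization above (together with
// queue::operator==) lets queues serve as keys in unordered containers
// (Q is a previously constructed queue):
//
//   std::unordered_map<sycl::queue, int> SubmissionCount;
//   ++SubmissionCount[Q];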
1279 
1280 #undef __SYCL_USE_FALLBACK_ASSERT