DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
queue.hpp
1 //==-------------------- queue.hpp - SYCL queue ----------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
16 #include <CL/sycl/device.hpp>
18 #include <CL/sycl/event.hpp>
20 #include <CL/sycl/handler.hpp>
23 #include <CL/sycl/stl.hpp>
24 
25 #include <inttypes.h>
26 #include <utility>
27 
28 // Having _TWO_ mid-parameter #ifdefs makes these functions very difficult to
29 // read. Here we simplify the &CodeLoc declaration to _CODELOCPARAM(&CodeLoc)
30 // and _CODELOCARG(&CodeLoc). Similarly, the KernelFunc parameter is simplified
31 // to _KERNELFUNCPARAM(KernelFunc). Once the queue kernel functions are
32 // defined, these macros are #undef'd immediately.
33 
34 // _CODELOCPARAM(&CodeLoc) expands either to nothing
35 // or to: , const detail::code_location &CodeLoc =
36 //             detail::code_location::current()
37 // _CODELOCARG(&CodeLoc) expands either to nothing
38 // or to: const detail::code_location &CodeLoc = {}
39 
40 #ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
41 #define _CODELOCONLYPARAM(a) \
42  const detail::code_location a = detail::code_location::current()
43 #define _CODELOCPARAM(a) \
44  , const detail::code_location a = detail::code_location::current()
45 
46 #define _CODELOCARG(a)
47 #define _CODELOCFW(a) , a
48 #else
49 #define _CODELOCONLYPARAM(a)
50 #define _CODELOCPARAM(a)
51 
52 #define _CODELOCARG(a) const detail::code_location a = {}
53 #define _CODELOCFW(a)
54 #endif
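// Editorial note (illustrative, not part of the original queue.hpp): with
// instrumentation metadata enabled, a declaration such as
//
//   event submit(T CGF _CODELOCPARAM(&CodeLoc));
//
// expands to
//
//   event submit(T CGF, const detail::code_location &CodeLoc =
//                           detail::code_location::current());
//
// and _CODELOCARG(&CodeLoc) in the function body expands to nothing. When
// DISABLE_SYCL_INSTRUMENTATION_METADATA is defined, the extra parameter
// disappears and _CODELOCARG(&CodeLoc) instead declares
// `const detail::code_location &CodeLoc = {}` inside the body, so the same
// function definitions compile in both configurations.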
55 
56 // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
57 // or const KernelType &KernelFunc
58 #ifdef __SYCL_NONCONST_FUNCTOR__
59 #define _KERNELFUNCPARAM(a) KernelType a
60 #else
61 #define _KERNELFUNCPARAM(a) const KernelType &a
62 #endif
63 
64 // Helper macro to identify if fallback assert is needed
65 // FIXME remove __NVPTX__ condition once devicelib supports CUDA
66 #if !defined(SYCL_DISABLE_FALLBACK_ASSERT) && !defined(__NVPTX__)
67 #define __SYCL_USE_FALLBACK_ASSERT 1
68 #else
69 #define __SYCL_USE_FALLBACK_ASSERT 0
70 #endif
71 
72 __SYCL_INLINE_NAMESPACE(cl) {
73 namespace sycl {
74 
75 // Forward declaration
76 class context;
77 class device;
78 class queue;
79 
80 namespace detail {
81 class queue_impl;
82 #if __SYCL_USE_FALLBACK_ASSERT
83 static event submitAssertCapture(queue &, event &, queue *,
84  const detail::code_location &);
85 #endif
86 } // namespace detail
87 
99 class __SYCL_EXPORT queue {
100 public:
105  explicit queue(const property_list &PropList = {})
106  : queue(default_selector(), async_handler{}, PropList) {}
107 
113  queue(const async_handler &AsyncHandler, const property_list &PropList = {})
114  : queue(default_selector(), AsyncHandler, PropList) {}
115 
121  queue(const device_selector &DeviceSelector,
122  const property_list &PropList = {})
123  : queue(DeviceSelector.select_device(), async_handler{}, PropList) {}
124 
131  queue(const device_selector &DeviceSelector,
132  const async_handler &AsyncHandler, const property_list &PropList = {})
133  : queue(DeviceSelector.select_device(), AsyncHandler, PropList) {}
134 
139  explicit queue(const device &SyclDevice, const property_list &PropList = {})
140  : queue(SyclDevice, async_handler{}, PropList) {}
141 
148  explicit queue(const device &SyclDevice, const async_handler &AsyncHandler,
149  const property_list &PropList = {});
150 
157  queue(const context &SyclContext, const device_selector &DeviceSelector,
158  const property_list &PropList = {});
159 
168  queue(const context &SyclContext, const device_selector &DeviceSelector,
169  const async_handler &AsyncHandler, const property_list &PropList = {});
170 
177  queue(const context &SyclContext, const device &SyclDevice,
178  const property_list &PropList = {});
179 
187  queue(const context &SyclContext, const device &SyclDevice,
188  const async_handler &AsyncHandler, const property_list &PropList = {});
189 
198  __SYCL2020_DEPRECATED("OpenCL interop APIs are deprecated")
199  queue(cl_command_queue ClQueue, const context &SyclContext,
200  const async_handler &AsyncHandler = {});
201 
202  queue(const queue &RHS) = default;
203 
204  queue(queue &&RHS) = default;
205 
206  queue &operator=(const queue &RHS) = default;
207 
208  queue &operator=(queue &&RHS) = default;
209 
210  bool operator==(const queue &RHS) const { return impl == RHS.impl; }
211 
212  bool operator!=(const queue &RHS) const { return !(*this == RHS); }
213 
216  __SYCL2020_DEPRECATED("OpenCL interop APIs are deprecated")
217  cl_command_queue get() const;
218 
220  context get_context() const;
221 
223  device get_device() const;
224 
226  bool is_host() const;
227 
231  template <info::queue param>
232  typename info::param_traits<info::queue, param>::return_type get_info() const;
233 
234  // A shorthand for `get_device().has()' which is expected to be a bit quicker
235  // than the long version
236  bool device_has(aspect Aspect) const;
237 
238 public:
245  template <typename T> event submit(T CGF _CODELOCPARAM(&CodeLoc)) {
246  _CODELOCARG(&CodeLoc);
247 
248  event Event;
249 
250 #if __SYCL_USE_FALLBACK_ASSERT
251  if (!is_host()) {
252  auto PostProcess = [this, &CodeLoc](bool IsKernel, bool KernelUsesAssert,
253  event &E) {
254  if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) &&
255  KernelUsesAssert) {
256  // __devicelib_assert_fail isn't supported by Device-side Runtime
257  // Linking against fallback impl of __devicelib_assert_fail is
258  // performed by program manager class
259  submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr,
260  CodeLoc);
261  }
262  };
263 
264  Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess);
265  } else
266 #endif // __SYCL_USE_FALLBACK_ASSERT
267  {
268  Event = submit_impl(CGF, CodeLoc);
269  }
270 
271  return Event;
272  }
273 
285  template <typename T>
286  event submit(T CGF, queue &SecondaryQueue _CODELOCPARAM(&CodeLoc)) {
287  _CODELOCARG(&CodeLoc);
288 
289  event Event;
290 
291 #if __SYCL_USE_FALLBACK_ASSERT
292  auto PostProcess = [this, &SecondaryQueue, &CodeLoc](
293  bool IsKernel, bool KernelUsesAssert, event &E) {
294  if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) &&
295  KernelUsesAssert) {
296  // __devicelib_assert_fail isn't supported by Device-side Runtime
297  // Linking against fallback impl of __devicelib_assert_fail is performed
298  // by program manager class
299  submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr, CodeLoc);
300  }
301  };
302 
303  Event =
304  submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, PostProcess);
305 #else
306  Event = submit_impl(CGF, SecondaryQueue, CodeLoc);
307 #endif // __SYCL_USE_FALLBACK_ASSERT
308 
309  return Event;
310  }
311 
319  event ext_oneapi_submit_barrier(_CODELOCONLYPARAM(&CodeLoc)) {
320  return submit(
321  [=](handler &CGH) { CGH.ext_oneapi_barrier(); } _CODELOCFW(CodeLoc));
322  }
323 
331  __SYCL2020_DEPRECATED("use 'ext_oneapi_submit_barrier' instead")
332  event submit_barrier(_CODELOCONLYPARAM(&CodeLoc)) {
333  _CODELOCARG(&CodeLoc);
334  return ext_oneapi_submit_barrier(CodeLoc);
335  }
336 
346  event ext_oneapi_submit_barrier(
347  const std::vector<event> &WaitList _CODELOCPARAM(&CodeLoc)) {
348  return submit([=](handler &CGH) {
349  CGH.ext_oneapi_barrier(WaitList);
350  } _CODELOCFW(CodeLoc));
351  }
352 
362  __SYCL2020_DEPRECATED("use 'ext_oneapi_submit_barrier' instead")
363  event
364  submit_barrier(const std::vector<event> &WaitList _CODELOCPARAM(&CodeLoc)) {
365  _CODELOCARG(&CodeLoc);
366  return ext_oneapi_submit_barrier(WaitList, CodeLoc);
367  }
368 
374  void wait(_CODELOCONLYPARAM(&CodeLoc)) {
375  _CODELOCARG(&CodeLoc);
376 
377  wait_proxy(CodeLoc);
378  }
379 
388  void wait_and_throw(_CODELOCONLYPARAM(&CodeLoc)) {
389  _CODELOCARG(&CodeLoc);
390 
391  wait_and_throw_proxy(CodeLoc);
392  }
393 
396  void wait_proxy(const detail::code_location &CodeLoc);
399  void wait_and_throw_proxy(const detail::code_location &CodeLoc);
400 
406  void throw_asynchronous();
407 
410  template <typename PropertyT> bool has_property() const;
411 
415  template <typename PropertyT> PropertyT get_property() const;
416 
424  template <typename T> event fill(void *Ptr, const T &Pattern, size_t Count) {
425  return submit([&](handler &CGH) { CGH.fill<T>(Ptr, Pattern, Count); });
426  }
427 
436  template <typename T>
437  event fill(void *Ptr, const T &Pattern, size_t Count, event DepEvent) {
438  return submit([&](handler &CGH) {
439  CGH.depends_on(DepEvent);
440  CGH.fill<T>(Ptr, Pattern, Count);
441  });
442  }
443 
453  template <typename T>
454  event fill(void *Ptr, const T &Pattern, size_t Count,
455  const std::vector<event> &DepEvents) {
456  return submit([&](handler &CGH) {
457  CGH.depends_on(DepEvents);
458  CGH.fill<T>(Ptr, Pattern, Count);
459  });
460  }
461 
471  event memset(void *Ptr, int Value, size_t Count);
472 
483  event memset(void *Ptr, int Value, size_t Count, event DepEvent);
484 
496  event memset(void *Ptr, int Value, size_t Count,
497  const std::vector<event> &DepEvents);
498 
509  event memcpy(void *Dest, const void *Src, size_t Count);
510 
522  event memcpy(void *Dest, const void *Src, size_t Count, event DepEvent);
523 
536  event memcpy(void *Dest, const void *Src, size_t Count,
537  const std::vector<event> &DepEvents);
538 
549  template <typename T> event copy(const T *Src, T *Dest, size_t Count) {
550  return this->memcpy(Dest, Src, Count * sizeof(T));
551  }
552 
564  template <typename T>
565  event copy(const T *Src, T *Dest, size_t Count, event DepEvent) {
566  return this->memcpy(Dest, Src, Count * sizeof(T), DepEvent);
567  }
568 
580  template <typename T>
581  event copy(const T *Src, T *Dest, size_t Count,
582  const std::vector<event> &DepEvents) {
583  return this->memcpy(Dest, Src, Count * sizeof(T), DepEvents);
584  }
585 
593  __SYCL2020_DEPRECATED("use the overload with int Advice instead")
594  event mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice);
595 
603  event mem_advise(const void *Ptr, size_t Length, int Advice);
604 
613  event mem_advise(const void *Ptr, size_t Length, int Advice, event DepEvent);
614 
624  event mem_advise(const void *Ptr, size_t Length, int Advice,
625  const std::vector<event> &DepEvents);
626 
634  event prefetch(const void *Ptr, size_t Count) {
635  return submit([=](handler &CGH) { CGH.prefetch(Ptr, Count); });
636  }
637 
646  event prefetch(const void *Ptr, size_t Count, event DepEvent) {
647  return submit([=](handler &CGH) {
648  CGH.depends_on(DepEvent);
649  CGH.prefetch(Ptr, Count);
650  });
651  }
652 
662  event prefetch(const void *Ptr, size_t Count,
663  const std::vector<event> &DepEvents) {
664  return submit([=](handler &CGH) {
665  CGH.depends_on(DepEvents);
666  CGH.prefetch(Ptr, Count);
667  });
668  }
669 
674  template <typename KernelName = detail::auto_name, typename KernelType>
675  event single_task(_KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
676  _CODELOCARG(&CodeLoc);
677 
678  return submit(
679  [&](handler &CGH) {
680  CGH.template single_task<KernelName, KernelType>(KernelFunc);
681  },
682  CodeLoc);
683  }
684 
690  template <typename KernelName = detail::auto_name, typename KernelType>
691  event single_task(event DepEvent,
692  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
693  _CODELOCARG(&CodeLoc);
694  return submit(
695  [&](handler &CGH) {
696  CGH.depends_on(DepEvent);
697  CGH.template single_task<KernelName, KernelType>(KernelFunc);
698  },
699  CodeLoc);
700  }
701 
708  template <typename KernelName = detail::auto_name, typename KernelType>
709  event single_task(const std::vector<event> &DepEvents,
710  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
711  _CODELOCARG(&CodeLoc);
712  return submit(
713  [&](handler &CGH) {
714  CGH.depends_on(DepEvents);
715  CGH.template single_task<KernelName, KernelType>(KernelFunc);
716  },
717  CodeLoc);
718  }
719 
726  template <typename KernelName = detail::auto_name, typename KernelType>
727  event parallel_for(range<1> NumWorkItems,
728  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
729  _CODELOCARG(&CodeLoc);
730  return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
731  }
732 
739  template <typename KernelName = detail::auto_name, typename KernelType>
740  event parallel_for(range<2> NumWorkItems,
741  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
742  _CODELOCARG(&CodeLoc);
743  return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
744  }
745 
752  template <typename KernelName = detail::auto_name, typename KernelType>
753  event parallel_for(range<3> NumWorkItems,
754  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
755  _CODELOCARG(&CodeLoc);
756  return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
757  }
758 
766  template <typename KernelName = detail::auto_name, typename KernelType>
767  event parallel_for(range<1> NumWorkItems, event DepEvent,
768  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
769  _CODELOCARG(&CodeLoc);
770  return parallel_for_impl<KernelName>(NumWorkItems, DepEvent, KernelFunc,
771  CodeLoc);
772  }
773 
781  template <typename KernelName = detail::auto_name, typename KernelType>
782  event parallel_for(range<2> NumWorkItems, event DepEvent,
783  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
784  _CODELOCARG(&CodeLoc);
785  return parallel_for_impl<KernelName>(NumWorkItems, DepEvent, KernelFunc,
786  CodeLoc);
787  }
788 
796  template <typename KernelName = detail::auto_name, typename KernelType>
797  event parallel_for(range<3> NumWorkItems, event DepEvent,
798  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
799  _CODELOCARG(&CodeLoc);
800  return parallel_for_impl<KernelName>(NumWorkItems, DepEvent, KernelFunc,
801  CodeLoc);
802  }
803 
812  template <typename KernelName = detail::auto_name, typename KernelType>
813  event parallel_for(range<1> NumWorkItems, const std::vector<event> &DepEvents,
814  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
815  _CODELOCARG(&CodeLoc);
816  return parallel_for_impl<KernelName>(NumWorkItems, DepEvents, KernelFunc,
817  CodeLoc);
818  }
819 
828  template <typename KernelName = detail::auto_name, typename KernelType>
829  event parallel_for(range<2> NumWorkItems, const std::vector<event> &DepEvents,
830  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
831  _CODELOCARG(&CodeLoc);
832  return parallel_for_impl<KernelName>(NumWorkItems, DepEvents, KernelFunc,
833  CodeLoc);
834  }
835 
844  template <typename KernelName = detail::auto_name, typename KernelType>
845  event parallel_for(range<3> NumWorkItems, const std::vector<event> &DepEvents,
846  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
847  _CODELOCARG(&CodeLoc);
848  return parallel_for_impl<KernelName>(NumWorkItems, DepEvents, KernelFunc,
849  CodeLoc);
850  }
851 
859  template <typename KernelName = detail::auto_name, typename KernelType,
860  int Dims>
861  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
862  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
863  _CODELOCARG(&CodeLoc);
864  return submit(
865  [&](handler &CGH) {
866  CGH.template parallel_for<KernelName, KernelType>(
867  NumWorkItems, WorkItemOffset, KernelFunc);
868  },
869  CodeLoc);
870  }
871 
880  template <typename KernelName = detail::auto_name, typename KernelType,
881  int Dims>
882  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
883  event DepEvent,
884  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
885  _CODELOCARG(&CodeLoc);
886  return submit(
887  [&](handler &CGH) {
888  CGH.depends_on(DepEvent);
889  CGH.template parallel_for<KernelName, KernelType>(
890  NumWorkItems, WorkItemOffset, KernelFunc);
891  },
892  CodeLoc);
893  }
894 
904  template <typename KernelName = detail::auto_name, typename KernelType,
905  int Dims>
906  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
907  const std::vector<event> &DepEvents,
908  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
909  _CODELOCARG(&CodeLoc);
910  return submit(
911  [&](handler &CGH) {
912  CGH.depends_on(DepEvents);
913  CGH.template parallel_for<KernelName, KernelType>(
914  NumWorkItems, WorkItemOffset, KernelFunc);
915  },
916  CodeLoc);
917  }
918 
926  template <typename KernelName = detail::auto_name, typename KernelType,
927  int Dims>
928  event parallel_for(nd_range<Dims> ExecutionRange,
929  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
930  _CODELOCARG(&CodeLoc);
931  return submit(
932  [&](handler &CGH) {
933  CGH.template parallel_for<KernelName, KernelType>(ExecutionRange,
934  KernelFunc);
935  },
936  CodeLoc);
937  }
938 
947  template <typename KernelName = detail::auto_name, typename KernelType,
948  int Dims>
949  event parallel_for(nd_range<Dims> ExecutionRange, event DepEvent,
950  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
951  _CODELOCARG(&CodeLoc);
952  return submit(
953  [&](handler &CGH) {
954  CGH.depends_on(DepEvent);
955  CGH.template parallel_for<KernelName, KernelType>(ExecutionRange,
956  KernelFunc);
957  },
958  CodeLoc);
959  }
960 
970  template <typename KernelName = detail::auto_name, typename KernelType,
971  int Dims>
972  event parallel_for(nd_range<Dims> ExecutionRange,
973  const std::vector<event> &DepEvents,
974  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
975  _CODELOCARG(&CodeLoc);
976  return submit(
977  [&](handler &CGH) {
978  CGH.depends_on(DepEvents);
979  CGH.template parallel_for<KernelName, KernelType>(ExecutionRange,
980  KernelFunc);
981  },
982  CodeLoc);
983  }
984 
993  template <typename KernelName = detail::auto_name, typename KernelType,
994  int Dims, typename Reduction>
995  event parallel_for(nd_range<Dims> ExecutionRange, Reduction Redu,
996  _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
997  _CODELOCARG(&CodeLoc);
998  return submit(
999  [&](handler &CGH) {
1000  CGH.template parallel_for<KernelName, KernelType, Dims, Reduction>(
1001  ExecutionRange, Redu, KernelFunc);
1002  },
1003  CodeLoc);
1004  }
1005 
1006 // Clean up CODELOC and KERNELFUNC macros.
1007 #undef _CODELOCPARAM
1008 #undef _CODELOCONLYPARAM
1009 #undef _CODELOCARG
1010 #undef _CODELOCFW
1011 #undef _KERNELFUNCPARAM
1012 
1016  bool is_in_order() const;
1017 
1021  backend get_backend() const noexcept;
1022 
1026  template <backend BackendName>
1027  __SYCL_DEPRECATED("Use SYCL 2020 sycl::get_native free function")
1028  auto get_native() const -> typename interop<BackendName, queue>::type {
1029  return reinterpret_cast<typename interop<BackendName, queue>::type>(
1030  getNative());
1031  }
1032 
1033 private:
1034  pi_native_handle getNative() const;
1035 
1036  std::shared_ptr<detail::queue_impl> impl;
1037  queue(std::shared_ptr<detail::queue_impl> impl) : impl(impl) {}
1038 
1039  template <class Obj>
1040  friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject);
1041  template <class T>
1042  friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj);
1043 
1044 #if __SYCL_USE_FALLBACK_ASSERT
1045  friend event detail::submitAssertCapture(queue &, event &, queue *,
1046  const detail::code_location &);
1047 #endif
1048 
1050  event submit_impl(std::function<void(handler &)> CGH,
1051  const detail::code_location &CodeLoc);
1053  event submit_impl(std::function<void(handler &)> CGH, queue secondQueue,
1054  const detail::code_location &CodeLoc);
1055 
1056  // Function to postprocess submitted command
1057  // Arguments:
1058  // bool IsKernel - true if the submitted command was kernel, false otherwise
1059  // bool KernelUsesAssert - true if submitted kernel uses assert, only
1060  // meaningful when IsKernel is true
1061  // event &Event - event after which post processing should be executed
1062  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
1063 
1069  event submit_impl_and_postprocess(std::function<void(handler &)> CGH,
1070  const detail::code_location &CodeLoc,
1071  const SubmitPostProcessF &PostProcess);
1078  event submit_impl_and_postprocess(std::function<void(handler &)> CGH,
1079  queue secondQueue,
1080  const detail::code_location &CodeLoc,
1081  const SubmitPostProcessF &PostProcess);
1082 
1089  template <typename KernelName = detail::auto_name, typename KernelType,
1090  int Dims>
1091  event parallel_for_impl(
1092  range<Dims> NumWorkItems, KernelType KernelFunc,
1093  const detail::code_location &CodeLoc = detail::code_location::current()) {
1094  return submit(
1095  [&](handler &CGH) {
1096  CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
1097  KernelFunc);
1098  },
1099  CodeLoc);
1100  }
1101 
1109  template <typename KernelName = detail::auto_name, typename KernelType,
1110  int Dims>
1111  event parallel_for_impl(range<Dims> NumWorkItems, event DepEvent,
1112  KernelType KernelFunc,
1113  const detail::code_location &CodeLoc) {
1114  return submit(
1115  [&](handler &CGH) {
1116  CGH.depends_on(DepEvent);
1117  CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
1118  KernelFunc);
1119  },
1120  CodeLoc);
1121  }
1122 
1131  template <typename KernelName = detail::auto_name, typename KernelType,
1132  int Dims>
1133  event parallel_for_impl(range<Dims> NumWorkItems,
1134  const std::vector<event> &DepEvents,
1135  KernelType KernelFunc,
1136  const detail::code_location &CodeLoc) {
1137  return submit(
1138  [&](handler &CGH) {
1139  CGH.depends_on(DepEvents);
1140  CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
1141  KernelFunc);
1142  },
1143  CodeLoc);
1144  }
1145 
1146  buffer<detail::AssertHappened, 1> &getAssertHappenedBuffer();
1147 };
1148 
1149 namespace detail {
1150 #if __SYCL_USE_FALLBACK_ASSERT
1151 #define __SYCL_ASSERT_START 1
1152 
1163 event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
1164  const detail::code_location &CodeLoc) {
1165  using AHBufT = buffer<detail::AssertHappened, 1>;
1166 
1167  AHBufT &Buffer = Self.getAssertHappenedBuffer();
1168 
1169  event CopierEv, CheckerEv, PostCheckerEv;
1170  auto CopierCGF = [&](handler &CGH) {
1171  CGH.depends_on(Event);
1172 
1173  auto Acc = Buffer.get_access<access::mode::write>(CGH);
1174 
1175  CGH.single_task<__sycl_service_kernel__::AssertInfoCopier>([Acc] {
1176 #ifdef __SYCL_DEVICE_ONLY__
1177  __devicelib_assert_read(&Acc[0]);
1178 #else
1179  (void)Acc;
1180 #endif // __SYCL_DEVICE_ONLY__
1181  });
1182  };
1183  auto CheckerCGF = [&CopierEv, &Buffer](handler &CGH) {
1184  CGH.depends_on(CopierEv);
1185  using mode = access::mode;
1186  using target = access::target;
1187 
1188  auto Acc = Buffer.get_access<mode::read, target::host_buffer>(CGH);
1189 
1190  CGH.host_task([=] {
1191  const detail::AssertHappened *AH = &Acc[0];
1192 
1193  // Don't use assert here as msvc will insert reference to __imp__wassert
1194  // which won't be properly resolved in separate compile use-case
1195 #ifndef NDEBUG
1196  if (AH->Flag == __SYCL_ASSERT_START)
1197  throw sycl::runtime_error(
1198  "Internal Error. Invalid value in assert description.",
1199  PI_INVALID_VALUE);
1200 #endif
1201 
1202  if (AH->Flag) {
1203  const char *Expr = AH->Expr[0] ? AH->Expr : "<unknown expr>";
1204  const char *File = AH->File[0] ? AH->File : "<unknown file>";
1205  const char *Func = AH->Func[0] ? AH->Func : "<unknown func>";
1206 
1207  fprintf(stderr,
1208  "%s:%d: %s: global id: [%" PRIu64 ",%" PRIu64 ",%" PRIu64
1209  "], local id: [%" PRIu64 ",%" PRIu64 ",%" PRIu64 "] "
1210  "Assertion `%s` failed.\n",
1211  File, AH->Line, Func, AH->GID0, AH->GID1, AH->GID2, AH->LID0,
1212  AH->LID1, AH->LID2, Expr);
1213  abort(); // no need to release memory as it's abort anyway
1214  }
1215  });
1216  };
1217 
1218  if (SecondaryQueue) {
1219  CopierEv = Self.submit_impl(CopierCGF, *SecondaryQueue, CodeLoc);
1220  CheckerEv = Self.submit_impl(CheckerCGF, *SecondaryQueue, CodeLoc);
1221  } else {
1222  CopierEv = Self.submit_impl(CopierCGF, CodeLoc);
1223  CheckerEv = Self.submit_impl(CheckerCGF, CodeLoc);
1224  }
1225 
1226  return CheckerEv;
1227 }
1228 #undef __SYCL_ASSERT_START
1229 #endif // __SYCL_USE_FALLBACK_ASSERT
1230 } // namespace detail
1231 
1232 } // namespace sycl
1233 } // __SYCL_INLINE_NAMESPACE(cl)
1234 
1235 namespace std {
1236 template <> struct hash<cl::sycl::queue> {
1237  size_t operator()(const cl::sycl::queue &Q) const {
1238  return std::hash<std::shared_ptr<cl::sycl::detail::queue_impl>>()(
1239  cl::sycl::detail::getSyclObjImpl(Q));
1240  }
1241 };
1242 } // namespace std
1243 
1244 #undef __SYCL_USE_FALLBACK_ASSERT
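
A minimal usage sketch of the interfaces declared above (editorial addition, not part of queue.hpp). It assumes the classic CL/sycl.hpp entry header of this codebase, a default device that supports USM shared allocations, and uses an explicit kernel name (class AddIndex) for portability across compiler configurations:

#include <CL/sycl.hpp>

#include <cstdio>
#include <vector>

int main() {
  // Construct a queue on the device chosen by the default selector; an
  // async_handler could be passed here to receive asynchronous errors.
  sycl::queue Q;

  constexpr size_t N = 1024;
  // Unified shared memory, reachable from both host and device (assumes the
  // selected device supports USM shared allocations).
  int *Data = sycl::malloc_shared<int>(N, Q);

  // The USM shortcut members return events that can be used as dependencies.
  sycl::event FillEv = Q.fill(Data, 0, N);
  sycl::event AddEv = Q.parallel_for<class AddIndex>(
      sycl::range<1>{N}, FillEv,
      [=](sycl::id<1> I) { Data[I] += static_cast<int>(I[0]); });

  // An explicit command group: copy the result back with handler::memcpy.
  std::vector<int> Host(N);
  Q.submit([&](sycl::handler &CGH) {
    CGH.depends_on(AddEv);
    CGH.memcpy(Host.data(), Data, N * sizeof(int));
  });

  // Blocks until every task enqueued on this queue has completed.
  Q.wait();

  std::printf("Host[42] = %d\n", Host[42]);
  sycl::free(Data, Q);
  return 0;
}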