DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
enqueue_kernel.cpp
Go to the documentation of this file.
1 //===------------------- enqueue_kernel.cpp ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // SYCL error handling of enqueue kernel operations
10 //
11 //===----------------------------------------------------------------------===//
12 
#include "error_handling.hpp"

#include <CL/sycl/backend_types.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <detail/plugin.hpp>
20 namespace sycl {
21 namespace detail {
22 
23 namespace enqueue_kernel_launch {
24 
25 bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
26  const NDRDescT &NDRDesc) {
27  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
28 
29  const plugin &Plugin = DeviceImpl.getPlugin();
30  RT::PiDevice Device = DeviceImpl.getHandleRef();
31  cl::sycl::platform Platform = DeviceImpl.get_platform();
32 
33  if (HasLocalSize) {
34  size_t MaxThreadsPerBlock[3] = {};
36  Device, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(MaxThreadsPerBlock),
37  MaxThreadsPerBlock, nullptr);
38 
39  for (size_t I = 0; I < 3; ++I) {
40  if (MaxThreadsPerBlock[I] < NDRDesc.LocalSize[I]) {
41  throw sycl::nd_range_error(
42  "The number of work-items in each dimension of a work-group cannot "
43  "exceed {" +
44  std::to_string(MaxThreadsPerBlock[0]) + ", " +
45  std::to_string(MaxThreadsPerBlock[1]) + ", " +
46  std::to_string(MaxThreadsPerBlock[2]) + "} for this device",
48  }
49  }
50  }
51 
52  // Some of the error handling below is special for particular OpenCL
53  // versions. If this is an OpenCL backend, get the version.
54  bool IsOpenCL = false; // Backend is any OpenCL version
55  bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
56  bool IsOpenCLV20 = false; // Backend is OpenCL 2.0
57  if (Platform.get_backend() == cl::sycl::backend::opencl) {
58  std::string VersionString = DeviceImpl.get_info<info::device::version>();
59  IsOpenCL = true;
60  IsOpenCLV1x = (VersionString.find("1.") == 0);
61  IsOpenCLV20 = (VersionString.find("2.0") == 0);
62  }
63 
64  size_t CompileWGSize[3] = {0};
67  sizeof(size_t) * 3, CompileWGSize, nullptr);
68 
69  if (CompileWGSize[0] != 0) {
70  // OpenCL 1.x && 2.0:
71  // PI_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the
72  // reqd_work_group_size attribute is used to declare the work-group size
73  // for kernel in the program source.
74  if (!HasLocalSize && (IsOpenCLV1x || IsOpenCLV20)) {
75  throw sycl::nd_range_error(
76  "OpenCL 1.x and 2.0 requires to pass local size argument even if "
77  "required work-group size was specified in the program source",
79  }
80  // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not
81  // match the required work-group size for kernel in the program source.
82  if (NDRDesc.LocalSize[0] != CompileWGSize[0] ||
83  NDRDesc.LocalSize[1] != CompileWGSize[1] ||
84  NDRDesc.LocalSize[2] != CompileWGSize[2])
85  throw sycl::nd_range_error(
86  "The specified local size {" + std::to_string(NDRDesc.LocalSize[0]) +
87  ", " + std::to_string(NDRDesc.LocalSize[1]) + ", " +
88  std::to_string(NDRDesc.LocalSize[2]) +
89  "} doesn't match the required work-group size specified "
90  "in the program source {" +
91  std::to_string(CompileWGSize[0]) + ", " +
92  std::to_string(CompileWGSize[1]) + ", " +
93  std::to_string(CompileWGSize[2]) + "}",
95  }
96  if (IsOpenCL) {
97  if (IsOpenCLV1x) {
98  // OpenCL 1.x:
99  // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
100  // total number of work-items in the work-group computed as
101  // local_work_size[0] * ... * local_work_size[work_dim - 1] is greater
102  // than the value specified by PI_DEVICE_MAX_WORK_GROUP_SIZE in
103  // table 4.3
104  size_t MaxWGSize = 0;
106  Device, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(size_t),
107  &MaxWGSize, nullptr);
108  const size_t TotalNumberOfWIs =
109  NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
110  if (TotalNumberOfWIs > MaxWGSize)
111  throw sycl::nd_range_error(
112  "Total number of work-items in a work-group cannot exceed " +
113  std::to_string(MaxWGSize),
115  } else {
116  // OpenCL 2.x:
117  // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
118  // total number of work-items in the work-group computed as
119  // local_work_size[0] * ... * local_work_size[work_dim - 1] is greater
120  // than the value specified by PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE in
121  // table 5.21.
122  size_t KernelWGSize = 0;
124  Kernel, Device, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t),
125  &KernelWGSize, nullptr);
126  const size_t TotalNumberOfWIs =
127  NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
128  if (TotalNumberOfWIs > KernelWGSize)
129  throw sycl::nd_range_error(
130  "Total number of work-items in a work-group cannot exceed " +
131  std::to_string(KernelWGSize) + " for this kernel",
133  }
134  } else {
135  // TODO: Should probably have something similar for the other backends
136  }
137 
138  if (HasLocalSize) {
139  // Is the global range size evenly divisible by the local workgroup size?
140  const bool NonUniformWGs =
141  (NDRDesc.LocalSize[0] != 0 &&
142  NDRDesc.GlobalSize[0] % NDRDesc.LocalSize[0] != 0) ||
143  (NDRDesc.LocalSize[1] != 0 &&
144  NDRDesc.GlobalSize[1] % NDRDesc.LocalSize[1] != 0) ||
145  (NDRDesc.LocalSize[2] != 0 &&
146  NDRDesc.GlobalSize[2] % NDRDesc.LocalSize[2] != 0);
147  // Is the local size of the workgroup greater than the global range size in
148  // any dimension?
149  if (IsOpenCL) {
150  const bool LocalExceedsGlobal =
151  NonUniformWGs && (NDRDesc.LocalSize[0] > NDRDesc.GlobalSize[0] ||
152  NDRDesc.LocalSize[1] > NDRDesc.GlobalSize[1] ||
153  NDRDesc.LocalSize[2] > NDRDesc.GlobalSize[2]);
154 
155  if (NonUniformWGs) {
156  if (IsOpenCLV1x) {
157  // OpenCL 1.x:
158  // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and
159  // number of workitems specified by global_work_size is not evenly
160  // divisible by size of work-group given by local_work_size
161  if (LocalExceedsGlobal)
162  throw sycl::nd_range_error("Local workgroup size cannot be greater "
163  "than global range in any dimension",
165  else
166  throw sycl::nd_range_error(
167  "Global_work_size must be evenly divisible by local_work_size. "
168  "Non-uniform work-groups are not supported by the target "
169  "device",
171  } else {
172  // OpenCL 2.x:
173  // PI_INVALID_WORK_GROUP_SIZE if the program was compiled with
174  // –cl-uniform-work-group-size and the number of work-items specified
175  // by global_work_size is not evenly divisible by size of work-group
176  // given by local_work_size
177 
178  pi_program Program = nullptr;
180  Kernel, PI_KERNEL_INFO_PROGRAM, sizeof(pi_program), &Program,
181  nullptr);
182  size_t OptsSize = 0;
184  Program, Device, PI_PROGRAM_BUILD_INFO_OPTIONS, 0, nullptr,
185  &OptsSize);
186  std::string Opts(OptsSize, '\0');
188  Program, Device, PI_PROGRAM_BUILD_INFO_OPTIONS, OptsSize,
189  &Opts.front(), nullptr);
190  const bool HasStd20 = Opts.find("-cl-std=CL2.0") != std::string::npos;
191  const bool RequiresUniformWGSize =
192  Opts.find("-cl-uniform-work-group-size") != std::string::npos;
193  std::string LocalWGSize = std::to_string(NDRDesc.LocalSize[0]) +
194  ", " +
195  std::to_string(NDRDesc.LocalSize[1]) +
196  ", " + std::to_string(NDRDesc.LocalSize[2]);
197  std::string GlobalWGSize =
198  std::to_string(NDRDesc.GlobalSize[0]) + ", " +
199  std::to_string(NDRDesc.GlobalSize[1]) + ", " +
200  std::to_string(NDRDesc.GlobalSize[2]);
201  std::string message =
202  LocalExceedsGlobal
203  ? "Local work-group size {" + LocalWGSize +
204  "} is greater than global range size {" + GlobalWGSize +
205  "}. "
206  : "Global work size {" + GlobalWGSize +
207  "} is not evenly divisible by local work-group size {" +
208  LocalWGSize + "}. ";
209  if (!HasStd20)
210  throw sycl::nd_range_error(
211  message.append(
212  "Non-uniform work-groups are not allowed by "
213  "default. Underlying "
214  "OpenCL 2.x implementation supports this feature "
215  "and to enable "
216  "it, build device program with -cl-std=CL2.0"),
218  else if (RequiresUniformWGSize)
219  throw sycl::nd_range_error(
220  message.append(
221  "Non-uniform work-groups are not allowed by when "
222  "-cl-uniform-work-group-size flag is used. Underlying "
223  "OpenCL 2.x implementation supports this feature, but it "
224  "is "
225  "being "
226  "disabled by -cl-uniform-work-group-size build flag"),
228  // else unknown. fallback (below)
229  }
230  }
231  } else {
232  // TODO: Decide what checks (if any) we need for the other backends
233  }
234  throw sycl::nd_range_error(
235  "Non-uniform work-groups are not supported by the target device",
237  }
238  // TODO: required number of sub-groups, OpenCL 2.1:
239  // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not
240  // consistent with the required number of sub-groups for kernel in the
241  // program source.
242 
243  // Fallback
244  constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE;
245  throw runtime_error(
246  "PI backend failed. PI backend returns: " + codeToString(Error), Error);
247 }
248 
249 bool handleInvalidWorkItemSize(const device_impl &DeviceImpl,
250  const NDRDescT &NDRDesc) {
251 
252  const plugin &Plugin = DeviceImpl.getPlugin();
253  RT::PiDevice Device = DeviceImpl.getHandleRef();
254 
255  size_t MaxWISize[] = {0, 0, 0};
256 
258  Device, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(MaxWISize), &MaxWISize,
259  nullptr);
260  for (unsigned I = 0; I < NDRDesc.Dims; I++) {
261  if (NDRDesc.LocalSize[I] > MaxWISize[I])
262  throw sycl::nd_range_error(
263  "Number of work-items in a work-group exceed limit for dimension " +
264  std::to_string(I) + " : " + std::to_string(NDRDesc.LocalSize[I]) +
265  " > " + std::to_string(MaxWISize[I]),
267  }
268  return 0;
269 }
270 
271 bool handleError(pi_result Error, const device_impl &DeviceImpl,
272  pi_kernel Kernel, const NDRDescT &NDRDesc) {
273  assert(Error != PI_SUCCESS &&
274  "Success is expected to be handled on caller side");
275  switch (Error) {
277  return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);
278 
280  throw sycl::nd_range_error(
281  "The kernel argument values have not been specified "
282  " OR "
283  "a kernel argument declared to be a pointer to a type.",
285 
287  return handleInvalidWorkItemSize(DeviceImpl, NDRDesc);
288 
290  throw sycl::nd_range_error(
291  "image object is specified as an argument value"
292  " and the image format is not supported by device associated"
293  " with queue",
295 
297  throw sycl::nd_range_error(
298  "a sub-buffer object is specified as the value for an argument "
299  " that is a buffer object and the offset specified "
300  "when the sub-buffer object is created is not aligned "
301  "to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated"
302  " with queue",
304 
306  throw sycl::nd_range_error(
307  "failure to allocate memory for data store associated with image"
308  " or buffer objects specified as arguments to kernel",
310 
312  throw sycl::nd_range_error(
313  "image object is specified as an argument value and the image "
314  "dimensions (image width, height, specified or compute row and/or "
315  "slice pitch) are not supported by device associated with queue",
317 
318  // TODO: Handle other error codes
319 
320  default:
321  throw runtime_error(
322  "Native API failed. Native API returns: " + codeToString(Error), Error);
323  }
324 }
325 
326 } // namespace enqueue_kernel_launch
327 
328 } // namespace detail
329 } // namespace sycl
330 } // __SYCL_INLINE_NAMESPACE(cl)
PI_IMAGE_FORMAT_NOT_SUPPORTED
@ PI_IMAGE_FORMAT_NOT_SUPPORTED
Definition: pi.h:113
PI_SUCCESS
@ PI_SUCCESS
Definition: pi.h:82
cl::sycl::detail::NDRDescT
Definition: cg_types.hpp:41
cl::sycl::detail::device_impl::getPlugin
const plugin & getPlugin() const
Definition: device_impl.hpp:122
pi_program
_pi_program * pi_program
Definition: pi.h:871
cl::sycl::detail::device_impl
Definition: device_impl.hpp:32
cl::sycl::backend::opencl
@ opencl
cl::sycl::detail::NDRDescT::GlobalSize
sycl::range< 3 > GlobalSize
Definition: cg_types.hpp:116
PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
@ PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
Definition: pi.h:191
_pi_result
_pi_result
Definition: pi.h:81
PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
Definition: pi.h:352
PI_PROGRAM_BUILD_INFO_OPTIONS
@ PI_PROGRAM_BUILD_INFO_OPTIONS
Definition: pi.h:139
plugin.hpp
PI_KERNEL_INFO_PROGRAM
@ PI_KERNEL_INFO_PROGRAM
Definition: pi.h:346
pi.hpp
cl::sycl::detail::device_impl::get_platform
platform get_platform() const
Get associated SYCL platform.
Definition: device_impl.cpp:107
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:578
piProgramGetBuildInfo
pi_result piProgramGetBuildInfo(pi_program program, pi_device device, cl_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1017
PI_MISALIGNED_SUB_BUFFER_OFFSET
@ PI_MISALIGNED_SUB_BUFFER_OFFSET
Definition: pi.h:102
piKernelGetGroupInfo
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1056
cl::sycl::detail::plugin::call
void call(ArgsT... Args) const
Calls the API, traces the call, checks the result.
Definition: plugin.hpp:182
piDeviceGetInfo
pi_result piDeviceGetInfo(pi_device device, pi_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Returns requested info for provided native device Return PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT fo...
Definition: pi_esimd_emulator.cpp:485
cl
We provide new interfaces for matrix multiply in this patch:
Definition: access.hpp:13
error_handling.hpp
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:523
PI_INVALID_IMAGE_SIZE
@ PI_INVALID_IMAGE_SIZE
Definition: pi.h:111
cl::sycl::detail::plugin
The plugin class provides a unified interface to the underlying low-level runtimes for the device-agn...
Definition: plugin.hpp:89
cl::sycl::detail::device_impl::get_info
info::param_traits< info::device, param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device_impl.hpp:189
cl::sycl::detail::enqueue_kernel_launch::handleError
bool handleError(pi_result Error, const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of piEnqueueKernelLaunch to emit user-friendly exception describing...
Definition: enqueue_kernel.cpp:271
cl::sycl::detail::NDRDescT::Dims
size_t Dims
Definition: cg_types.hpp:123
PI_INVALID_KERNEL_ARGS
@ PI_INVALID_KERNEL_ARGS
Definition: pi.h:110
cl::sycl::detail::device_impl::getHandleRef
RT::PiDevice & getHandleRef()
Get reference to PI device.
Definition: device_impl.hpp:61
piKernelGetInfo
pi_result piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1052
backend_types.hpp
PI_INVALID_WORK_GROUP_SIZE
@ PI_INVALID_WORK_GROUP_SIZE
Definition: pi.h:104
cl::sycl::platform
Encapsulates a SYCL platform on which kernels may be executed.
Definition: platform.hpp:33
PI_INVALID_WORK_ITEM_SIZE
@ PI_INVALID_WORK_ITEM_SIZE
Definition: pi.h:108
cl::sycl::detail::NDRDescT::LocalSize
sycl::range< 3 > LocalSize
Definition: cg_types.hpp:117
cl::sycl::detail::codeToString
static std::string codeToString(cl_int code)
Definition: common.hpp:80
cl::sycl::platform::get_backend
backend get_backend() const noexcept
Returns the backend associated with this platform.
Definition: platform.cpp:50
PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE
@ PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE
Definition: pi.h:192
PI_MEM_OBJECT_ALLOCATION_FAILURE
@ PI_MEM_OBJECT_ALLOCATION_FAILURE
Definition: pi.h:114
cl::sycl::detail::enqueue_kernel_launch::handleInvalidWorkItemSize
bool handleInvalidWorkItemSize(const device_impl &DeviceImpl, const NDRDescT &NDRDesc)
Definition: enqueue_kernel.cpp:249
cl::sycl::detail::enqueue_kernel_launch::handleInvalidWorkGroupSize
bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Definition: enqueue_kernel.cpp:25
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:353
_pi_device
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:71
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12