20 inline namespace _V1 {
21 namespace detail::enqueue_kernel_launch {
// handleOutOfResources (fragment): builds a user-friendly exception when a
// kernel launch fails with UR_RESULT_ERROR_OUT_OF_RESOURCES.
// NOTE(review): this is an extraction of the original file — the function
// header and several interior lines (including the throw statements) are
// missing, and the leading integers on lines are line-number residue from
// the extraction, not code. Restore from the upstream source before editing.
24 ur_kernel_handle_t Kernel,
const NDRDescT &NDRDesc) {
// CUDA-specific diagnosis: check whether the launch exceeded the
// per-work-group register budget before reporting a generic failure.
27 if (Backend == sycl::backend::ext_oneapi_cuda) {
36 const size_t TotalNumberOfWIs =
// Registers used per work-item, as reported by the kernel itself.
40 uint32_t NumRegisters = 0;
41 Plugin->call<UrApiKind::urKernelGetInfo>(
Kernel, UR_KERNEL_INFO_NUM_REGS,
43 &NumRegisters,
nullptr);
// Device limit: maximum registers available to a single work-group
// (Codeplay experimental device descriptor).
45 uint32_t MaxRegistersPerBlock =
46 DeviceImpl.
get_info<ext::codeplay::experimental::info::device::
47 max_registers_per_work_group>();
49 const bool HasExceededAvailableRegisters =
50 TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
// If register pressure explains the failure, report the exact numbers so
// the user can shrink the work-group or reduce per-work-item usage.
52 if (HasExceededAvailableRegisters) {
54 "Exceeded the number of registers available on the hardware.\n");
59 "\tThe number registers per work-group cannot exceed " +
60 std::to_string(MaxRegistersPerBlock) +
61 " for this kernel on this device.\n"
62 "\tThe kernel uses " +
63 std::to_string(NumRegisters) +
64 " registers per work-item for a total of " +
65 std::to_string(TotalNumberOfWIs) +
66 " work-items per work-group.\n"));
// Fallback: no specific cause identified — surface the raw UR error code.
70 constexpr ur_result_t Error = UR_RESULT_ERROR_OUT_OF_RESOURCES;
72 "UR backend failed. UR backend returns:" +
// handleInvalidWorkGroupSize (fragment): diagnoses
// UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE by re-checking the ND-range
// against kernel and device limits, producing a specific message for each
// violated constraint.
// NOTE(review): extraction residue — the function header and many interior
// lines (throw statements, some expressions) are missing; leading integers
// are line numbers fused in by the extraction. Do not edit in this state.
77 ur_kernel_handle_t Kernel,
// Classify the backend once; the checks below differ between OpenCL 1.x,
// OpenCL >= 2.0, and Level Zero.
83 bool IsOpenCL =
false;
84 bool IsOpenCLV1x =
false;
85 bool IsOpenCLVGE20 =
false;
86 bool IsLevelZero =
false;
88 if (Backend == sycl::backend::opencl) {
// The OpenCL version string is "OpenCL <major>.<minor> ..."; substr(7, 3)
// skips the 7-character "OpenCL " prefix and grabs "X.Y".
89 std::string VersionString =
90 DeviceImpl.
get_info<info::device::version>().substr(7, 3);
92 IsOpenCLV1x = (VersionString.find(
"1.") == 0);
// "greater or equal 2.0" covers both the 2.x and 3.x families.
94 (VersionString.find(
"2.") == 0) || (VersionString.find(
"3.") == 0);
95 }
else if (Backend == sycl::backend::ext_oneapi_level_zero) {
// CompileWGSize is the reqd_work_group_size value baked into the kernel
// at compile time; all zeros means no such attribute was used.
102 size_t CompileWGSize[3] = {0};
103 Plugin->call<UrApiKind::urKernelGetGroupInfo>(
104 Kernel, Device, UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
105 sizeof(size_t) * 3, CompileWGSize,
nullptr);
107 size_t MaxWGSize = 0;
108 Plugin->call<UrApiKind::urDeviceGetInfo>(Device,
109 UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
110 sizeof(size_t), &MaxWGSize,
nullptr);
// A zero local size means the user let the runtime pick the work-group.
112 const bool HasLocalSize = (NDRDesc.
LocalSize[0] != 0);
// --- Case 1: kernel carries reqd_work_group_size ---------------------
114 if (CompileWGSize[0] != 0) {
115 if (CompileWGSize[0] > MaxWGSize || CompileWGSize[1] > MaxWGSize ||
116 CompileWGSize[2] > MaxWGSize)
119 "Submitting a kernel decorated with reqd_work_group_size attribute "
120 "to a device that does not support this work group size is invalid.");
// OpenCL (unlike Level Zero) requires an explicit local size even when
// the required size is already encoded in the program.
126 if (!HasLocalSize && (IsOpenCLV1x || IsOpenCLVGE20)) {
129 "OpenCL 1.x and 2.0 requires to pass local size argument even if "
130 "required work-group size was specified in the program source");
// The supplied local size must match the attribute exactly, per
// dimension. Note the reversed index order in the message: SYCL range
// ordering vs. the backend's ordering.
135 if (NDRDesc.
LocalSize[0] != CompileWGSize[0] ||
136 NDRDesc.
LocalSize[1] != CompileWGSize[1] ||
137 NDRDesc.
LocalSize[2] != CompileWGSize[2])
140 "The specified local size {" + std::to_string(NDRDesc.
LocalSize[2]) +
141 ", " + std::to_string(NDRDesc.
LocalSize[1]) +
", " +
143 "} doesn't match the required work-group size specified "
144 "in the program source {" +
145 std::to_string(CompileWGSize[2]) +
", " +
146 std::to_string(CompileWGSize[1]) +
", " +
147 std::to_string(CompileWGSize[0]) +
"}");
// --- Case 2: per-dimension work-item limits (OpenCL 1.x path) --------
151 size_t MaxThreadsPerBlock[3] = {};
152 Plugin->call<UrApiKind::urDeviceGetInfo>(
153 Device, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
sizeof(MaxThreadsPerBlock),
154 MaxThreadsPerBlock,
nullptr);
156 for (
size_t I = 0; I < 3; ++I) {
157 if (MaxThreadsPerBlock[I] < NDRDesc.
LocalSize[I]) {
159 "The number of work-items in each dimension of a "
160 "work-group cannot exceed {" +
161 std::to_string(MaxThreadsPerBlock[0]) +
", " +
162 std::to_string(MaxThreadsPerBlock[1]) +
", " +
163 std::to_string(MaxThreadsPerBlock[2]) +
164 "} for this device");
// Total work-items per work-group vs. the device-wide maximum.
176 const size_t TotalNumberOfWIs =
178 if (TotalNumberOfWIs > MaxWGSize)
181 "Total number of work-items in a work-group cannot exceed " +
182 std::to_string(MaxWGSize));
183 }
// --- Case 3: kernel-specific work-group limit (OpenCL >= 2.0, L0) ----
else if (IsOpenCLVGE20 || IsLevelZero) {
// Unlike MaxWGSize above, this limit is per-kernel (it accounts for the
// kernel's resource usage), hence the different query and message.
190 size_t KernelWGSize = 0;
191 Plugin->call<UrApiKind::urKernelGetGroupInfo>(
192 Kernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE,
sizeof(size_t),
193 &KernelWGSize,
nullptr);
194 const size_t TotalNumberOfWIs =
196 if (TotalNumberOfWIs > KernelWGSize)
199 "Total number of work-items in a work-group cannot exceed " +
200 std::to_string(KernelWGSize) +
" for this kernel");
// --- Case 4: non-uniform work-groups ---------------------------------
// NonUniformWGs: some dimension of the global range is not evenly
// divisible by the local size (expression lost in extraction — TODO
// confirm against upstream).
207 const bool NonUniformWGs =
217 const bool LocalExceedsGlobal =
228 if (LocalExceedsGlobal)
230 "Local workgroup size cannot be greater than "
231 "global range in any dimension");
234 "Global_work_size must be evenly divisible "
235 "by local_work_size. Non-uniform work-groups "
236 "are not supported by the target device");
// OpenCL >= 2.0 can support non-uniform groups; inspect the program's
// build options to explain exactly why they were rejected here.
244 ur_program_handle_t Program =
nullptr;
245 Plugin->call<UrApiKind::urKernelGetInfo>(
246 Kernel, UR_KERNEL_INFO_PROGRAM,
sizeof(ur_program_handle_t),
// First query gets the options-string size, second fills the buffer.
249 Plugin->call<UrApiKind::urProgramGetBuildInfo>(
250 Program, Device, UR_PROGRAM_BUILD_INFO_OPTIONS, 0,
nullptr,
252 std::string Opts(OptsSize,
'\0');
253 Plugin->call<UrApiKind::urProgramGetBuildInfo>(
254 Program, Device, UR_PROGRAM_BUILD_INFO_OPTIONS, OptsSize,
255 &Opts.front(),
nullptr);
256 const bool HasStd20 = Opts.find(
"-cl-std=CL2.0") != std::string::npos;
257 const bool RequiresUniformWGSize =
258 Opts.find(
"-cl-uniform-work-group-size") != std::string::npos;
// Pretty-printed sizes for the messages below. NOTE(review): only
// LocalSize[0] and [2] (and GlobalSize[0], [1]) appear in this
// extraction; the middle components are presumably on lines lost to
// the extraction — verify against upstream.
259 std::string LocalWGSize = std::to_string(NDRDesc.
LocalSize[0]) +
262 ", " + std::to_string(NDRDesc.
LocalSize[2]);
263 std::string GlobalWGSize =
264 std::to_string(NDRDesc.
GlobalSize[0]) +
", " +
265 std::to_string(NDRDesc.
GlobalSize[1]) +
", " +
// Choose the message that matches the detected violation.
267 std::string message =
269 ?
"Local work-group size {" + LocalWGSize +
270 "} is greater than global range size {" + GlobalWGSize +
272 :
"Global work size {" + GlobalWGSize +
273 "} is not evenly divisible by local work-group size {" +
// No CL2.0 mode requested: tell the user how to enable the feature.
278 message.append(
"Non-uniform work-groups are not allowed by "
279 "default. Underlying OpenCL 2.x implementation "
280 "supports this feature and to enable it, build "
281 "device program with -cl-std=CL2.0"));
// CL2.0 was requested but uniformity was re-imposed by a build flag.
282 else if (RequiresUniformWGSize)
285 message.append(
"Non-uniform work-groups are not allowed when "
286 "-cl-uniform-work-group-size flag is used. "
287 "Underlying OpenCL 2.x implementation supports "
288 "this feature, but it is being disabled by "
289 "-cl-uniform-work-group-size build flag"));
298 "Non-uniform work-groups are not supported by the target device");
// Reaching here without a local size is a runtime invariant violation.
306 "internal error: expected HasLocalSize");
// handleInvalidWorkItemSize (fragment): reports which dimension of the
// requested local size exceeds the device's per-dimension work-item limit
// (UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES).
// NOTE(review): extraction residue — the function header, the comparison,
// and the throw line are missing; leading integers are line-number residue.
315 size_t MaxWISize[] = {0, 0, 0};
317 Plugin->call<UrApiKind::urDeviceGetInfo>(
318 Device, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
sizeof(MaxWISize), &MaxWISize,
// Only the dimensions actually used by the ND-range are checked.
320 for (
unsigned I = 0; I < NDRDesc.
Dims; I++) {
324 "Number of work-items in a work-group exceed limit for dimension " +
325 std::to_string(I) +
" : " + std::to_string(NDRDesc.
LocalSize[I]) +
326 " > " + std::to_string(MaxWISize[I]));
// handleInvalidValue (fragment): reports which dimension requests more
// work-groups than the device allows (UR_DEVICE_INFO_MAX_WORK_GROUPS_3D).
// NOTE(review): extraction residue — the function header, the computation
// of NWgs, and the throw line are missing from this view.
335 size_t MaxNWGs[] = {0, 0, 0};
336 Plugin->call<UrApiKind::urDeviceGetInfo>(Device,
337 UR_DEVICE_INFO_MAX_WORK_GROUPS_3D,
338 sizeof(MaxNWGs), &MaxNWGs,
nullptr);
// NWgs (computed on a line lost to extraction — presumably global size
// divided by local size for dimension I; TODO confirm) is compared
// against the per-dimension maximum.
339 for (
unsigned int I = 0; I < NDRDesc.
Dims; I++) {
341 if (NWgs > MaxNWGs[I])
344 "Number of work-groups exceed limit for dimension " +
345 std::to_string(I) +
" : " + std::to_string(NWgs) +
" > " +
346 std::to_string(MaxNWGs[I]));
// handleErrorOrWarning (fragment): central dispatch that translates a
// failing urEnqueueKernelLaunch result code into a descriptive SYCL
// exception, delegating to the specialized handlers above for the codes
// they cover. Must only be called on failure (see the assert).
// NOTE(review): extraction residue — the switch keyword, the calls inside
// each case, and several throw statements are missing from this view.
354 ur_kernel_handle_t Kernel,
const NDRDescT &NDRDesc) {
355 assert(Error != UR_RESULT_SUCCESS &&
356 "Success is expected to be handled on caller side");
// Resource exhaustion: handled by handleOutOfResources (register-pressure
// diagnosis on CUDA).
358 case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
359 case UR_RESULT_ERROR_OUT_OF_RESOURCES:
// Work-group size problems: handleInvalidWorkGroupSize re-derives the
// exact violated constraint.
362 case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
365 case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
369 "The kernel argument values have not been specified OR a kernel "
370 "argument declared to be a pointer to a type."),
371 UR_RESULT_ERROR_INVALID_KERNEL_ARGS);
373 case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
376 case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
380 "image object is specified as an argument value and the image "
381 "format is not supported by device associated with queue"),
382 UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
384 case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
387 "a sub-buffer object is specified as the value for an "
388 "argument that is a buffer object and the offset "
389 "specified when the sub-buffer object is created is "
390 "not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value "
391 "for device associated with queue"),
392 UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET);
394 case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
398 "failure to allocate memory for data store associated with image "
399 "or buffer objects specified as arguments to kernel"),
400 UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE);
402 case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
406 "image object is specified as an argument value and the image "
407 "dimensions (image width, height, specified or compute row and/or "
408 "slice pitch) are not supported by device associated with queue"),
409 UR_RESULT_ERROR_INVALID_IMAGE_SIZE);
411 case UR_RESULT_ERROR_INVALID_VALUE:
// Adapter-specific codes carry their own message; presumably forwarded
// as-is (body lost to extraction — TODO confirm).
414 case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
// Default: fall back to the plugin's generic UR error checking.
421 return DeviceImpl.
getPlugin()->checkUrResult(Error);
// kernel_get_group_info::handleErrorOrWarning (fragment): translates a
// failing urKernelGetGroupInfo result. INVALID_VALUE for the
// GLOBAL_WORK_SIZE descriptor gets a dedicated explanation (that query is
// only valid for custom devices or built-in kernels); everything else is
// forwarded to the plugin's generic checker.
// NOTE(review): extraction residue — the function header, switch keyword,
// and throw line are missing from this view.
433 namespace detail::kernel_get_group_info {
436 assert(Error != UR_RESULT_SUCCESS &&
437 "Success is expected to be handled on caller side");
439 case UR_RESULT_ERROR_INVALID_VALUE:
440 if (Descriptor == UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE)
443 "info::kernel_device_specific::global_work_size descriptor may only "
444 "be used if the device type is device_type::custom or if the kernel "
445 "is a built-in kernel.");
// All other failures: generic UR error handling.
449 Plugin->checkUrResult(Error);
sycl::range< 3 > GlobalSize
sycl::range< 3 > LocalSize
Param::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
platform get_platform() const
Get associated SYCL platform.
const PluginPtr & getPlugin() const
ur_device_handle_t & getHandleRef()
Get reference to UR device.
void handleErrorOrWarning(ur_result_t Error, const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
Analyzes the error code and arguments of urEnqueueKernelLaunch to emit a user-friendly exception describing the failure.
void handleInvalidValue(const device_impl &DeviceImpl, const NDRDescT &NDRDesc)
void handleInvalidWorkItemSize(const device_impl &DeviceImpl, const NDRDescT &NDRDesc)
void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
void handleOutOfResources(const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
void handleErrorOrWarning(ur_result_t Error, ur_kernel_group_info_t Descriptor, const PluginPtr &Plugin)
Analyzes error code of urKernelGetGroupInfo.
std::string codeToString(int32_t code)
std::shared_ptr< plugin > PluginPtr
exception set_ur_error(exception &&e, int32_t ur_err)
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
C++ utilities for Unified Runtime integration.