23 #include <cuda_device_runtime_api.h>
34 std::string getCudaVersionString() {
35 int driver_version = 0;
36 cuDriverGetVersion(&driver_version);
38 std::stringstream stream;
39 stream <<
"CUDA " << driver_version / 1000 <<
"."
40 << driver_version % 1000 / 10;
48 case CUDA_ERROR_NOT_PERMITTED:
49 return PI_ERROR_INVALID_OPERATION;
50 case CUDA_ERROR_INVALID_CONTEXT:
51 return PI_ERROR_INVALID_CONTEXT;
52 case CUDA_ERROR_INVALID_DEVICE:
53 return PI_ERROR_INVALID_DEVICE;
54 case CUDA_ERROR_INVALID_VALUE:
55 return PI_ERROR_INVALID_VALUE;
56 case CUDA_ERROR_OUT_OF_MEMORY:
57 return PI_ERROR_OUT_OF_HOST_MEMORY;
58 case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
59 return PI_ERROR_OUT_OF_RESOURCES;
61 return PI_ERROR_UNKNOWN;
78 pi_result cuda_piPluginGetLastError(
char **message) {
86 template <
typename Func>
88 std::size_t num_events_in_wait_list, Func &&f) {
90 if (event_wait_list ==
nullptr || num_events_in_wait_list == 0) {
91 return PI_ERROR_INVALID_EVENT_WAIT_LIST;
95 if (num_events_in_wait_list == 1) {
96 return f(event_wait_list[0]);
99 std::vector<pi_event> events{event_wait_list,
100 event_wait_list + num_events_in_wait_list};
104 return e0->get_stream() < e1->get_stream() ||
105 (e0->get_stream() == e1->get_stream() &&
106 e0->get_event_id() > e1->get_event_id());
112 if (!event || (!first &&
event->get_stream() == lastSeenStream)) {
117 lastSeenStream =
event->get_stream();
119 auto result = f(event);
120 if (result != PI_SUCCESS) {
135 pi_result check_error(CUresult result,
const char *
function,
int line,
137 if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) {
141 if (std::getenv(
"SYCL_PI_SUPPRESS_ERROR_MESSAGE") ==
nullptr) {
142 const char *errorString =
nullptr;
143 const char *errorName =
nullptr;
144 cuGetErrorName(result, &errorName);
145 cuGetErrorString(result, &errorString);
146 std::stringstream ss;
147 ss <<
"\nPI CUDA ERROR:"
148 <<
"\n\tValue: " << result
149 <<
"\n\tName: " << errorName
150 <<
"\n\tDescription: " << errorString
151 <<
"\n\tFunction: " <<
function <<
"\n\tSource Location: " << file
152 <<
":" <<
line <<
"\n"
157 if (std::getenv(
"PI_CUDA_ABORT") !=
nullptr) {
161 throw map_error(result);
165 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
189 class ScopedContext {
193 throw PI_ERROR_INVALID_CONTEXT;
196 set_context(ctxt->
get());
199 ScopedContext(
CUcontext ctxt) { set_context(ctxt); }
207 PI_CHECK_ERROR(cuCtxGetCurrent(&original));
211 if (original != desired) {
212 PI_CHECK_ERROR(cuCtxSetCurrent(desired));
218 template <
typename T,
typename Assign>
220 size_t *param_value_size_ret, T value,
size_t value_size,
221 Assign &&assign_func) {
223 if (param_value !=
nullptr) {
225 if (param_value_size < value_size) {
226 return PI_ERROR_INVALID_VALUE;
229 assign_func(param_value, value, value_size);
232 if (param_value_size_ret !=
nullptr) {
233 *param_value_size_ret = value_size;
239 template <
typename T>
241 size_t *param_value_size_ret, T value) {
243 auto assignment = [](
void *param_value, T value,
size_t value_size) {
247 *
static_cast<T *
>(param_value) = value;
250 return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
251 sizeof(T), assignment);
254 template <
typename T>
256 void *param_value,
size_t *param_value_size_ret,
258 return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
259 array_length *
sizeof(T),
memcpy);
264 size_t *param_value_size_ret,
266 return getInfoArray(strlen(value) + 1, param_value_size, param_value,
267 param_value_size_ret, value);
270 int getAttribute(
pi_device device, CUdevice_attribute attribute) {
273 cuDeviceGetAttribute(&value, attribute,
device->get()) == CUDA_SUCCESS);
281 void guessLocalWorkSize(
_pi_device *device,
size_t *threadsPerBlock,
282 const size_t *global_work_size,
283 const size_t maxThreadsPerBlock[3],
pi_kernel kernel,
285 assert(threadsPerBlock !=
nullptr);
286 assert(global_work_size !=
nullptr);
287 assert(kernel !=
nullptr);
288 int minGrid, maxBlockSize, gridDim[3];
290 cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
292 cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
295 threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1;
296 threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1;
298 PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
299 &minGrid, &maxBlockSize,
kernel->get(), NULL, local_size,
300 maxThreadsPerBlock[0]));
302 gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);
305 std::min(maxThreadsPerBlock[0],
306 std::min(global_work_size[0],
static_cast<size_t>(gridDim[0])));
310 while (0u != (global_work_size[0] % threadsPerBlock[0])) {
311 --threadsPerBlock[0];
318 if (!event_wait_list) {
322 ScopedContext active(command_queue->
get_context());
324 auto result = forLatestEvents(
325 event_wait_list, num_events_in_wait_list,
327 if (
event->get_stream() == stream) {
330 return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0));
334 if (result != PI_SUCCESS) {
341 return PI_ERROR_UNKNOWN;
345 template <
typename PtrT>
346 void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type,
350 CUresult ret = cuPointerGetAttribute(
351 out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (
CUdeviceptr)usm_ptr);
352 assert((*out_mem_type != CU_MEMORYTYPE_ARRAY &&
353 *out_mem_type != CU_MEMORYTYPE_UNIFIED) &&
354 "ARRAY, UNIFIED types are not supported!");
357 if (ret == CUDA_ERROR_INVALID_VALUE) {
358 *out_mem_type = CU_MEMORYTYPE_HOST;
360 *out_host_ptr = usm_ptr;
363 }
else if (ret == CUDA_SUCCESS) {
364 *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE)
367 *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr :
nullptr;
// Emit a fatal diagnostic on stderr and terminate the process.
// Marked [[noreturn]]: callers may rely on control never coming back.
[[noreturn]] void die(const char *Message) {
  std::cerr << "pi_die: " << Message << std::endl;
  std::terminate();
}
392 std::cerr <<
"pi_print: " << Message << std::endl;
395 void assertion(
bool Condition,
const char *Message) {
468 *stream_token = token;
479 for (
pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
482 std::unique_lock<std::mutex> compute_sync_guard(
490 *stream_token = token;
525 : commandType_{type}, refCount_{1}, has_ownership_{
true},
526 hasBeenWaitedOn_{
false}, isRecorded_{
false}, isStarted_{
false},
527 streamToken_{stream_token}, evEnd_{
nullptr}, evStart_{
nullptr},
528 evQueued_{
nullptr}, queue_{queue}, stream_{stream}, context_{context} {
532 PI_CHECK_ERROR(cuEventCreate(
533 &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
535 if (profilingEnabled) {
536 PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
537 PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
540 if (queue_ !=
nullptr) {
548 hasBeenWaitedOn_{
false}, isRecorded_{
false}, isStarted_{
false},
550 evStart_{
nullptr}, evQueued_{
nullptr}, queue_{
nullptr}, context_{
556 if (queue_ !=
nullptr) {
569 result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
570 result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_));
584 if (!hasBeenWaitedOn_) {
585 const CUresult ret = cuEventQuery(evEnd_);
586 if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) {
590 if (ret == CUDA_ERROR_NOT_READY) {
598 float miliSeconds = 0.0f;
600 PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev));
602 return static_cast<pi_uint64>(miliSeconds * 1.0e6);
623 return PI_ERROR_INVALID_EVENT;
626 pi_result result = PI_ERROR_INVALID_OPERATION;
629 return PI_ERROR_INVALID_QUEUE;
636 "Unrecoverable program state reached in event identifier overflow");
638 result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_));
643 if (result == PI_SUCCESS) {
653 retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
654 hasBeenWaitedOn_ =
true;
666 assert(queue_ !=
nullptr);
668 PI_CHECK_ERROR(cuEventDestroy(evEnd_));
671 PI_CHECK_ERROR(cuEventDestroy(evQueued_));
672 PI_CHECK_ERROR(cuEventDestroy(evStart_));
684 PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0));
690 : module_{
nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1},
691 context_{ctxt}, kernelReqdWorkGroupSizeMD_{} {
// Split a metadata name of the form "name@tag" at the LAST '@' into
// ("name", "@tag"); the tag keeps its leading '@'. Names without an
// '@' yield ("name", "").
std::pair<std::string, std::string>
splitMetadataName(const std::string &metadataName) {
  const size_t atPos = metadataName.rfind('@');
  if (atPos == std::string::npos)
    return {metadataName, std::string{}};
  return {metadataName.substr(0, atPos), metadataName.substr(atPos)};
}
708 for (
size_t i = 0; i < length; ++i) {
710 std::string metadataElementName{metadataElement->
Name};
717 size_t MDElemsSize = metadataElement->
ValSize -
sizeof(std::uint64_t);
720 assert(MDElemsSize >=
sizeof(std::uint32_t) &&
721 MDElemsSize <=
sizeof(std::uint32_t) * 3 &&
722 "Unexpected size for reqd_work_group_size metadata");
725 const char *ValuePtr =
726 reinterpret_cast<const char *
>(metadataElement->
ValAddr) +
727 sizeof(std::uint64_t);
729 std::uint32_t reqdWorkGroupElements[] = {1, 1, 1};
730 std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize);
733 reqdWorkGroupElements[2]);
735 const char *metadataValPtr =
736 reinterpret_cast<const char *
>(metadataElement->
ValAddr) +
737 sizeof(std::uint64_t);
738 const char *metadataValPtrEnd =
739 metadataValPtr + metadataElement->
ValSize -
sizeof(std::uint64_t);
740 globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd};
748 "Re-setting program binary data which has already been set");
758 constexpr
const unsigned int numberOfOptions = 4u;
760 CUjit_option options[numberOfOptions];
761 void *optionVals[numberOfOptions];
764 options[0] = CU_JIT_INFO_LOG_BUFFER;
767 options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
770 options[2] = CU_JIT_ERROR_LOG_BUFFER;
773 options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
776 auto result = PI_CHECK_ERROR(
778 numberOfOptions, options, optionVals));
780 const auto success = (result == PI_SUCCESS);
786 return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
805 size_t param_value_size,
void *param_value,
806 size_t *param_value_size_ret);
820 static std::once_flag initFlag;
822 static std::vector<_pi_platform> platformIds;
824 if (num_entries == 0 && platforms !=
nullptr) {
825 return PI_ERROR_INVALID_VALUE;
827 if (platforms ==
nullptr && num_platforms ==
nullptr) {
828 return PI_ERROR_INVALID_VALUE;
836 if (cuInit(0) != CUDA_SUCCESS) {
841 err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
842 if (numDevices == 0) {
848 numPlatforms = numDevices;
849 platformIds.resize(numDevices);
851 for (
int i = 0; i < numDevices; ++i) {
853 err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
855 err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device));
857 ScopedContext active(context);
859 err = PI_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT));
862 err = PI_CHECK_ERROR(cuEventRecord(evBase, 0));
864 platformIds[i].devices_.emplace_back(
865 new _pi_device{device, context, evBase, &platformIds[i]});
868 const auto &dev = platformIds[i].devices_.back().get();
869 size_t maxWorkGroupSize = 0u;
870 size_t maxThreadsPerBlock[3] = {};
873 sizeof(maxThreadsPerBlock), maxThreadsPerBlock,
nullptr);
874 assert(retError == PI_SUCCESS);
879 sizeof(maxWorkGroupSize), &maxWorkGroupSize,
nullptr);
880 assert(retError == PI_SUCCESS);
882 dev->save_max_work_item_sizes(
sizeof(maxThreadsPerBlock),
884 dev->save_max_work_group_size(maxWorkGroupSize);
887 }
catch (
const std::bad_alloc &) {
889 for (
int i = 0; i < numDevices; ++i) {
890 platformIds[i].devices_.clear();
893 err = PI_ERROR_OUT_OF_HOST_MEMORY;
896 for (
int i = 0; i < numDevices; ++i) {
897 platformIds[i].devices_.clear();
905 if (num_platforms !=
nullptr) {
906 *num_platforms = numPlatforms;
909 if (platforms !=
nullptr) {
910 for (
unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
911 platforms[i] = &platformIds[i];
919 return PI_ERROR_OUT_OF_RESOURCES;
925 size_t param_value_size,
void *param_value,
926 size_t *param_value_size_ret) {
927 assert(platform !=
nullptr);
929 switch (param_name) {
931 return getInfo(param_value_size, param_value, param_value_size_ret,
932 "NVIDIA CUDA BACKEND");
934 return getInfo(param_value_size, param_value, param_value_size_ret,
935 "NVIDIA Corporation");
937 return getInfo(param_value_size, param_value, param_value_size_ret,
940 auto version = getCudaVersionString();
941 return getInfo(param_value_size, param_value, param_value_size_ret,
945 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
966 const bool returnDevices = askingForDefault || askingForGPU;
968 size_t numDevices = returnDevices ? platform->devices_.size() : 0;
972 *num_devices = numDevices;
975 if (returnDevices && devices) {
976 for (
size_t i = 0; i < std::min(
size_t(num_entries), numDevices); ++i) {
977 devices[i] = platform->devices_[i].get();
985 return PI_ERROR_OUT_OF_RESOURCES;
994 size_t param_value_size,
void *param_value,
995 size_t *param_value_size_ret) {
997 switch (param_name) {
999 return getInfo(param_value_size, param_value, param_value_size_ret, 1);
1001 return getInfo(param_value_size, param_value, param_value_size_ret,
1004 return getInfo(param_value_size, param_value, param_value_size_ret,
1013 PI_ERROR_INVALID_ARG_VALUE);
1014 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
1017 return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1022 return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1028 return PI_ERROR_OUT_OF_RESOURCES;
1032 assert(context !=
nullptr);
1064 if (num_binaries < 1) {
1070 for (
pi_uint32 i = 0; i < num_binaries; i++) {
1071 if (strcmp(binaries[i]->DeviceTargetSpec,
1073 *selected_binary = i;
1079 return PI_ERROR_INVALID_BINARY;
1084 const char *func_name,
1088 assert(func_pointer_ret !=
nullptr);
1091 CUresult ret = cuModuleGetFunction(&func, program->
get(), func_name);
1092 *func_pointer_ret =
reinterpret_cast<pi_uint64>(func);
1095 if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND)
1096 retError = PI_CHECK_ERROR(ret);
1097 if (ret == CUDA_ERROR_NOT_FOUND) {
1098 *func_pointer_ret = 0;
1099 retError = PI_ERROR_INVALID_KERNEL_NAME;
1110 size_t param_value_size,
void *param_value,
1111 size_t *param_value_size_ret) {
1113 static constexpr
pi_uint32 max_work_item_dimensions = 3u;
1115 assert(device !=
nullptr);
1117 ScopedContext active(device->get_context());
1119 switch (param_name) {
1121 return getInfo(param_value_size, param_value, param_value_size_ret,
1125 return getInfo(param_value_size, param_value, param_value_size_ret, 4318u);
1128 int compute_units = 0;
1130 cuDeviceGetAttribute(&compute_units,
1131 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1132 device->get()) == CUDA_SUCCESS);
1134 return getInfo(param_value_size, param_value, param_value_size_ret,
1138 return getInfo(param_value_size, param_value, param_value_size_ret,
1139 max_work_item_dimensions);
1142 size_t return_sizes[max_work_item_dimensions];
1144 int max_x = 0, max_y = 0, max_z = 0;
1146 cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
1147 device->get()) == CUDA_SUCCESS);
1151 cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
1152 device->get()) == CUDA_SUCCESS);
1156 cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
1157 device->get()) == CUDA_SUCCESS);
1160 return_sizes[0] = size_t(max_x);
1161 return_sizes[1] = size_t(max_y);
1162 return_sizes[2] = size_t(max_z);
1163 return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1164 param_value_size_ret, return_sizes);
1168 size_t return_sizes[max_work_item_dimensions];
1169 int max_x = 0, max_y = 0, max_z = 0;
1171 cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
1172 device->get()) == CUDA_SUCCESS);
1176 cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
1177 device->get()) == CUDA_SUCCESS);
1181 cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
1182 device->get()) == CUDA_SUCCESS);
1185 return_sizes[0] = size_t(max_x);
1186 return_sizes[1] = size_t(max_y);
1187 return_sizes[2] = size_t(max_z);
1188 return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1189 param_value_size_ret, return_sizes);
1193 int max_work_group_size = 0;
1195 cuDeviceGetAttribute(&max_work_group_size,
1196 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1197 device->get()) == CUDA_SUCCESS);
1201 return getInfo(param_value_size, param_value, param_value_size_ret,
1202 size_t(max_work_group_size));
1205 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1208 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1211 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1214 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1217 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1220 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1223 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1226 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1229 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1232 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1235 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1238 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1241 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1244 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1248 int max_threads = 0;
1250 cuDeviceGetAttribute(&max_threads,
1251 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1252 device->get()) == CUDA_SUCCESS);
1255 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
1256 device->get()) == CUDA_SUCCESS);
1257 int maxWarps = (max_threads + warpSize - 1) / warpSize;
1258 return getInfo(param_value_size, param_value, param_value_size_ret,
1259 static_cast<uint32_t
>(maxWarps));
1266 cuDeviceGetAttribute(&major,
1267 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1268 device->get()) == CUDA_SUCCESS);
1269 bool ifp = (major >= 7);
1270 return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
1276 cuDeviceGetAttribute(&major,
1277 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1278 device->get()) == CUDA_SUCCESS);
1280 bool atomic64 = (major >= 6) ?
true :
false;
1281 return getInfo(param_value_size, param_value, param_value_size_ret,
1288 return getInfo(param_value_size, param_value, param_value_size_ret,
1294 cuDeviceGetAttribute(&major,
1295 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1296 device->get()) == CUDA_SUCCESS);
1303 return getInfo(param_value_size, param_value, param_value_size_ret,
1310 PI_ERROR_INVALID_ARG_VALUE);
1311 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
1315 cuDeviceGetAttribute(&major,
1316 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1317 device->get()) == CUDA_SUCCESS);
1319 bool bfloat16 = (major >= 8) ?
true :
false;
1320 return getInfo(param_value_size, param_value, param_value_size_ret,
1327 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
1328 device->get()) == CUDA_SUCCESS);
1329 size_t sizes[1] = {
static_cast<size_t>(warpSize)};
1330 return getInfoArray<size_t>(1, param_value_size, param_value,
1331 param_value_size_ret, sizes);
1336 cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
1337 device->get()) == CUDA_SUCCESS);
1339 return getInfo(param_value_size, param_value, param_value_size_ret,
1343 auto bits =
pi_uint32{std::numeric_limits<uintptr_t>::digits};
1344 return getInfo(param_value_size, param_value, param_value_size_ret, bits);
1357 auto quarter_global =
static_cast<pi_uint32>(global / 4u);
1359 auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
1360 32u * 1024u * 1024u);
1362 return getInfo(param_value_size, param_value, param_value_size_ret,
1368 if (std::getenv(
"SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") !=
nullptr) {
1372 "Images are not fully supported by the CUDA BE, their support is "
1373 "disabled by default. Their partial support can be activated by "
1374 "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at "
1378 return getInfo(param_value_size, param_value, param_value_size_ret,
1385 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1391 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1397 cuDeviceGetAttribute(&tex_height,
1398 CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT,
1399 device->get()) == CUDA_SUCCESS);
1401 int surf_height = 0;
1403 cuDeviceGetAttribute(&surf_height,
1404 CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT,
1405 device->get()) == CUDA_SUCCESS);
1408 int min = std::min(tex_height, surf_height);
1410 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1416 cuDeviceGetAttribute(&tex_width,
1417 CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH,
1418 device->get()) == CUDA_SUCCESS);
1422 cuDeviceGetAttribute(&surf_width,
1423 CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH,
1424 device->get()) == CUDA_SUCCESS);
1427 int min = std::min(tex_width, surf_width);
1429 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1435 cuDeviceGetAttribute(&tex_height,
1436 CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT,
1437 device->get()) == CUDA_SUCCESS);
1439 int surf_height = 0;
1441 cuDeviceGetAttribute(&surf_height,
1442 CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT,
1443 device->get()) == CUDA_SUCCESS);
1446 int min = std::min(tex_height, surf_height);
1448 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1454 cuDeviceGetAttribute(&tex_width,
1455 CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH,
1456 device->get()) == CUDA_SUCCESS);
1460 cuDeviceGetAttribute(&surf_width,
1461 CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH,
1462 device->get()) == CUDA_SUCCESS);
1465 int min = std::min(tex_width, surf_width);
1467 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1473 cuDeviceGetAttribute(&tex_depth,
1474 CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH,
1475 device->get()) == CUDA_SUCCESS);
1479 cuDeviceGetAttribute(&surf_depth,
1480 CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH,
1481 device->get()) == CUDA_SUCCESS);
1484 int min = std::min(tex_depth, surf_depth);
1486 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1492 cuDeviceGetAttribute(&tex_width,
1493 CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH,
1494 device->get()) == CUDA_SUCCESS);
1498 cuDeviceGetAttribute(&surf_width,
1499 CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH,
1500 device->get()) == CUDA_SUCCESS);
1503 int min = std::min(tex_width, surf_width);
1505 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1508 return getInfo(param_value_size, param_value, param_value_size_ret,
1514 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1520 return getInfo(param_value_size, param_value, param_value_size_ret,
1524 int mem_base_addr_align = 0;
1526 cuDeviceGetAttribute(&mem_base_addr_align,
1527 CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
1528 device->get()) == CUDA_SUCCESS);
1530 mem_base_addr_align *= 8;
1531 return getInfo(param_value_size, param_value, param_value_size_ret,
1532 mem_base_addr_align);
1536 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1543 return getInfo(param_value_size, param_value, param_value_size_ret, config);
1549 return getInfo(param_value_size, param_value, param_value_size_ret, config);
1553 return getInfo(param_value_size, param_value, param_value_size_ret,
1559 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1564 cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1565 device->get()) == CUDA_SUCCESS);
1568 return getInfo(param_value_size, param_value, param_value_size_ret,
1576 return getInfo(param_value_size, param_value, param_value_size_ret,
1580 int constant_memory = 0;
1582 cuDeviceGetAttribute(&constant_memory,
1583 CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
1584 device->get()) == CUDA_SUCCESS);
1587 return getInfo(param_value_size, param_value, param_value_size_ret,
1594 return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
1597 return getInfo(param_value_size, param_value, param_value_size_ret,
1604 int local_mem_size = 0;
1606 cuDeviceGetAttribute(&local_mem_size,
1607 CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
1608 device->get()) == CUDA_SUCCESS);
1610 return getInfo(param_value_size, param_value, param_value_size_ret,
1614 int ecc_enabled = 0;
1616 cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
1617 device->get()) == CUDA_SUCCESS);
1620 auto result =
static_cast<pi_bool>(ecc_enabled);
1621 return getInfo(param_value_size, param_value, param_value_size_ret, result);
1624 int is_integrated = 0;
1626 cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED,
1627 device->get()) == CUDA_SUCCESS);
1630 auto result =
static_cast<pi_bool>(is_integrated);
1631 return getInfo(param_value_size, param_value, param_value_size_ret, result);
1636 return getInfo(param_value_size, param_value, param_value_size_ret,
1640 return getInfo(param_value_size, param_value, param_value_size_ret,
1644 return getInfo(param_value_size, param_value, param_value_size_ret,
1648 return getInfo(param_value_size, param_value, param_value_size_ret,
1652 return getInfo(param_value_size, param_value, param_value_size_ret,
1656 return getInfo(param_value_size, param_value, param_value_size_ret,
1661 return getInfo(param_value_size, param_value, param_value_size_ret,
1668 return getInfo(param_value_size, param_value, param_value_size_ret,
1674 return getInfo(param_value_size, param_value, param_value_size_ret,
1680 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
1683 return getInfo(param_value_size, param_value, param_value_size_ret,
1684 device->get_platform());
1687 static constexpr
size_t MAX_DEVICE_NAME_LENGTH = 256u;
1688 char name[MAX_DEVICE_NAME_LENGTH];
1690 device->get()) == CUDA_SUCCESS);
1691 return getInfoArray(strlen(name) + 1, param_value_size, param_value,
1692 param_value_size_ret, name);
1695 return getInfo(param_value_size, param_value, param_value_size_ret,
1696 "NVIDIA Corporation");
1699 auto version = getCudaVersionString();
1700 return getInfo(param_value_size, param_value, param_value_size_ret,
1704 return getInfo(param_value_size, param_value, param_value_size_ret,
"CUDA");
1707 return getInfo(param_value_size, param_value, param_value_size_ret,
1708 device->get_reference_count());
1711 return getInfo(param_value_size, param_value, param_value_size_ret,
1715 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
1719 std::string SupportedExtensions =
"cl_khr_fp64 ";
1721 SupportedExtensions +=
" ";
1727 cuDeviceGetAttribute(&major,
1728 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1729 device->get()) == CUDA_SUCCESS);
1731 cuDeviceGetAttribute(&minor,
1732 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
1733 device->get()) == CUDA_SUCCESS);
1735 if ((major >= 6) || ((major == 5) && (minor >= 3))) {
1736 SupportedExtensions +=
"cl_khr_fp16 ";
1739 return getInfo(param_value_size, param_value, param_value_size_ret,
1740 SupportedExtensions.c_str());
1744 return getInfo(param_value_size, param_value, param_value_size_ret,
1748 return getInfo(param_value_size, param_value, param_value_size_ret,
1752 return getInfo(param_value_size, param_value, param_value_size_ret,
1756 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1759 return getInfo(param_value_size, param_value, param_value_size_ret,
1763 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1766 return getInfo(param_value_size, param_value, param_value_size_ret,
1779 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
1781 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
1794 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1805 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1814 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
1818 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
1822 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
1829 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1841 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
1845 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
1851 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
1860 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1870 if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
1873 if (getAttribute(device,
1874 CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
1885 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1889 getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
1890 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1894 getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
1896 getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
1897 std::string result = std::to_string(major) +
"." + std::to_string(minor);
1898 return getInfo(param_value_size, param_value, param_value_size_ret,
1903 size_t FreeMemory = 0;
1904 size_t TotalMemory = 0;
1907 "failed cuMemGetInfo() API.");
1908 return getInfo(param_value_size, param_value, param_value_size_ret,
1914 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
1915 device->get()) == CUDA_SUCCESS);
1918 return getInfo(param_value_size, param_value, param_value_size_ret,
1924 cuDeviceGetAttribute(&value,
1925 CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
1926 device->get()) == CUDA_SUCCESS);
1928 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1931 return getInfo(param_value_size, param_value, param_value_size_ret,
1938 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
1939 device->get()) == CUDA_SUCCESS);
1941 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1946 #if (CUDA_VERSION >= 11040)
1953 std::array<unsigned char, 16> name;
1954 std::copy(uuid.bytes, uuid.bytes + 16, name.begin());
1955 return getInfoArray(16, param_value_size, param_value, param_value_size_ret,
1962 cuDeviceGetAttribute(&major,
1963 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1964 device->get()) == CUDA_SUCCESS);
1968 cuDeviceGetAttribute(&minor,
1969 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
1970 device->get()) == CUDA_SUCCESS);
1974 bool is_xavier_agx = major == 7 && minor == 2;
1975 bool is_orin_agx = major == 8 && minor == 7;
1977 int memory_clock_khz = 0;
1978 if (is_xavier_agx) {
1979 memory_clock_khz = 2133000;
1980 }
else if (is_orin_agx) {
1981 memory_clock_khz = 3200000;
1984 cuDeviceGetAttribute(&memory_clock_khz,
1985 CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
1986 device->get()) == CUDA_SUCCESS);
1989 int memory_bus_width = 0;
1991 memory_bus_width = 256;
1994 cuDeviceGetAttribute(&memory_bus_width,
1995 CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
1996 device->get()) == CUDA_SUCCESS);
1999 uint64_t memory_bandwidth =
2000 uint64_t(memory_clock_khz) * memory_bus_width * 250;
2002 return getInfo(param_value_size, param_value, param_value_size_ret,
2014 return PI_ERROR_INVALID_VALUE;
2046 assert(piDevice !=
nullptr);
2050 auto is_device = [=](std::unique_ptr<_pi_device> &dev) {
2051 return dev->get() == cu_device;
2056 auto search_res = std::find_if(begin(platform->devices_),
2057 end(platform->devices_), is_device);
2058 if (search_res != end(platform->devices_)) {
2059 *piDevice = (*search_res).
get();
2067 if (result != PI_SUCCESS)
2073 if (result != PI_SUCCESS)
2077 for (
pi_uint32 j = 0; j < num_platforms; ++j) {
2078 auto search_res = std::find_if(begin(plat[j]->devices_),
2079 end(plat[j]->devices_), is_device);
2080 if (search_res != end(plat[j]->devices_)) {
2081 *piDevice = (*search_res).
get();
2088 return PI_ERROR_INVALID_OPERATION;
2114 const void *private_info,
2115 size_t cb,
void *user_data),
2118 assert(devices !=
nullptr);
2121 assert(user_data ==
nullptr);
2122 assert(num_devices == 1);
2124 assert(retcontext !=
nullptr);
2127 std::unique_ptr<_pi_context> piContextPtr{
nullptr};
2129 piContextPtr = std::unique_ptr<_pi_context>(
new _pi_context{*devices});
2130 *retcontext = piContextPtr.release();
2134 errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
2140 assert(ctxt !=
nullptr);
2147 std::unique_ptr<_pi_context> context{ctxt};
2174 bool ownNativeHandle,
2179 (void)ownNativeHandle;
2181 assert(piContext !=
nullptr);
2182 assert(ownNativeHandle ==
false);
2184 return PI_ERROR_INVALID_OPERATION;
2195 assert(ret_mem !=
nullptr);
2196 assert((properties ==
nullptr || *properties == 0) &&
2197 "no mem properties goes to cuda RT yet");
2201 const bool enableUseHostPtr =
false;
2202 const bool performInitialCopy =
2206 pi_mem retMemObj =
nullptr;
2209 ScopedContext active(context);
2215 retErr = PI_CHECK_ERROR(
2216 cuMemHostRegister(
host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
2217 retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr,
host_ptr, 0));
2220 retErr = PI_CHECK_ERROR(cuMemAllocHost(&
host_ptr, size));
2221 retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr,
host_ptr, 0));
2224 retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size));
2230 if (retErr == PI_SUCCESS) {
2231 pi_mem parentBuffer =
nullptr;
2233 auto piMemObj = std::unique_ptr<_pi_mem>(
2235 if (piMemObj !=
nullptr) {
2236 retMemObj = piMemObj.release();
2237 if (performInitialCopy) {
2239 retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr,
host_ptr, size));
2243 if (retErr == PI_SUCCESS) {
2245 retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream));
2249 retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2255 retErr = PI_ERROR_OUT_OF_RESOURCES;
2258 *ret_mem = retMemObj;
2268 assert((memObj !=
nullptr) &&
"PI_ERROR_INVALID_MEM_OBJECTS");
2280 std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2286 ScopedContext active(uniqueMemObj->get_context());
2289 switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2292 ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_));
2295 ret = PI_CHECK_ERROR(
2296 cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2299 ret = PI_CHECK_ERROR(
2300 cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2303 ret = PI_CHECK_ERROR(
2304 cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface()));
2305 ret = PI_CHECK_ERROR(
2306 cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array()));
2312 ret = PI_ERROR_OUT_OF_RESOURCES;
2315 if (ret != PI_SUCCESS) {
2321 "Unrecoverable program state reached in cuda_piMemRelease");
2333 void *buffer_create_info,
pi_mem *memObj) {
2334 assert((parent_buffer !=
nullptr) &&
"PI_ERROR_INVALID_MEM_OBJECT");
2335 assert(parent_buffer->
is_buffer() &&
"PI_ERROR_INVALID_MEM_OBJECTS");
2336 assert(!parent_buffer->
is_sub_buffer() &&
"PI_ERROR_INVALID_MEM_OBJECT");
2345 "PI_ERROR_INVALID_VALUE");
2346 assert((buffer_create_info !=
nullptr) &&
"PI_ERROR_INVALID_VALUE");
2347 assert(memObj !=
nullptr);
2349 const auto bufferRegion =
2351 assert((bufferRegion.size != 0u) &&
"PI_ERROR_INVALID_BUFFER_SIZE");
2353 assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2355 assert(((bufferRegion.origin + bufferRegion.size) <=
2357 "PI_ERROR_INVALID_BUFFER_SIZE");
2368 void *hostPtr =
nullptr;
2371 bufferRegion.origin;
2374 std::unique_ptr<_pi_mem> retMemObj{
nullptr};
2376 retMemObj = std::unique_ptr<_pi_mem>{
new _pi_mem{
2377 context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2383 return PI_ERROR_OUT_OF_HOST_MEMORY;
2386 *memObj = retMemObj.release();
2419 bool ownNativeHandle,
2422 "Creation of PI mem from native handle not implemented");
2435 std::unique_ptr<_pi_queue> queueImpl{
nullptr};
2439 return PI_ERROR_INVALID_DEVICE;
2442 unsigned int flags = 0;
2444 flags = CU_STREAM_DEFAULT;
2448 flags = CU_STREAM_NON_BLOCKING;
2451 const bool is_out_of_order =
2454 std::vector<CUstream> computeCuStreams(
2456 std::vector<CUstream> transferCuStreams(
2459 queueImpl = std::unique_ptr<_pi_queue>(
2460 new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams),
2461 context, device, properties, flags});
2463 *queue = queueImpl.release();
2472 return PI_ERROR_OUT_OF_RESOURCES;
2482 return PI_ERROR_INVALID_VALUE;
2485 assert(Properties[2] == 0);
2486 if (Properties[2] != 0)
2487 return PI_ERROR_INVALID_VALUE;
2492 size_t param_value_size,
void *param_value,
2493 size_t *param_value_size_ret) {
2494 assert(command_queue !=
nullptr);
2496 switch (param_name) {
2498 return getInfo(param_value_size, param_value, param_value_size_ret,
2501 return getInfo(param_value_size, param_value, param_value_size_ret,
2504 return getInfo(param_value_size, param_value, param_value_size_ret,
2507 return getInfo(param_value_size, param_value, param_value_size_ret,
2512 const CUresult ret = cuStreamQuery(
s);
2513 if (ret == CUDA_SUCCESS)
2516 if (ret == CUDA_ERROR_NOT_READY)
2519 PI_CHECK_ERROR(ret);
2522 return getInfo(param_value_size, param_value, param_value_size_ret,
2527 return PI_ERROR_OUT_OF_RESOURCES;
2538 assert(command_queue !=
nullptr);
2546 assert(command_queue !=
nullptr);
2553 std::unique_ptr<_pi_queue> queueImpl(command_queue);
2558 ScopedContext active(command_queue->
get_context());
2561 PI_CHECK_ERROR(cuStreamSynchronize(
s));
2562 PI_CHECK_ERROR(cuStreamDestroy(
s));
2569 return PI_ERROR_OUT_OF_RESOURCES;
2578 assert(command_queue !=
2580 ScopedContext active(command_queue->
get_context());
2583 result = PI_CHECK_ERROR(cuStreamSynchronize(
s));
2592 result = PI_ERROR_OUT_OF_RESOURCES;
2602 (void)command_queue;
2633 bool ownNativeHandle,
2636 (void)ownNativeHandle;
2637 assert(ownNativeHandle ==
false);
2642 auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));
2645 if (flags == CU_STREAM_DEFAULT)
2647 else if (flags == CU_STREAM_NON_BLOCKING)
2652 std::vector<CUstream> computeCuStreams(1, cuStream);
2653 std::vector<CUstream> transferCuStreams(0);
2657 *queue =
new _pi_queue{std::move(computeCuStreams),
2658 std::move(transferCuStreams),
2664 (*queue)->num_compute_streams_ = 1;
2670 pi_bool blocking_write,
size_t offset,
2671 size_t size,
const void *ptr,
2676 assert(buffer !=
nullptr);
2677 assert(command_queue !=
nullptr);
2680 std::unique_ptr<_pi_event> retImplEv{
nullptr};
2683 ScopedContext active(command_queue->
get_context());
2686 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2696 PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream));
2699 retErr = retImplEv->record();
2702 if (blocking_write) {
2703 retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2707 *
event = retImplEv.release();
2716 pi_bool blocking_read,
size_t offset,
2717 size_t size,
void *ptr,
2722 assert(buffer !=
nullptr);
2723 assert(command_queue !=
nullptr);
2726 std::unique_ptr<_pi_event> retImplEv{
nullptr};
2729 ScopedContext active(command_queue->
get_context());
2732 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2742 PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream));
2745 retErr = retImplEv->record();
2748 if (blocking_read) {
2749 retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2753 *
event = retImplEv.release();
2765 assert(num_events != 0);
2767 if (num_events == 0) {
2768 return PI_ERROR_INVALID_VALUE;
2772 return PI_ERROR_INVALID_EVENT;
2776 ScopedContext active(context);
2780 return PI_ERROR_INVALID_EVENT;
2783 if (event->get_context() != context) {
2784 return PI_ERROR_INVALID_CONTEXT;
2787 return event->wait();
2789 return forLatestEvents(event_list, num_events, waitFunc);
2793 return PI_ERROR_OUT_OF_RESOURCES;
2799 assert(kernel !=
nullptr);
2800 assert(program !=
nullptr);
2803 std::unique_ptr<_pi_kernel> retKernel{
nullptr};
2809 retErr = PI_CHECK_ERROR(
2810 cuModuleGetFunction(&cuFunc, program->
get(), kernel_name));
2812 std::string kernel_name_woffset = std::string(kernel_name) +
"_with_offset";
2813 CUfunction cuFuncWithOffsetParam;
2814 CUresult offsetRes = cuModuleGetFunction(
2815 &cuFuncWithOffsetParam, program->
get(), kernel_name_woffset.c_str());
2818 if (offsetRes == CUDA_ERROR_NOT_FOUND) {
2819 cuFuncWithOffsetParam =
nullptr;
2821 retErr = PI_CHECK_ERROR(offsetRes);
2824 retKernel = std::unique_ptr<_pi_kernel>(
2825 new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program,
2830 retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2833 *kernel = retKernel.release();
2838 size_t arg_size,
const void *arg_value) {
2840 assert(kernel !=
nullptr);
2844 kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2846 kernel->set_kernel_local_arg(arg_index, arg_size);
2855 const pi_mem *arg_value) {
2857 assert(kernel !=
nullptr);
2858 assert(arg_value !=
nullptr);
2862 pi_mem arg_mem = *arg_value;
2864 CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
2865 PI_CHECK_ERROR(cuArray3DGetDescriptor(
2867 if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
2868 arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
2869 arrayDesc.Format != CU_AD_FORMAT_HALF &&
2870 arrayDesc.Format != CU_AD_FORMAT_FLOAT) {
2872 "types int32, uint32, float, and half.",
2873 PI_ERROR_PLUGIN_SPECIFIC_ERROR);
2874 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
2877 kernel->set_kernel_arg(arg_index,
sizeof(cuSurf), (
void *)&cuSurf);
2880 kernel->set_kernel_arg(arg_index,
sizeof(
CUdeviceptr), (
void *)&cuPtr);
2891 assert(kernel !=
nullptr);
2892 assert(arg_value !=
nullptr);
2896 pi_uint32 samplerProps = (*arg_value)->props_;
2897 kernel->set_kernel_arg(arg_index,
sizeof(
pi_uint32), (
void *)&samplerProps);
2906 size_t param_value_size,
void *param_value,
2907 size_t *param_value_size_ret) {
2911 if (kernel !=
nullptr) {
2913 switch (param_name) {
2915 int max_threads = 0;
2917 cuFuncGetAttribute(&max_threads,
2918 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
2919 kernel->get()) == CUDA_SUCCESS);
2920 return getInfo(param_value_size, param_value, param_value_size_ret,
2921 size_t(max_threads));
2924 size_t group_size[3] = {0, 0, 0};
2925 const auto &reqd_wg_size_md_map =
2926 kernel->program_->kernelReqdWorkGroupSizeMD_;
2927 const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
2928 if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
2929 const auto reqd_wg_size = reqd_wg_size_md->second;
2930 group_size[0] = std::get<0>(reqd_wg_size);
2931 group_size[1] = std::get<1>(reqd_wg_size);
2932 group_size[2] = std::get<2>(reqd_wg_size);
2935 param_value_size_ret, group_size);
2941 cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
2942 kernel->get()) == CUDA_SUCCESS);
2943 return getInfo(param_value_size, param_value, param_value_size_ret,
2950 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
2951 device->get()) == CUDA_SUCCESS);
2952 return getInfo(param_value_size, param_value, param_value_size_ret,
2953 static_cast<size_t>(warpSize));
2959 cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
2960 kernel->get()) == CUDA_SUCCESS);
2961 return getInfo(param_value_size, param_value, param_value_size_ret,
2967 cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
2968 kernel->get()) == CUDA_SUCCESS);
2969 return getInfo(param_value_size, param_value, param_value_size_ret,
2977 return PI_ERROR_INVALID_KERNEL;
2982 const size_t *global_work_offset,
const size_t *global_work_size,
2983 const size_t *local_work_size,
pi_uint32 num_events_in_wait_list,
2987 assert(command_queue !=
nullptr);
2988 assert(command_queue->
get_context() == kernel->get_context());
2989 assert(kernel !=
nullptr);
2990 assert(global_work_offset !=
nullptr);
2991 assert(work_dim > 0);
2992 assert(work_dim < 4);
2994 if (*global_work_size == 0) {
2996 command_queue, num_events_in_wait_list, event_wait_list, event);
3001 size_t threadsPerBlock[3] = {32u, 1u, 1u};
3002 size_t maxWorkGroupSize = 0u;
3003 size_t maxThreadsPerBlock[3] = {};
3004 bool providedLocalWorkGroupSize = (local_work_size !=
nullptr);
3005 pi_uint32 local_size = kernel->get_local_size();
3010 ScopedContext active(command_queue->
get_context());
3012 size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_;
3015 sizeof(maxThreadsPerBlock), maxThreadsPerBlock);
3017 if (providedLocalWorkGroupSize) {
3018 auto isValid = [&](
int dim) {
3019 if (reqdThreadsPerBlock[dim] != 0 &&
3020 local_work_size[dim] != reqdThreadsPerBlock[dim])
3021 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3023 if (local_work_size[dim] > maxThreadsPerBlock[dim])
3024 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3028 if (0u == local_work_size[dim])
3029 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3030 if (0u != (global_work_size[dim] % local_work_size[dim]))
3031 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3032 threadsPerBlock[dim] = local_work_size[dim];
3036 for (
size_t dim = 0; dim < work_dim; dim++) {
3037 auto err = isValid(dim);
3038 if (err != PI_SUCCESS)
3042 guessLocalWorkSize(command_queue->
device_, threadsPerBlock,
3043 global_work_size, maxThreadsPerBlock, kernel,
3048 if (maxWorkGroupSize <
3049 size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
3050 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3053 size_t blocksPerGrid[3] = {1u, 1u, 1u};
3055 for (
size_t i = 0; i < work_dim; i++) {
3057 (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
3060 std::unique_ptr<_pi_event> retImplEv{
nullptr};
3065 num_events_in_wait_list, event_wait_list, guard, &stream_token);
3066 CUfunction cuFunc = kernel->get();
3068 retError = enqueueEventsWait(command_queue, cuStream,
3069 num_events_in_wait_list, event_wait_list);
3072 if (kernel->get_with_offset_parameter()) {
3073 std::uint32_t cuda_implicit_offset[3] = {0, 0, 0};
3074 if (global_work_offset) {
3075 for (
size_t i = 0; i < work_dim; i++) {
3076 cuda_implicit_offset[i] =
3077 static_cast<std::uint32_t
>(global_work_offset[i]);
3078 if (global_work_offset[i] != 0) {
3079 cuFunc = kernel->get_with_offset_parameter();
3083 kernel->set_implicit_offset_arg(
sizeof(cuda_implicit_offset),
3084 cuda_implicit_offset);
3087 auto &argIndices = kernel->get_arg_indices();
3090 retImplEv = std::unique_ptr<_pi_event>(
3092 cuStream, stream_token));
3097 static const char *local_mem_sz_ptr =
3098 std::getenv(
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE");
3100 if (local_mem_sz_ptr) {
3101 int device_max_local_mem = 0;
3102 cuDeviceGetAttribute(
3103 &device_max_local_mem,
3104 CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
3107 static const int env_val = std::atoi(local_mem_sz_ptr);
3108 if (env_val <= 0 || env_val > device_max_local_mem) {
3110 "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
3111 PI_ERROR_PLUGIN_SPECIFIC_ERROR);
3112 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3114 PI_CHECK_ERROR(cuFuncSetAttribute(
3115 cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val));
3118 retError = PI_CHECK_ERROR(cuLaunchKernel(
3119 cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
3120 threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size,
3121 cuStream,
const_cast<void **
>(argIndices.data()),
nullptr));
3122 if (local_size != 0)
3123 kernel->clear_local_size();
3126 retError = retImplEv->record();
3127 *
event = retImplEv.release();
3156 assert(ret_mem !=
nullptr);
3166 "cuda_piMemImageCreate only supports RGBA channel order");
3172 CUDA_ARRAY3D_DESCRIPTOR array_desc;
3173 array_desc.NumChannels = 4;
3174 array_desc.Flags = 0;
3177 array_desc.Height = 0;
3178 array_desc.Depth = 0;
3181 array_desc.Depth = 0;
3188 size_t pixel_type_size_bytes;
3193 array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
3194 pixel_type_size_bytes = 1;
3197 array_desc.Format = CU_AD_FORMAT_SIGNED_INT8;
3198 pixel_type_size_bytes = 1;
3202 array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
3203 pixel_type_size_bytes = 2;
3206 array_desc.Format = CU_AD_FORMAT_SIGNED_INT16;
3207 pixel_type_size_bytes = 2;
3210 array_desc.Format = CU_AD_FORMAT_HALF;
3211 pixel_type_size_bytes = 2;
3214 array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
3215 pixel_type_size_bytes = 4;
3218 array_desc.Format = CU_AD_FORMAT_SIGNED_INT32;
3219 pixel_type_size_bytes = 4;
3222 array_desc.Format = CU_AD_FORMAT_FLOAT;
3223 pixel_type_size_bytes = 4;
3227 "cuda_piMemImageCreate given unsupported image_channel_data_type");
3231 size_t pixel_size_bytes =
3232 pixel_type_size_bytes * 4;
3233 size_t image_size_bytes = pixel_size_bytes * image_desc->
image_width *
3236 ScopedContext active(context);
3237 CUarray image_array;
3238 retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc));
3241 if (performInitialCopy) {
3244 retErr = PI_CHECK_ERROR(
3245 cuMemcpyHtoA(image_array, 0,
host_ptr, image_size_bytes));
3247 CUDA_MEMCPY2D cpy_desc;
3248 memset(&cpy_desc, 0,
sizeof(cpy_desc));
3249 cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3251 cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3252 cpy_desc.dstArray = image_array;
3253 cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->
image_width;
3255 retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc));
3257 CUDA_MEMCPY3D cpy_desc;
3258 memset(&cpy_desc, 0,
sizeof(cpy_desc));
3259 cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3261 cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3262 cpy_desc.dstArray = image_array;
3263 cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->
image_width;
3266 retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc));
3279 CUDA_RESOURCE_DESC image_res_desc;
3280 image_res_desc.res.array.hArray = image_array;
3281 image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
3282 image_res_desc.flags = 0;
3284 CUsurfObject surface;
3285 retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
3287 auto piMemObj = std::unique_ptr<_pi_mem>(
new _pi_mem{
3290 if (piMemObj ==
nullptr) {
3291 return PI_ERROR_OUT_OF_HOST_MEMORY;
3294 *ret_mem = piMemObj.release();
3296 cuArrayDestroy(image_array);
3299 cuArrayDestroy(image_array);
3300 return PI_ERROR_UNKNOWN;
3314 assert(mem !=
nullptr);
3326 return PI_ERROR_INVALID_OPERATION;
3334 const pi_device *device_list,
const char *options,
3339 assert(program !=
nullptr);
3340 assert(num_devices == 1 || num_devices == 0);
3341 assert(device_list !=
nullptr || num_devices == 0);
3343 assert(user_data ==
nullptr);
3371 const size_t *lengths,
const unsigned char **binaries,
3375 (void)binary_status;
3377 assert(context !=
nullptr);
3378 assert(binaries !=
nullptr);
3379 assert(program !=
nullptr);
3380 assert(device_list !=
nullptr);
3381 assert(num_devices == 1 &&
"CUDA contexts are for a single device");
3383 "Mismatch between devices context and passed context when creating "
3384 "program from binary");
3388 std::unique_ptr<_pi_program> retProgram{
new _pi_program{context}};
3390 retProgram->set_metadata(metadata, num_metadata_entries);
3392 const bool has_length = (lengths !=
nullptr);
3393 size_t length = has_length
3395 : strlen(
reinterpret_cast<const char *
>(binaries[0])) + 1;
3397 assert(length != 0);
3399 retProgram->set_binary(
reinterpret_cast<const char *
>(binaries[0]), length);
3401 *program = retProgram.release();
3407 size_t param_value_size,
void *param_value,
3408 size_t *param_value_size_ret) {
3409 assert(program !=
nullptr);
3411 switch (param_name) {
3413 return getInfo(param_value_size, param_value, param_value_size_ret,
3416 return getInfo(param_value_size, param_value, param_value_size_ret,
3419 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3421 return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3424 return getInfo(param_value_size, param_value, param_value_size_ret,
3427 return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3430 return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3433 return getInfo(param_value_size, param_value, param_value_size_ret,
3448 const pi_device *device_list,
const char *options,
3455 assert(ret_program !=
nullptr);
3456 assert(num_devices == 1 || num_devices == 0);
3457 assert(device_list !=
nullptr || num_devices == 0);
3459 assert(user_data ==
nullptr);
3463 ScopedContext active(context);
3466 std::unique_ptr<_pi_program> retProgram{
new _pi_program{context}};
3468 retError = PI_CHECK_ERROR(cuLinkCreate(0,
nullptr,
nullptr, &state));
3470 for (
size_t i = 0; i < num_input_programs; ++i) {
3472 retError = PI_CHECK_ERROR(cuLinkAddData(
3473 state, CU_JIT_INPUT_PTX,
const_cast<char *
>(program->
binary_),
3476 void *cubin =
nullptr;
3477 size_t cubinSize = 0;
3478 retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));
3481 retProgram->set_binary(
static_cast<const char *
>(cubin), cubinSize);
3483 if (retError != PI_SUCCESS) {
3487 retError = retProgram->build_program(options);
3489 if (retError != PI_SUCCESS) {
3494 PI_CHECK_ERROR(cuLinkDestroy(state));
3498 retError = PI_CHECK_ERROR(cuLinkDestroy(state));
3499 *ret_program = retProgram.release();
3513 const char *options,
pi_uint32 num_input_headers,
3514 const pi_program *input_headers,
const char **header_include_names,
3517 (void)header_include_names;
3518 (void)input_headers;
3520 assert(program !=
nullptr);
3521 assert(num_devices == 1 || num_devices == 0);
3522 assert(device_list !=
nullptr || num_devices == 0);
3524 assert(user_data ==
nullptr);
3525 assert(num_input_headers == 0);
3541 size_t param_value_size,
void *param_value,
3542 size_t *param_value_size_ret) {
3546 assert(program !=
nullptr);
3548 switch (param_name) {
3550 return getInfo(param_value_size, param_value, param_value_size_ret,
3554 return getInfo(param_value_size, param_value, param_value_size_ret,
3558 param_value_size_ret, program->
infoLog_);
3567 assert(program !=
nullptr);
3577 assert(program !=
nullptr);
3582 "Reference count overflow detected in cuda_piProgramRelease.");
3587 std::unique_ptr<_pi_program> program_ptr{program};
3589 pi_result result = PI_ERROR_INVALID_PROGRAM;
3593 auto cuModule = program->
get();
3594 result = PI_CHECK_ERROR(cuModuleUnload(cuModule));
3596 result = PI_ERROR_OUT_OF_RESOURCES;
3629 "Creation of PI program from native handle not implemented");
3634 size_t param_value_size,
void *param_value,
3635 size_t *param_value_size_ret) {
3637 if (kernel !=
nullptr) {
3639 switch (param_name) {
3641 return getInfo(param_value_size, param_value, param_value_size_ret,
3642 kernel->get_name());
3644 return getInfo(param_value_size, param_value, param_value_size_ret,
3645 kernel->get_num_args());
3647 return getInfo(param_value_size, param_value, param_value_size_ret,
3648 kernel->get_reference_count());
3650 return getInfo(param_value_size, param_value, param_value_size_ret,
3651 kernel->get_context());
3654 return getInfo(param_value_size, param_value, param_value_size_ret,
3655 kernel->get_program());
3658 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
3666 return PI_ERROR_INVALID_KERNEL;
3671 size_t input_value_size,
const void *input_value,
size_t param_value_size,
3672 void *param_value,
size_t *param_value_size_ret) {
3674 (void)input_value_size;
3677 if (kernel !=
nullptr) {
3678 switch (param_name) {
3683 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
3684 device->get()) == CUDA_SUCCESS);
3685 return getInfo(param_value_size, param_value, param_value_size_ret,
3686 static_cast<uint32_t
>(warpSize));
3690 int max_threads = 0;
3692 cuFuncGetAttribute(&max_threads,
3693 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3694 kernel->get()) == CUDA_SUCCESS);
3697 0,
nullptr,
sizeof(uint32_t), &warpSize,
3699 int maxWarps = (max_threads + warpSize - 1) / warpSize;
3700 return getInfo(param_value_size, param_value, param_value_size_ret,
3701 static_cast<uint32_t
>(maxWarps));
3706 return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3713 return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3719 return PI_ERROR_INVALID_KERNEL;
3723 assert(kernel !=
nullptr);
3724 assert(kernel->get_reference_count() > 0u);
3726 kernel->increment_reference_count();
3731 assert(kernel !=
nullptr);
3735 assert(kernel->get_reference_count() != 0 &&
3736 "Reference count overflow detected in cuda_piKernelRelease.");
3739 if (kernel->decrement_reference_count() == 0) {
3755 size_t,
const void *) {
3764 const void *arg_value) {
3765 kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3777 size_t param_value_size,
void *param_value,
3778 size_t *param_value_size_ret) {
3779 assert(event !=
nullptr);
3781 switch (param_name) {
3783 return getInfo(param_value_size, param_value, param_value_size_ret,
3784 event->get_queue());
3786 return getInfo(param_value_size, param_value, param_value_size_ret,
3787 event->get_command_type());
3789 return getInfo(param_value_size, param_value, param_value_size_ret,
3790 event->get_reference_count());
3792 return getInfo(param_value_size, param_value, param_value_size_ret,
3796 return getInfo(param_value_size, param_value, param_value_size_ret,
3797 event->get_context());
3802 return PI_ERROR_INVALID_EVENT;
3809 size_t param_value_size,
3811 size_t *param_value_size_ret) {
3813 assert(event !=
nullptr);
3815 pi_queue queue =
event->get_queue();
3816 if (queue ==
nullptr ||
3818 return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3821 switch (param_name) {
3825 return getInfo<pi_uint64>(param_value_size, param_value,
3826 param_value_size_ret, event->get_queued_time());
3828 return getInfo<pi_uint64>(param_value_size, param_value,
3829 param_value_size_ret, event->get_start_time());
3831 return getInfo<pi_uint64>(param_value_size, param_value,
3832 param_value_size_ret, event->get_end_time());
3847 return PI_ERROR_INVALID_VALUE;
3851 assert(event !=
nullptr);
3853 const auto refCount =
event->increment_reference_count();
3857 "Reference count overflow detected in cuda_piEventRetain.");
3863 assert(event !=
nullptr);
3868 event->get_reference_count() != 0,
3869 "Reference count overflow detected in cuda_piEventRelease.");
3872 if (event->decrement_reference_count() == 0) {
3873 std::unique_ptr<_pi_event> event_ptr{
event};
3874 pi_result result = PI_ERROR_INVALID_EVENT;
3876 ScopedContext active(event->get_context());
3877 result =
event->release();
3879 result = PI_ERROR_OUT_OF_RESOURCES;
3897 command_queue, num_events_in_wait_list, event_wait_list, event);
3918 if (!command_queue) {
3919 return PI_ERROR_INVALID_QUEUE;
3925 ScopedContext active(command_queue->
get_context());
3929 num_events_in_wait_list, event_wait_list, guard, &stream_token);
3931 std::lock_guard<std::mutex> guard(command_queue->
barrier_mutex_);
3934 CU_EVENT_DISABLE_TIMING));
3936 if (num_events_in_wait_list == 0) {
3939 CU_EVENT_DISABLE_TIMING));
3944 if (cuStream != s) {
3947 PI_CHECK_ERROR(cuEventRecord(tmp_event, s));
3948 PI_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0));
3952 forLatestEvents(event_wait_list, num_events_in_wait_list,
3954 if (event->get_queue()->has_been_synchronized(
3955 event->get_compute_stream_token())) {
3958 return PI_CHECK_ERROR(
3959 cuStreamWaitEvent(cuStream, event->get(), 0));
3964 result = PI_CHECK_ERROR(
3966 for (
unsigned int i = 0;
3970 for (
unsigned int i = 0;
3975 if (result != PI_SUCCESS) {
3981 cuStream, stream_token);
3990 return PI_ERROR_UNKNOWN;
4016 bool ownNativeHandle,
4018 (void)ownNativeHandle;
4019 assert(!ownNativeHandle);
4021 std::unique_ptr<_pi_event> event_ptr{
nullptr};
4024 reinterpret_cast<CUevent>(nativeHandle));
4041 std::unique_ptr<_pi_sampler> retImplSampl{
new _pi_sampler(context)};
4043 bool propSeen[3] = {
false,
false,
false};
4044 for (
size_t i = 0; sampler_properties[i] != 0; i += 2) {
4045 switch (sampler_properties[i]) {
4048 return PI_ERROR_INVALID_VALUE;
4051 retImplSampl->props_ |= sampler_properties[i + 1];
4055 return PI_ERROR_INVALID_VALUE;
4058 retImplSampl->props_ |=
4063 return PI_ERROR_INVALID_VALUE;
4066 retImplSampl->props_ |=
4070 return PI_ERROR_INVALID_VALUE;
4075 retImplSampl->props_ |=
PI_TRUE;
4079 retImplSampl->props_ |=
4084 *result_sampler = retImplSampl.release();
4098 size_t param_value_size,
void *param_value,
4099 size_t *param_value_size_ret) {
4100 assert(sampler !=
nullptr);
4102 switch (param_name) {
4104 return getInfo(param_value_size, param_value, param_value_size_ret,
4107 return getInfo(param_value_size, param_value, param_value_size_ret,
4111 return getInfo(param_value_size, param_value, param_value_size_ret,
4117 return getInfo(param_value_size, param_value, param_value_size_ret,
4124 return getInfo(param_value_size, param_value, param_value_size_ret,
4139 assert(sampler !=
nullptr);
4151 assert(sampler !=
nullptr);
4157 "Reference count overflow detected in cuda_piSamplerRelease.");
4175 size_t src_row_pitch,
size_t src_slice_pitch,
void *dst_ptr,
4177 size_t dst_row_pitch,
size_t dst_slice_pitch) {
4179 assert(region !=
nullptr);
4180 assert(src_offset !=
nullptr);
4181 assert(dst_offset !=
nullptr);
4183 assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
4184 assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
4199 CUDA_MEMCPY3D params = {};
4205 params.srcMemoryType = src_type;
4206 params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
4209 params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr :
nullptr;
4210 params.srcXInBytes = src_offset->
x_bytes;
4211 params.srcY = src_offset->
y_scalar;
4212 params.srcZ = src_offset->
z_scalar;
4213 params.srcPitch = src_row_pitch;
4214 params.srcHeight = src_slice_pitch / src_row_pitch;
4216 params.dstMemoryType = dst_type;
4217 params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
4220 params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr :
nullptr;
4221 params.dstXInBytes = dst_offset->
x_bytes;
4222 params.dstY = dst_offset->
y_scalar;
4223 params.dstZ = dst_offset->
z_scalar;
4224 params.dstPitch = dst_row_pitch;
4225 params.dstHeight = dst_slice_pitch / dst_row_pitch;
4227 return PI_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream));
4234 size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
4235 void *ptr,
pi_uint32 num_events_in_wait_list,
4238 assert(buffer !=
nullptr);
4239 assert(command_queue !=
nullptr);
4243 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4246 ScopedContext active(command_queue->
get_context());
4249 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4259 cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4260 buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST,
4261 host_offset, host_row_pitch, host_slice_pitch);
4264 retErr = retImplEv->record();
4267 if (blocking_read) {
4268 retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4272 *
event = retImplEv.release();
4285 size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
4286 const void *ptr,
pi_uint32 num_events_in_wait_list,
4289 assert(buffer !=
nullptr);
4290 assert(command_queue !=
nullptr);
4294 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4297 ScopedContext active(command_queue->
get_context());
4299 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4309 cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch,
4310 host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4311 buffer_row_pitch, buffer_slice_pitch);
4314 retErr = retImplEv->record();
4317 if (blocking_write) {
4318 retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4322 *
event = retImplEv.release();
4332 pi_mem dst_buffer,
size_t src_offset,
4333 size_t dst_offset,
size_t size,
4337 if (!command_queue) {
4338 return PI_ERROR_INVALID_QUEUE;
4341 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4344 ScopedContext active(command_queue->
get_context());
4348 result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4354 result = retImplEv->start();
4360 result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
4363 result = retImplEv->record();
4364 *
event = retImplEv.release();
4371 return PI_ERROR_UNKNOWN;
4379 size_t dst_row_pitch,
size_t dst_slice_pitch,
4383 assert(src_buffer !=
nullptr);
4384 assert(dst_buffer !=
nullptr);
4385 assert(command_queue !=
nullptr);
4390 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4393 ScopedContext active(command_queue->
get_context());
4395 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4405 cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
4406 src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
4407 dst_origin, dst_row_pitch, dst_slice_pitch);
4410 retImplEv->record();
4411 *
event = retImplEv.release();
4421 const void *pattern,
size_t pattern_size,
4422 size_t offset,
size_t size,
4426 assert(command_queue !=
nullptr);
4428 auto args_are_multiples_of_pattern_size =
4429 (offset % pattern_size == 0) || (size % pattern_size == 0);
4431 auto pattern_is_valid = (pattern !=
nullptr);
4433 auto pattern_size_is_valid =
4434 ((pattern_size & (pattern_size - 1)) == 0) &&
4435 (pattern_size > 0) && (pattern_size <= 128);
4437 assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4438 pattern_size_is_valid);
4439 (void)args_are_multiples_of_pattern_size;
4440 (void)pattern_is_valid;
4441 (void)pattern_size_is_valid;
4443 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4446 ScopedContext active(command_queue->
get_context());
4450 result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4456 result = retImplEv->start();
4460 auto N = size / pattern_size;
4463 switch (pattern_size) {
4465 auto value = *
static_cast<const uint8_t *
>(pattern);
4466 result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream));
4470 auto value = *
static_cast<const uint16_t *
>(pattern);
4471 result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream));
4475 auto value = *
static_cast<const uint32_t *
>(pattern);
4476 result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream));
4487 auto number_of_steps = pattern_size /
sizeof(uint32_t);
4491 for (
auto step = 0u; step < number_of_steps; ++step) {
4493 auto value = *(
static_cast<const uint32_t *
>(pattern) + step);
4496 auto offset_ptr = dstDevice + (step *
sizeof(uint32_t));
4499 result = PI_CHECK_ERROR(
4500 cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream));
4508 result = retImplEv->record();
4509 *
event = retImplEv.release();
4516 return PI_ERROR_UNKNOWN;
4521 switch (array_desc.Format) {
4522 case CU_AD_FORMAT_UNSIGNED_INT8:
4523 case CU_AD_FORMAT_SIGNED_INT8:
4525 case CU_AD_FORMAT_UNSIGNED_INT16:
4526 case CU_AD_FORMAT_SIGNED_INT16:
4527 case CU_AD_FORMAT_HALF:
4529 case CU_AD_FORMAT_UNSIGNED_INT32:
4530 case CU_AD_FORMAT_SIGNED_INT32:
4531 case CU_AD_FORMAT_FLOAT:
4546 const void *src_ptr,
const CUmemorytype_enum src_type,
4547 const size_t *src_offset,
void *dst_ptr,
const CUmemorytype_enum dst_type,
4548 const size_t *dst_offset) {
4549 assert(region !=
nullptr);
4551 assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
4552 assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
4555 CUDA_MEMCPY2D cpyDesc;
4556 memset(&cpyDesc, 0,
sizeof(cpyDesc));
4557 cpyDesc.srcMemoryType = src_type;
4558 if (src_type == CU_MEMORYTYPE_ARRAY) {
4559 cpyDesc.srcArray = *
static_cast<const CUarray *
>(src_ptr);
4560 cpyDesc.srcXInBytes = src_offset[0];
4561 cpyDesc.srcY = src_offset[1];
4563 cpyDesc.srcHost = src_ptr;
4565 cpyDesc.dstMemoryType = dst_type;
4566 if (dst_type == CU_MEMORYTYPE_ARRAY) {
4567 cpyDesc.dstArray = *
static_cast<CUarray *
>(dst_ptr);
4568 cpyDesc.dstXInBytes = dst_offset[0];
4569 cpyDesc.dstY = dst_offset[1];
4571 cpyDesc.dstHost = dst_ptr;
4573 cpyDesc.WidthInBytes = region[0];
4574 cpyDesc.Height = region[1];
4575 return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
4578 CUDA_MEMCPY3D cpyDesc;
4579 memset(&cpyDesc, 0,
sizeof(cpyDesc));
4580 cpyDesc.srcMemoryType = src_type;
4581 if (src_type == CU_MEMORYTYPE_ARRAY) {
4582 cpyDesc.srcArray = *
static_cast<const CUarray *
>(src_ptr);
4583 cpyDesc.srcXInBytes = src_offset[0];
4584 cpyDesc.srcY = src_offset[1];
4585 cpyDesc.srcZ = src_offset[2];
4587 cpyDesc.srcHost = src_ptr;
4589 cpyDesc.dstMemoryType = dst_type;
4590 if (dst_type == CU_MEMORYTYPE_ARRAY) {
4591 cpyDesc.dstArray = *
static_cast<CUarray *
>(dst_ptr);
4592 cpyDesc.dstXInBytes = dst_offset[0];
4593 cpyDesc.dstY = dst_offset[1];
4594 cpyDesc.dstZ = dst_offset[2];
4596 cpyDesc.dstHost = dst_ptr;
4598 cpyDesc.WidthInBytes = region[0];
4599 cpyDesc.Height = region[1];
4600 cpyDesc.Depth = region[2];
4601 return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
4603 return PI_ERROR_INVALID_VALUE;
4608 const size_t *origin,
const size_t *region,
size_t row_pitch,
4609 size_t slice_pitch,
void *ptr,
pi_uint32 num_events_in_wait_list,
4615 assert(command_queue !=
nullptr);
4616 assert(image !=
nullptr);
4622 ScopedContext active(command_queue->
get_context());
4624 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4627 CUarray array = image->mem_.surface_mem_.get_array();
4629 CUDA_ARRAY_DESCRIPTOR arrayDesc;
4630 retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4634 size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4635 size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4637 pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4639 retErr = PI_CHECK_ERROR(
4640 cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream));
4642 size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4643 size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4646 cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY,
4647 srcOffset, ptr, CU_MEMORYTYPE_HOST,
nullptr);
4649 if (retErr != PI_SUCCESS) {
4656 command_queue, cuStream);
4657 new_event->record();
4661 if (blocking_read) {
4662 retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4667 return PI_ERROR_UNKNOWN;
4675 pi_bool blocking_write,
const size_t *origin,
4676 const size_t *region,
size_t input_row_pitch,
4677 size_t input_slice_pitch,
const void *ptr,
4681 (void)blocking_write;
4682 (void)input_row_pitch;
4683 (void)input_slice_pitch;
4685 assert(command_queue !=
nullptr);
4686 assert(image !=
nullptr);
4692 ScopedContext active(command_queue->
get_context());
4694 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4697 CUarray array = image->mem_.surface_mem_.get_array();
4699 CUDA_ARRAY_DESCRIPTOR arrayDesc;
4700 retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4704 size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4705 size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4707 pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4709 retErr = PI_CHECK_ERROR(
4710 cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream));
4712 size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4713 size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4716 cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST,
nullptr,
4717 &array, CU_MEMORYTYPE_ARRAY, dstOffset);
4719 if (retErr != PI_SUCCESS) {
4726 command_queue, cuStream);
4727 new_event->record();
4733 return PI_ERROR_UNKNOWN;
4740 pi_mem dst_image,
const size_t *src_origin,
4741 const size_t *dst_origin,
4742 const size_t *region,
4754 ScopedContext active(command_queue->
get_context());
4756 retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4762 CUDA_ARRAY_DESCRIPTOR srcArrayDesc;
4763 retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray));
4764 CUDA_ARRAY_DESCRIPTOR dstArrayDesc;
4765 retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray));
4767 assert(srcArrayDesc.Format == dstArrayDesc.Format);
4768 assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels);
4772 size_t dstByteOffsetX =
4773 dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels;
4774 size_t srcByteOffsetX =
4775 src_origin[0] * elementByteSize * dstArrayDesc.NumChannels;
4776 size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
4780 retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
4781 srcByteOffsetX, bytesToCopy));
4783 size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4784 size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4785 size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4788 cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
4789 srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
4791 if (retErr != PI_SUCCESS) {
4798 command_queue, cuStream);
4799 new_event->record();
4805 return PI_ERROR_UNKNOWN;
4813 const size_t *,
const size_t *,
pi_uint32,
4831 assert(ret_map !=
nullptr);
4832 assert(command_queue !=
nullptr);
4833 assert(buffer !=
nullptr);
4836 pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4849 ret_err = PI_SUCCESS;
4855 command_queue, buffer, blocking_map, offset, size, hostPtr,
4856 num_events_in_wait_list, event_wait_list, event);
4858 ScopedContext active(command_queue->
get_context());
4862 event_wait_list,
nullptr);
4892 assert(command_queue !=
nullptr);
4893 assert(mapped_ptr !=
nullptr);
4894 assert(memobj !=
nullptr);
4908 command_queue, memobj,
true,
4911 num_events_in_wait_list, event_wait_list, event);
4913 ScopedContext active(command_queue->
get_context());
4917 event_wait_list,
nullptr);
4942 assert(result_ptr !=
nullptr);
4943 assert(context !=
nullptr);
4944 assert(properties ==
nullptr || *properties == 0);
4947 ScopedContext active(context);
4948 result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size));
4954 (result == PI_SUCCESS &&
4955 reinterpret_cast<std::uintptr_t
>(*result_ptr) %
alignment == 0));
4965 assert(result_ptr !=
nullptr);
4966 assert(context !=
nullptr);
4967 assert(device !=
nullptr);
4968 assert(properties ==
nullptr || *properties == 0);
4971 ScopedContext active(context);
4972 result = PI_CHECK_ERROR(cuMemAlloc((
CUdeviceptr *)result_ptr, size));
4978 (result == PI_SUCCESS &&
4979 reinterpret_cast<std::uintptr_t
>(*result_ptr) %
alignment == 0));
4989 assert(result_ptr !=
nullptr);
4990 assert(context !=
nullptr);
4991 assert(device !=
nullptr);
4992 assert(properties ==
nullptr || *properties == 0);
4995 ScopedContext active(context);
4996 result = PI_CHECK_ERROR(cuMemAllocManaged((
CUdeviceptr *)result_ptr, size,
4997 CU_MEM_ATTACH_GLOBAL));
5003 (result == PI_SUCCESS &&
5004 reinterpret_cast<std::uintptr_t
>(*result_ptr) %
alignment == 0));
5011 assert(context !=
nullptr);
5014 ScopedContext active(context);
5017 void *attribute_values[2] = {&is_managed, &type};
5018 CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
5019 CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
5020 result = PI_CHECK_ERROR(cuPointerGetAttributes(
5021 2, attributes, attribute_values, (
CUdeviceptr)ptr));
5022 assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST);
5023 if (is_managed || type == CU_MEMORYTYPE_DEVICE) {
5026 result = PI_CHECK_ERROR(cuMemFree((
CUdeviceptr)ptr));
5029 result = PI_CHECK_ERROR(cuMemFreeHost(ptr));
5042 assert(queue !=
nullptr);
5043 assert(ptr !=
nullptr);
5045 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5052 num_events_in_waitlist, events_waitlist, guard, &stream_token);
5053 result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5060 result = PI_CHECK_ERROR(cuMemsetD8Async(
5061 (
CUdeviceptr)ptr, (
unsigned char)value & 0xFF, count, cuStream));
5063 result = event_ptr->record();
5064 *
event = event_ptr.release();
5073 void *dst_ptr,
const void *src_ptr,
5078 assert(queue !=
nullptr);
5079 assert(dst_ptr !=
nullptr);
5080 assert(src_ptr !=
nullptr);
5083 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5088 result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5095 result = PI_CHECK_ERROR(cuMemcpyAsync(
5098 result = event_ptr->record();
5101 result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
5104 *
event = event_ptr.release();
5124 if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5126 "concurrent managed access",
5128 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5131 unsigned int is_managed;
5132 PI_CHECK_ERROR(cuPointerGetAttribute(
5133 &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (
CUdeviceptr)ptr));
5135 setErrorMessage(
"Prefetch hint ignored as prefetch only works with USM",
5137 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5142 return PI_ERROR_INVALID_VALUE;
5143 assert(queue !=
nullptr);
5144 assert(ptr !=
nullptr);
5146 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5151 result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5158 result = PI_CHECK_ERROR(
5159 cuMemPrefetchAsync((
CUdeviceptr)ptr, size, device->get(), cuStream));
5161 result = event_ptr->record();
5162 *
event = event_ptr.release();
5174 assert(queue !=
nullptr);
5175 assert(ptr !=
nullptr);
5189 if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5191 "concurrent managed access",
5193 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5201 unsigned int is_managed;
5202 PI_CHECK_ERROR(cuPointerGetAttribute(
5203 &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (
CUdeviceptr)ptr));
5206 "Memory advice ignored as memory advices only works with USM",
5208 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5212 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5230 result = PI_CHECK_ERROR(cuMemAdvise(
5239 result = PI_CHECK_ERROR(cuMemAdvise(
5247 PI_CHECK_ERROR(cuMemAdvise((
CUdeviceptr)ptr, length,
5248 CU_MEM_ADVISE_UNSET_READ_MOSTLY,
5250 PI_CHECK_ERROR(cuMemAdvise((
CUdeviceptr)ptr, length,
5251 CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
5253 PI_CHECK_ERROR(cuMemAdvise((
CUdeviceptr)ptr, length,
5254 CU_MEM_ADVISE_UNSET_ACCESSED_BY,
5261 result = event_ptr->record();
5262 *
event = event_ptr.release();
5267 result = PI_ERROR_UNKNOWN;
5275 const void *,
size_t,
size_t,
pi_uint32,
5305 void *dst_ptr,
size_t dst_pitch,
5306 const void *src_ptr,
size_t src_pitch,
5307 size_t width,
size_t height,
5312 assert(queue !=
nullptr);
5319 result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list,
5329 CUDA_MEMCPY2D cpyDesc = {0};
5331 getUSMHostOrDevicePtr(src_ptr, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice,
5333 getUSMHostOrDevicePtr(dst_ptr, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice,
5336 cpyDesc.dstPitch = dst_pitch;
5337 cpyDesc.srcPitch = src_pitch;
5338 cpyDesc.WidthInBytes = width;
5339 cpyDesc.Height = height;
5341 result = PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream));
5347 result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
5373 size_t param_value_size,
5375 size_t *param_value_size_ret) {
5376 assert(context !=
nullptr);
5377 assert(ptr !=
nullptr);
5381 ScopedContext active(context);
5382 switch (param_name) {
5386 CUresult ret = cuPointerGetAttribute(
5387 &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (
CUdeviceptr)ptr);
5388 if (ret == CUDA_ERROR_INVALID_VALUE) {
5390 return getInfo(param_value_size, param_value, param_value_size_ret,
5393 result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5396 return getInfo(param_value_size, param_value, param_value_size_ret,
5399 result = PI_CHECK_ERROR(cuPointerGetAttribute(
5400 &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (
CUdeviceptr)ptr));
5401 assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST);
5402 if (value == CU_MEMORYTYPE_DEVICE) {
5404 return getInfo(param_value_size, param_value, param_value_size_ret,
5407 if (value == CU_MEMORYTYPE_HOST) {
5409 return getInfo(param_value_size, param_value, param_value_size_ret,
5416 __builtin_unreachable();
5418 return getInfo(param_value_size, param_value, param_value_size_ret,
5422 #if CUDA_VERSION >= 10020
5425 result = PI_CHECK_ERROR(cuPointerGetAttribute(
5426 &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (
CUdeviceptr)ptr));
5427 return getInfo(param_value_size, param_value, param_value_size_ret,
5430 return PI_ERROR_INVALID_VALUE;
5434 #if CUDA_VERSION >= 10020
5437 result = PI_CHECK_ERROR(cuPointerGetAttribute(
5438 &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (
CUdeviceptr)ptr));
5439 return getInfo(param_value_size, param_value, param_value_size_ret,
5442 return PI_ERROR_INVALID_VALUE;
5447 unsigned int device_idx;
5448 result = PI_CHECK_ERROR(cuPointerGetAttribute(
5449 &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (
CUdeviceptr)ptr));
5453 std::vector<pi_platform> platforms;
5454 platforms.resize(device_idx + 1);
5458 pi_device device = platforms[device_idx]->devices_[0].get();
5459 return getInfo(param_value_size, param_value, param_value_size_ret,
5471 pi_bool blocking_write,
size_t count,
size_t offset,
const void *src,
5474 assert(queue !=
nullptr);
5475 assert(program !=
nullptr);
5477 if (name ==
nullptr || src ==
nullptr)
5478 return PI_ERROR_INVALID_VALUE;
5482 auto device_global_name_it = program->
globalIDMD_.find(name);
5483 if (device_global_name_it == program->
globalIDMD_.end())
5484 return PI_ERROR_INVALID_VALUE;
5485 std::string device_global_name = device_global_name_it->second;
5490 size_t device_global_size = 0;
5491 result = PI_CHECK_ERROR(
5492 cuModuleGetGlobal(&device_global, &device_global_size, program->
get(),
5493 device_global_name.c_str()));
5495 if (offset + count > device_global_size)
5496 return PI_ERROR_INVALID_VALUE;
5499 queue, blocking_write,
reinterpret_cast<void *
>(device_global + offset),
5500 src, count, num_events_in_wait_list, event_wait_list, event);
5509 size_t count,
size_t offset,
void *dst,
pi_uint32 num_events_in_wait_list,
5511 assert(queue !=
nullptr);
5512 assert(program !=
nullptr);
5514 if (name ==
nullptr || dst ==
nullptr)
5515 return PI_ERROR_INVALID_VALUE;
5519 auto device_global_name_it = program->
globalIDMD_.find(name);
5520 if (device_global_name_it == program->
globalIDMD_.end())
5521 return PI_ERROR_INVALID_VALUE;
5522 std::string device_global_name = device_global_name_it->second;
5527 size_t device_global_size = 0;
5528 result = PI_CHECK_ERROR(
5529 cuModuleGetGlobal(&device_global, &device_global_size, program->
get(),
5530 device_global_name.c_str()));
5532 if (offset + count > device_global_size)
5533 return PI_ERROR_INVALID_VALUE;
5536 queue, blocking_read, dst,
5537 reinterpret_cast<const void *
>(device_global + offset), count,
5538 num_events_in_wait_list, event_wait_list, event);
5558 uint64_t *HostTime) {
5560 ScopedContext active(Device->get_context());
5563 PI_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT));
5564 PI_CHECK_ERROR(cuEventRecord(event, 0));
5568 using namespace std::chrono;
5570 duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
5575 PI_CHECK_ERROR(cuEventSynchronize(event));
5576 *DeviceTime = Device->get_elapsed_time(event);
5589 size_t PluginVersionSize =
sizeof(PluginInit->
PluginVersion);
5591 return PI_ERROR_INVALID_VALUE;
5602 #define _PI_CL(pi_api, cuda_api) \
5603 (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api);
5743 #define __SYCL_PLUGIN_DLL_NAME "pi_cuda.dll"
5744 #include "../common_win_pi_trace/common_win_pi_trace.hpp"
5745 #undef __SYCL_PLUGIN_DLL_NAME