22 #include <hip/hip_runtime.h>
34 inline void getArrayDesc(hipArray *array, hipArray_Format &format,
36 #if defined(__HIP_PLATFORM_AMD__)
37 format = array->Format;
38 channels = array->NumChannels;
39 #elif defined(__HIP_PLATFORM_NVIDIA__)
40 CUDA_ARRAY_DESCRIPTOR arrayDesc;
41 cuArrayGetDescriptor(&arrayDesc, (CUarray)array);
43 format = arrayDesc.Format;
44 channels = arrayDesc.NumChannels;
46 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
53 #if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__)
54 inline static hipError_t
55 hipArray3DCreate(hiparray *pHandle,
56 const HIP_ARRAY3D_DESCRIPTOR *pAllocateArray) {
57 return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
68 #if defined(__HIP_PLATFORM_NVIDIA__)
69 typedef CUarray hipCUarray;
70 #elif defined(__HIP_PLATFORM_AMD__)
71 typedef hipArray *hipCUarray;
73 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
77 #if defined(__HIP_PLATFORM_NVIDIA__)
78 #define hipMemoryType CUmemorytype
79 #define hipMemoryTypeHost CU_MEMORYTYPE_HOST
80 #define hipMemoryTypeDevice CU_MEMORYTYPE_DEVICE
81 #define hipMemoryTypeArray CU_MEMORYTYPE_ARRAY
82 #define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED
85 std::string getHipVersionString() {
86 int driver_version = 0;
87 if (hipDriverGetVersion(&driver_version) != hipSuccess) {
91 std::stringstream stream;
92 stream <<
"HIP " << driver_version / 1000 <<
"."
93 << driver_version % 1000 / 10;
101 case hipErrorInvalidContext:
102 return PI_ERROR_INVALID_CONTEXT;
103 case hipErrorInvalidDevice:
104 return PI_ERROR_INVALID_DEVICE;
105 case hipErrorInvalidValue:
106 return PI_ERROR_INVALID_VALUE;
107 case hipErrorOutOfMemory:
108 return PI_ERROR_OUT_OF_HOST_MEMORY;
109 case hipErrorLaunchOutOfResources:
110 return PI_ERROR_OUT_OF_RESOURCES;
112 return PI_ERROR_UNKNOWN;
130 pi_result hip_piPluginGetLastError(
char **message) {
138 template <
typename Func>
140 std::size_t num_events_in_wait_list, Func &&f) {
142 if (event_wait_list ==
nullptr || num_events_in_wait_list == 0) {
143 return PI_ERROR_INVALID_EVENT_WAIT_LIST;
147 if (num_events_in_wait_list == 1) {
148 return f(event_wait_list[0]);
151 std::vector<pi_event> events{event_wait_list,
152 event_wait_list + num_events_in_wait_list};
156 return e0->get_stream() < e1->get_stream() ||
157 (e0->get_stream() == e1->get_stream() &&
158 e0->get_event_id() > e1->get_event_id());
162 hipStream_t lastSeenStream = 0;
164 if (!event || (!first &&
event->get_stream() == lastSeenStream)) {
169 lastSeenStream =
event->get_stream();
171 auto result = f(event);
172 if (result != PI_SUCCESS) {
187 pi_result check_error(hipError_t result,
const char *
function,
int line,
189 if (result == hipSuccess) {
193 if (std::getenv(
"SYCL_PI_SUPPRESS_ERROR_MESSAGE") ==
nullptr) {
194 const char *errorString =
nullptr;
195 const char *errorName =
nullptr;
196 errorName = hipGetErrorName(result);
197 errorString = hipGetErrorString(result);
198 std::stringstream ss;
199 ss <<
"\nPI HIP ERROR:"
200 <<
"\n\tValue: " << result
201 <<
"\n\tName: " << errorName
202 <<
"\n\tDescription: " << errorString
203 <<
"\n\tFunction: " <<
function <<
"\n\tSource Location: " << file
204 <<
":" <<
line <<
"\n"
209 if (std::getenv(
"PI_HIP_ABORT") !=
nullptr) {
213 throw map_error(result);
217 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
225 class ScopedContext {
231 ScopedContext(
pi_context ctxt) : placedContext_{ctxt}, needToRecover_{
false} {
233 if (!placedContext_) {
234 throw PI_ERROR_INVALID_CONTEXT;
237 hipCtx_t desired = placedContext_->
get();
238 PI_CHECK_ERROR(hipCtxGetCurrent(&original_));
239 if (original_ != desired) {
241 PI_CHECK_ERROR(hipCtxSetCurrent(desired));
242 if (original_ ==
nullptr) {
250 needToRecover_ =
true;
256 if (needToRecover_) {
257 PI_CHECK_ERROR(hipCtxSetCurrent(original_));
263 template <
typename T,
typename Assign>
265 size_t *param_value_size_ret, T value,
size_t value_size,
266 Assign &&assign_func) {
268 if (param_value !=
nullptr) {
270 if (param_value_size < value_size) {
271 return PI_ERROR_INVALID_VALUE;
274 assign_func(param_value, value, value_size);
277 if (param_value_size_ret !=
nullptr) {
278 *param_value_size_ret = value_size;
284 template <
typename T>
286 size_t *param_value_size_ret, T value) {
288 auto assignment = [](
void *param_value, T value,
size_t value_size) {
290 *
static_cast<T *
>(param_value) = value;
293 return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
294 sizeof(T), std::move(assignment));
297 template <
typename T>
299 void *param_value,
size_t *param_value_size_ret,
302 auto assignment = [](
void *param_value, T *value,
size_t value_size) {
303 memcpy(param_value,
static_cast<const void *
>(value), value_size);
306 return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
307 array_length *
sizeof(T), std::move(assignment));
312 size_t *param_value_size_ret,
314 return getInfoArray(strlen(value) + 1, param_value_size, param_value,
315 param_value_size_ret, value);
318 int getAttribute(
pi_device device, hipDeviceAttribute_t attribute) {
321 hipDeviceGetAttribute(&value, attribute,
device->get()) == hipSuccess);
326 void simpleGuessLocalWorkSize(
size_t *threadsPerBlock,
327 const size_t *global_work_size,
328 const size_t maxThreadsPerBlock[3],
330 assert(threadsPerBlock !=
nullptr);
331 assert(global_work_size !=
nullptr);
332 assert(kernel !=
nullptr);
341 threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]);
345 while (0u != (global_work_size[0] % threadsPerBlock[0])) {
346 --threadsPerBlock[0];
353 if (!event_wait_list) {
357 ScopedContext active(command_queue->
get_context());
359 auto result = forLatestEvents(
360 event_wait_list, num_events_in_wait_list,
362 if (
event->get_stream() == stream) {
365 return PI_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0));
369 if (result != PI_SUCCESS) {
376 return PI_ERROR_UNKNOWN;
392 [[noreturn]]
void die(
const char *Message) {
393 std::cerr <<
"pi_die: " << Message << std::endl;
399 std::cerr <<
"pi_print: " << Message << std::endl;
402 void assertion(
bool Condition,
const char *Message) {
459 PI_CHECK_ERROR(hipStreamCreateWithFlags(
475 *stream_token = token;
485 for (
pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
488 std::unique_lock<std::mutex> compute_sync_guard(
496 *stream_token = token;
499 hipStream_t res = event_wait_list[i]->
get_stream();
519 PI_CHECK_ERROR(hipStreamCreateWithFlags(
530 hipStream_t stream,
pi_uint32 stream_token)
531 : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{
false},
532 isRecorded_{
false}, isStarted_{
false},
533 streamToken_{stream_token}, evEnd_{
nullptr}, evStart_{
nullptr},
534 evQueued_{
nullptr}, queue_{queue}, stream_{stream}, context_{context} {
540 PI_CHECK_ERROR(hipEventCreateWithFlags(
541 &evEnd_, profilingEnabled ? hipEventDefault : hipEventDisableTiming));
543 if (profilingEnabled) {
544 PI_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault));
545 PI_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault));
548 if (queue_ !=
nullptr) {
555 if (queue_ !=
nullptr) {
568 PI_CHECK_ERROR(hipEventRecord(evQueued_, 0));
569 PI_CHECK_ERROR(hipEventRecord(evStart_, queue_->
get()));
583 if (!hasBeenWaitedOn_) {
584 const hipError_t ret = hipEventQuery(evEnd_);
585 if (ret != hipSuccess && ret != hipErrorNotReady) {
589 if (ret == hipErrorNotReady) {
597 float miliSeconds = 0.0f;
600 PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_));
601 return static_cast<pi_uint64>(miliSeconds * 1.0e6);
605 float miliSeconds = 0.0f;
610 return static_cast<pi_uint64>(miliSeconds * 1.0e6);
614 float miliSeconds = 0.0f;
619 return static_cast<pi_uint64>(miliSeconds * 1.0e6);
625 return PI_ERROR_INVALID_EVENT;
628 pi_result result = PI_ERROR_INVALID_OPERATION;
631 return PI_ERROR_INVALID_QUEUE;
638 "Unrecoverable program state reached in event identifier overflow");
640 result = PI_CHECK_ERROR(hipEventRecord(evEnd_, stream_));
645 if (result == PI_SUCCESS) {
655 retErr = PI_CHECK_ERROR(hipEventSynchronize(evEnd_));
656 hasBeenWaitedOn_ =
true;
665 assert(queue_ !=
nullptr);
666 PI_CHECK_ERROR(hipEventDestroy(evEnd_));
669 PI_CHECK_ERROR(hipEventDestroy(evQueued_));
670 PI_CHECK_ERROR(hipEventDestroy(evStart_));
682 PI_CHECK_ERROR(hipStreamWaitEvent(s, e, 0));
688 : module_{
nullptr}, binary_{},
689 binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} {
697 "Re-setting program binary data which has already been set");
707 constexpr
const unsigned int numberOfOptions = 4u;
709 hipJitOption options[numberOfOptions];
710 void *optionVals[numberOfOptions];
713 options[0] = hipJitOptionInfoLogBuffer;
716 options[1] = hipJitOptionInfoLogBufferSizeBytes;
719 options[2] = hipJitOptionErrorLogBuffer;
722 options[3] = hipJitOptionErrorLogBufferSizeBytes;
725 auto result = PI_CHECK_ERROR(
727 numberOfOptions, options, optionVals));
729 const auto success = (result == PI_SUCCESS);
735 return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
790 Other.Captive =
nullptr;
798 if (Captive !=
nullptr) {
800 if (ret != PI_SUCCESS) {
806 "Unrecoverable program state reached in hip_piMemRelease");
814 Captive = Other.Captive;
815 Other.Captive =
nullptr;
839 static std::once_flag initFlag;
841 static std::vector<_pi_platform> platformIds;
843 if (num_entries == 0 and platforms !=
nullptr) {
844 return PI_ERROR_INVALID_VALUE;
846 if (platforms ==
nullptr and num_platforms ==
nullptr) {
847 return PI_ERROR_INVALID_VALUE;
855 if (hipInit(0) != hipSuccess) {
860 hipError_t hipErrorCode = hipGetDeviceCount(&numDevices);
861 if (hipErrorCode == hipErrorNoDevice) {
865 err = PI_CHECK_ERROR(hipErrorCode);
866 if (numDevices == 0) {
871 numPlatforms = numDevices;
872 platformIds.resize(numDevices);
874 for (
int i = 0; i < numDevices; ++i) {
876 err = PI_CHECK_ERROR(hipDeviceGet(&device, i));
877 platformIds[i].devices_.emplace_back(
880 }
catch (
const std::bad_alloc &) {
882 for (
int i = 0; i < numDevices; ++i) {
883 platformIds[i].devices_.clear();
886 err = PI_ERROR_OUT_OF_HOST_MEMORY;
889 for (
int i = 0; i < numDevices; ++i) {
890 platformIds[i].devices_.clear();
898 if (num_platforms !=
nullptr) {
899 *num_platforms = numPlatforms;
902 if (platforms !=
nullptr) {
903 for (
unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
904 platforms[i] = &platformIds[i];
912 return PI_ERROR_OUT_OF_RESOURCES;
918 size_t param_value_size,
void *param_value,
919 size_t *param_value_size_ret) {
920 assert(platform !=
nullptr);
922 switch (param_name) {
924 return getInfo(param_value_size, param_value, param_value_size_ret,
927 return getInfo(param_value_size, param_value, param_value_size_ret,
930 return getInfo(param_value_size, param_value, param_value_size_ret,
933 auto version = getHipVersionString();
934 return getInfo(param_value_size, param_value, param_value_size_ret,
938 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
959 const bool returnDevices = askingForDefault || askingForGPU;
961 size_t numDevices = returnDevices ? platform->devices_.size() : 0;
965 *num_devices = numDevices;
968 if (returnDevices && devices) {
969 for (
size_t i = 0; i < std::min(
size_t(num_entries), numDevices); ++i) {
970 devices[i] = platform->devices_[i].get();
978 return PI_ERROR_OUT_OF_RESOURCES;
990 size_t param_value_size,
void *param_value,
991 size_t *param_value_size_ret) {
993 switch (param_name) {
995 return getInfo(param_value_size, param_value, param_value_size_ret, 1);
997 return getInfo(param_value_size, param_value, param_value_size_ret,
1000 return getInfo(param_value_size, param_value, param_value_size_ret,
1003 return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1008 return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1017 PI_ERROR_INVALID_ARG_VALUE);
1018 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
1024 return PI_ERROR_OUT_OF_RESOURCES;
1028 assert(context !=
nullptr);
1051 (void)out_num_devices;
1053 return PI_ERROR_INVALID_OPERATION;
1066 if (num_binaries < 1) {
1072 #if defined(__HIP_PLATFORM_AMD__)
1074 #elif defined(__HIP_PLATFORM_NVIDIA__)
1077 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
1080 for (
pi_uint32 i = 0; i < num_binaries; i++) {
1081 if (strcmp(binaries[i]->DeviceTargetSpec, binary_type) == 0) {
1082 *selected_binary = i;
1088 return PI_ERROR_INVALID_BINARY;
1093 const char *func_name,
1097 assert(func_pointer_ret !=
nullptr);
1100 hipError_t ret = hipModuleGetFunction(&func, program->
get(), func_name);
1101 *func_pointer_ret =
reinterpret_cast<pi_uint64>(func);
1104 if (ret != hipSuccess && ret != hipErrorNotFound)
1105 retError = PI_CHECK_ERROR(ret);
1106 if (ret == hipErrorNotFound) {
1107 *func_pointer_ret = 0;
1108 retError = PI_ERROR_INVALID_KERNEL_NAME;
1122 size_t param_value_size,
void *param_value,
1123 size_t *param_value_size_ret) {
1125 static constexpr
pi_uint32 max_work_item_dimensions = 3u;
1127 assert(device !=
nullptr);
1129 switch (param_name) {
1131 return getInfo(param_value_size, param_value, param_value_size_ret,
1135 #if defined(__HIP_PLATFORM_AMD__)
1137 #elif defined(__HIP_PLATFORM_NVIDIA__)
1143 return getInfo(param_value_size, param_value, param_value_size_ret,
1147 int compute_units = 0;
1149 hipDeviceGetAttribute(&compute_units,
1150 hipDeviceAttributeMultiprocessorCount,
1151 device->get()) == hipSuccess);
1153 return getInfo(param_value_size, param_value, param_value_size_ret,
1157 return getInfo(param_value_size, param_value, param_value_size_ret,
1158 max_work_item_dimensions);
1161 size_t return_sizes[max_work_item_dimensions];
1163 int max_x = 0, max_y = 0, max_z = 0;
1165 hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX,
1166 device->get()) == hipSuccess);
1170 hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY,
1171 device->get()) == hipSuccess);
1175 hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ,
1176 device->get()) == hipSuccess);
1179 return_sizes[0] = size_t(max_x);
1180 return_sizes[1] = size_t(max_y);
1181 return_sizes[2] = size_t(max_z);
1182 return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1183 param_value_size_ret, return_sizes);
1187 size_t return_sizes[max_work_item_dimensions];
1188 int max_x = 0, max_y = 0, max_z = 0;
1190 hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxGridDimX,
1191 device->get()) == hipSuccess);
1195 hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxGridDimY,
1196 device->get()) == hipSuccess);
1200 hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxGridDimZ,
1201 device->get()) == hipSuccess);
1204 return_sizes[0] = size_t(max_x);
1205 return_sizes[1] = size_t(max_y);
1206 return_sizes[2] = size_t(max_z);
1207 return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1208 param_value_size_ret, return_sizes);
1212 int max_work_group_size = 0;
1214 hipDeviceGetAttribute(&max_work_group_size,
1215 hipDeviceAttributeMaxThreadsPerBlock,
1216 device->get()) == hipSuccess);
1220 return getInfo(param_value_size, param_value, param_value_size_ret,
1221 size_t(max_work_group_size));
1224 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1227 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1230 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1233 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1236 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1239 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1242 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1245 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1248 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1251 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1254 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1257 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1260 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1263 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1267 int max_threads = 0;
1269 hipDeviceGetAttribute(&max_threads,
1270 hipDeviceAttributeMaxThreadsPerBlock,
1271 device->get()) == hipSuccess);
1274 hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
1275 device->get()) == hipSuccess);
1276 int maxWarps = (max_threads + warpSize - 1) / warpSize;
1277 return getInfo(param_value_size, param_value, param_value_size_ret,
1278 static_cast<uint32_t
>(maxWarps));
1285 hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor,
1286 device->get()) == hipSuccess);
1287 bool ifp = (major >= 7);
1288 return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
1293 hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
1294 device->get()) == hipSuccess);
1295 size_t sizes[1] = {
static_cast<size_t>(warpSize)};
1296 return getInfoArray<size_t>(1, param_value_size, param_value,
1297 param_value_size_ret, sizes);
1302 hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate,
1303 device->get()) == hipSuccess);
1305 return getInfo(param_value_size, param_value, param_value_size_ret,
1309 auto bits =
pi_uint32{std::numeric_limits<uintptr_t>::digits};
1310 return getInfo(param_value_size, param_value, param_value_size_ret, bits);
1323 auto quarter_global =
static_cast<pi_uint32>(global / 4u);
1325 auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
1326 32u * 1024u * 1024u);
1328 return getInfo(param_value_size, param_value, param_value_size_ret,
1332 return getInfo(param_value_size, param_value, param_value_size_ret,
1339 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1345 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1352 hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight,
1353 device->get()) == hipSuccess);
1355 int surf_height = 0;
1357 hipDeviceGetAttribute(&surf_height,
1358 hipDeviceAttributeMaxTexture2DHeight,
1359 device->get()) == hipSuccess);
1362 int min = std::min(tex_height, surf_height);
1364 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1370 hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth,
1371 device->get()) == hipSuccess);
1375 hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth,
1376 device->get()) == hipSuccess);
1379 int min = std::min(tex_width, surf_width);
1381 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1387 hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight,
1388 device->get()) == hipSuccess);
1390 int surf_height = 0;
1392 hipDeviceGetAttribute(&surf_height,
1393 hipDeviceAttributeMaxTexture3DHeight,
1394 device->get()) == hipSuccess);
1397 int min = std::min(tex_height, surf_height);
1399 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1405 hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth,
1406 device->get()) == hipSuccess);
1410 hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth,
1411 device->get()) == hipSuccess);
1414 int min = std::min(tex_width, surf_width);
1416 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1422 hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth,
1423 device->get()) == hipSuccess);
1427 hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth,
1428 device->get()) == hipSuccess);
1431 int min = std::min(tex_depth, surf_depth);
1433 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1439 hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth,
1440 device->get()) == hipSuccess);
1444 hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth,
1445 device->get()) == hipSuccess);
1448 int min = std::min(tex_width, surf_width);
1450 return getInfo(param_value_size, param_value, param_value_size_ret,
min);
1453 return getInfo(param_value_size, param_value, param_value_size_ret,
1459 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1464 return getInfo(param_value_size, param_value, param_value_size_ret,
1468 int mem_base_addr_align = 0;
1470 hipDeviceGetAttribute(&mem_base_addr_align,
1471 hipDeviceAttributeTextureAlignment,
1472 device->get()) == hipSuccess);
1474 mem_base_addr_align *= 8;
1475 return getInfo(param_value_size, param_value, param_value_size_ret,
1476 mem_base_addr_align);
1479 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1485 return getInfo(param_value_size, param_value, param_value_size_ret, config);
1490 return getInfo(param_value_size, param_value, param_value_size_ret, config);
1493 return getInfo(param_value_size, param_value, param_value_size_ret,
1499 return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1504 hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize,
1505 device->get()) == hipSuccess);
1508 return getInfo(param_value_size, param_value, param_value_size_ret,
1516 return getInfo(param_value_size, param_value, param_value_size_ret,
1520 unsigned int constant_memory = 0;
1527 hipDeviceGetAttribute(
reinterpret_cast<int *
>(&constant_memory),
1528 hipDeviceAttributeTotalConstantMemory,
1529 device->get()) == hipSuccess);
1531 return getInfo(param_value_size, param_value, param_value_size_ret,
1538 return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
1541 return getInfo(param_value_size, param_value, param_value_size_ret,
1548 int local_mem_size = 0;
1550 hipDeviceGetAttribute(&local_mem_size,
1551 hipDeviceAttributeMaxSharedMemoryPerBlock,
1552 device->get()) == hipSuccess);
1554 return getInfo(param_value_size, param_value, param_value_size_ret,
1558 int ecc_enabled = 0;
1560 hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled,
1561 device->get()) == hipSuccess);
1564 auto result =
static_cast<pi_bool>(ecc_enabled);
1565 return getInfo(param_value_size, param_value, param_value_size_ret, result);
1568 int is_integrated = 0;
1570 hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated,
1571 device->get()) == hipSuccess);
1574 auto result =
static_cast<pi_bool>(is_integrated);
1575 return getInfo(param_value_size, param_value, param_value_size_ret, result);
1580 return getInfo(param_value_size, param_value, param_value_size_ret,
1584 return getInfo(param_value_size, param_value, param_value_size_ret,
1588 return getInfo(param_value_size, param_value, param_value_size_ret,
1592 return getInfo(param_value_size, param_value, param_value_size_ret,
1596 return getInfo(param_value_size, param_value, param_value_size_ret,
1600 return getInfo(param_value_size, param_value, param_value_size_ret,
1605 return getInfo(param_value_size, param_value, param_value_size_ret,
1612 return getInfo(param_value_size, param_value, param_value_size_ret,
1618 return getInfo(param_value_size, param_value, param_value_size_ret,
1624 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
1627 return getInfo(param_value_size, param_value, param_value_size_ret,
1628 device->get_platform());
1631 static constexpr
size_t MAX_DEVICE_NAME_LENGTH = 256u;
1632 char name[MAX_DEVICE_NAME_LENGTH];
1634 device->get()) == hipSuccess);
1638 if (strlen(name) == 0) {
1639 hipDeviceProp_t props;
1641 hipGetDeviceProperties(&props, device->get()) == hipSuccess);
1643 return getInfoArray(strlen(props.gcnArchName) + 1, param_value_size,
1644 param_value, param_value_size_ret, props.gcnArchName);
1646 return getInfoArray(strlen(name) + 1, param_value_size, param_value,
1647 param_value_size_ret, name);
1650 return getInfo(param_value_size, param_value, param_value_size_ret,
1654 auto version = getHipVersionString();
1655 return getInfo(param_value_size, param_value, param_value_size_ret,
1659 return getInfo(param_value_size, param_value, param_value_size_ret,
"HIP");
1662 return getInfo(param_value_size, param_value, param_value_size_ret,
1663 device->get_reference_count());
1666 return getInfo(param_value_size, param_value, param_value_size_ret,
1670 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
1677 std::string SupportedExtensions =
"";
1679 SupportedExtensions +=
" ";
1681 hipDeviceProp_t props;
1684 if (props.arch.hasDoubles) {
1685 SupportedExtensions +=
"cl_khr_fp64 ";
1688 return getInfo(param_value_size, param_value, param_value_size_ret,
1689 SupportedExtensions.c_str());
1693 return getInfo(param_value_size, param_value, param_value_size_ret,
1697 return getInfo(param_value_size, param_value, param_value_size_ret,
1701 return getInfo(param_value_size, param_value, param_value_size_ret,
1705 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1708 return getInfo(param_value_size, param_value, param_value_size_ret,
1712 return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1715 return getInfo(param_value_size, param_value, param_value_size_ret,
1730 if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1742 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1753 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1762 if (getAttribute(device, hipDeviceAttributeManagedMemory)) {
1766 if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) {
1770 if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1776 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1788 if (getAttribute(device, hipDeviceAttributeManagedMemory)) {
1792 if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) {
1798 if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1806 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1816 if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) {
1821 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1826 hipDeviceProp_t props;
1829 return getInfo(param_value_size, param_value, param_value_size_ret,
1830 props.arch.hasGlobalInt64Atomics &&
1831 props.arch.hasSharedInt64Atomics);
1835 size_t FreeMemory = 0;
1836 size_t TotalMemory = 0;
1839 "failed hipMemGetInfo() API.");
1840 return getInfo(param_value_size, param_value, param_value_size_ret,
1847 hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryClockRate,
1848 device->get()) == hipSuccess);
1851 return getInfo(param_value_size, param_value, param_value_size_ret,
1858 hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryBusWidth,
1859 device->get()) == hipSuccess);
1861 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1864 return getInfo(param_value_size, param_value, param_value_size_ret,
1872 return getInfo(param_value_size, param_value, param_value_size_ret,
1885 return getInfo(param_value_size, param_value, param_value_size_ret,
1894 return getInfo(param_value_size, param_value, param_value_size_ret,
1901 hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId,
1902 device->get()) == hipSuccess);
1904 return getInfo(param_value_size, param_value, param_value_size_ret, value);
1908 #if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) || \
1909 HIP_VERSION_MAJOR > 5)
1914 std::array<unsigned char, 16> name;
1915 std::copy(uuid.bytes, uuid.bytes + 16, name.begin());
1916 return getInfoArray(16, param_value_size, param_value, param_value_size_ret,
1919 return PI_ERROR_INVALID_VALUE;
1933 PI_ERROR_INVALID_ARG_VALUE);
1934 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
1971 "Creation of PI device from native handle not implemented");
1998 const void *private_info,
1999 size_t cb,
void *user_data),
2002 assert(devices !=
nullptr);
2005 assert(user_data ==
nullptr);
2006 assert(num_devices == 1);
2008 assert(retcontext !=
nullptr);
2012 bool property_hip_primary =
false;
2013 while (properties && (0 != *properties)) {
2023 property_hip_primary =
static_cast<bool>(value);
2028 "Unknown piContextCreate property in property list");
2029 return PI_ERROR_INVALID_VALUE;
2033 std::unique_ptr<_pi_context> piContextPtr{
nullptr};
2035 hipCtx_t current =
nullptr;
2037 if (property_hip_primary) {
2042 PI_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Ctxt, devices[0]->
get()));
2043 piContextPtr = std::unique_ptr<_pi_context>(
2045 errcode_ret = PI_CHECK_ERROR(hipCtxPushCurrent(Ctxt));
2048 hipCtx_t newContext;
2049 PI_CHECK_ERROR(hipCtxGetCurrent(¤t));
2050 errcode_ret = PI_CHECK_ERROR(
2051 hipCtxCreate(&newContext, hipDeviceMapHost, devices[0]->
get()));
2052 piContextPtr = std::unique_ptr<_pi_context>(
new _pi_context{
2056 static std::once_flag initFlag;
2071 if (current !=
nullptr) {
2072 PI_CHECK_ERROR(hipCtxSetCurrent(current));
2075 *retcontext = piContextPtr.release();
2079 errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
2086 assert(ctxt !=
nullptr);
2093 std::unique_ptr<_pi_context> context{ctxt};
2096 hipCtx_t hipCtxt = ctxt->
get();
2099 #if defined(__HIP_PLATFORM_NVIDIA__)
2100 hipCtx_t current =
nullptr;
2101 PI_CHECK_ERROR(hipCtxGetCurrent(¤t));
2102 if (hipCtxt != current) {
2103 PI_CHECK_ERROR(hipCtxPushCurrent(hipCtxt));
2105 PI_CHECK_ERROR(hipCtxSynchronize());
2106 PI_CHECK_ERROR(hipCtxGetCurrent(¤t));
2107 if (hipCtxt == current) {
2108 PI_CHECK_ERROR(hipCtxPopCurrent(¤t));
2111 return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt));
2116 PI_CHECK_ERROR(hipCtxPopCurrent(¤t));
2117 return PI_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev));
2120 hipCtx_t hipCtxt = ctxt->
get();
2121 return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt));
2147 bool ownNativeHandle,
2152 (void)ownNativeHandle;
2155 "Creation of PI context from native handle not implemented");
2167 assert(ret_mem !=
nullptr);
2168 assert((properties ==
nullptr || *properties == 0) &&
2169 "no mem properties goes to HIP RT yet");
2173 const bool enableUseHostPtr =
false;
2174 const bool performInitialCopy =
2178 pi_mem retMemObj =
nullptr;
2181 ScopedContext active(context);
2187 retErr = PI_CHECK_ERROR(
2188 hipHostRegister(
host_ptr, size, hipHostRegisterMapped));
2189 retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr,
host_ptr, 0));
2192 retErr = PI_CHECK_ERROR(hipHostMalloc(&
host_ptr, size));
2193 retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr,
host_ptr, 0));
2196 retErr = PI_CHECK_ERROR(hipMalloc(&ptr, size));
2202 if (retErr == PI_SUCCESS) {
2203 pi_mem parentBuffer =
nullptr;
2206 reinterpret_cast<_pi_mem::mem_::mem_::buffer_mem_::native_type
>(ptr);
2207 auto piMemObj = std::unique_ptr<_pi_mem>(
new _pi_mem{
2208 context, parentBuffer, allocMode, devPtr,
host_ptr, size});
2209 if (piMemObj !=
nullptr) {
2210 retMemObj = piMemObj.release();
2211 if (performInitialCopy) {
2213 retErr = PI_CHECK_ERROR(hipMemcpyHtoD(devPtr,
host_ptr, size));
2217 if (retErr == PI_SUCCESS) {
2218 hipStream_t defaultStream = 0;
2219 retErr = PI_CHECK_ERROR(hipStreamSynchronize(defaultStream));
2223 retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2229 retErr = PI_ERROR_OUT_OF_RESOURCES;
2232 *ret_mem = retMemObj;
2242 assert((memObj !=
nullptr) &&
"PI_ERROR_INVALID_MEM_OBJECTS");
2254 std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2260 ScopedContext active(uniqueMemObj->get_context());
2263 switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2266 ret = PI_CHECK_ERROR(
2267 hipFree((
void *)uniqueMemObj->mem_.buffer_mem_.ptr_));
2270 ret = PI_CHECK_ERROR(
2271 hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2274 ret = PI_CHECK_ERROR(
2275 hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2280 ret = PI_CHECK_ERROR(hipDestroySurfaceObject(
2281 uniqueMemObj->mem_.surface_mem_.get_surface()));
2282 auto array = uniqueMemObj->mem_.surface_mem_.get_array();
2283 ret = PI_CHECK_ERROR(hipFreeArray(array));
2289 ret = PI_ERROR_OUT_OF_RESOURCES;
2292 if (ret != PI_SUCCESS) {
2298 "Unrecoverable program state reached in hip_piMemRelease");
2310 void *buffer_create_info,
pi_mem *memObj) {
2311 assert((parent_buffer !=
nullptr) &&
"PI_ERROR_INVALID_MEM_OBJECT");
2312 assert(parent_buffer->
is_buffer() &&
"PI_ERROR_INVALID_MEM_OBJECTS");
2313 assert(!parent_buffer->
is_sub_buffer() &&
"PI_ERROR_INVALID_MEM_OBJECT");
2322 "PI_ERROR_INVALID_VALUE");
2323 assert((buffer_create_info !=
nullptr) &&
"PI_ERROR_INVALID_VALUE");
2324 assert(memObj !=
nullptr);
2326 const auto bufferRegion =
2328 assert((bufferRegion.size != 0u) &&
"PI_ERROR_INVALID_BUFFER_SIZE");
2330 assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2332 assert(((bufferRegion.origin + bufferRegion.size) <=
2334 "PI_ERROR_INVALID_BUFFER_SIZE");
2345 void *hostPtr =
nullptr;
2348 bufferRegion.origin;
2353 std::unique_ptr<_pi_mem> retMemObj{
nullptr};
2355 ScopedContext active(context);
2357 retMemObj = std::unique_ptr<_pi_mem>{
new _pi_mem{
2358 context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2364 return PI_ERROR_OUT_OF_HOST_MEMORY;
2368 *memObj = retMemObj.release();
2373 size_t expectedQuerySize,
void *queryOutput,
2374 size_t *writtenQuerySize) {
2377 (void)expectedQuerySize;
2379 (void)writtenQuerySize;
2392 #if defined(__HIP_PLATFORM_NVIDIA__)
2403 return PI_ERROR_INVALID_MEM_OBJECT;
2407 #elif defined(__HIP_PLATFORM_AMD__)
2411 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
2429 bool ownNativeHandle,
2433 (void)ownNativeHandle;
2437 "Creation of PI mem from native handle not implemented");
2450 std::unique_ptr<_pi_queue> queueImpl{
nullptr};
2454 return PI_ERROR_INVALID_DEVICE;
2457 unsigned int flags = 0;
2459 const bool is_out_of_order =
2462 std::vector<hipStream_t> computeHipStreams(
2464 std::vector<hipStream_t> transferHipStreams(
2467 queueImpl = std::unique_ptr<_pi_queue>(
new _pi_queue{
2468 std::move(computeHipStreams), std::move(transferHipStreams), context,
2469 device, properties, flags});
2471 *queue = queueImpl.release();
2480 return PI_ERROR_OUT_OF_RESOURCES;
2490 return PI_ERROR_INVALID_VALUE;
2493 assert(Properties[2] == 0);
2494 if (Properties[2] != 0)
2495 return PI_ERROR_INVALID_VALUE;
2500 size_t param_value_size,
void *param_value,
2501 size_t *param_value_size_ret) {
2502 assert(command_queue !=
nullptr);
2504 switch (param_name) {
2506 return getInfo(param_value_size, param_value, param_value_size_ret,
2509 return getInfo(param_value_size, param_value, param_value_size_ret,
2512 return getInfo(param_value_size, param_value, param_value_size_ret,
2515 return getInfo(param_value_size, param_value, param_value_size_ret,
2518 bool IsReady = command_queue->
all_of([](hipStream_t
s) ->
bool {
2519 const hipError_t ret = hipStreamQuery(
s);
2520 if (ret == hipSuccess)
2523 if (ret == hipErrorNotReady)
2526 PI_CHECK_ERROR(ret);
2529 return getInfo(param_value_size, param_value, param_value_size_ret,
2540 assert(command_queue !=
nullptr);
2548 assert(command_queue !=
nullptr);
2555 std::unique_ptr<_pi_queue> queueImpl(command_queue);
2557 ScopedContext active(command_queue->
get_context());
2560 PI_CHECK_ERROR(hipStreamSynchronize(
s));
2561 PI_CHECK_ERROR(hipStreamDestroy(
s));
2568 return PI_ERROR_OUT_OF_RESOURCES;
2575 pi_result result = PI_ERROR_OUT_OF_HOST_MEMORY;
2579 assert(command_queue !=
2581 ScopedContext active(command_queue->
get_context());
2583 command_queue->
sync_streams<
true>([&result](hipStream_t
s) {
2584 result = PI_CHECK_ERROR(hipStreamSynchronize(
s));
2593 result = PI_ERROR_OUT_OF_RESOURCES;
2603 (void)command_queue;
2636 bool ownNativeHandle,
2642 (void)ownNativeHandle;
2644 "Creation of PI queue from native handle not implemented");
2649 pi_bool blocking_write,
size_t offset,
2650 size_t size,
void *ptr,
2655 assert(buffer !=
nullptr);
2656 assert(command_queue !=
nullptr);
2658 std::unique_ptr<_pi_event> retImplEv{
nullptr};
2661 ScopedContext active(command_queue->
get_context());
2663 retErr = enqueueEventsWait(command_queue, hipStream,
2664 num_events_in_wait_list, event_wait_list);
2672 retErr = PI_CHECK_ERROR(
2674 ptr, size, hipStream));
2677 retErr = retImplEv->record();
2680 if (blocking_write) {
2681 retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
2685 *
event = retImplEv.release();
2694 pi_bool blocking_read,
size_t offset,
2695 size_t size,
void *ptr,
2700 assert(buffer !=
nullptr);
2701 assert(command_queue !=
nullptr);
2703 std::unique_ptr<_pi_event> retImplEv{
nullptr};
2706 ScopedContext active(command_queue->
get_context());
2708 retErr = enqueueEventsWait(command_queue, hipStream,
2709 num_events_in_wait_list, event_wait_list);
2717 retErr = PI_CHECK_ERROR(hipMemcpyDtoHAsync(
2722 retErr = retImplEv->record();
2725 if (blocking_read) {
2726 retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
2730 *
event = retImplEv.release();
2742 assert(num_events != 0);
2744 if (num_events == 0) {
2745 return PI_ERROR_INVALID_VALUE;
2749 return PI_ERROR_INVALID_EVENT;
2753 ScopedContext active(context);
2757 return PI_ERROR_INVALID_EVENT;
2760 if (event->get_context() != context) {
2761 return PI_ERROR_INVALID_CONTEXT;
2764 return event->wait();
2766 return forLatestEvents(event_list, num_events, waitFunc);
2770 return PI_ERROR_OUT_OF_RESOURCES;
2776 assert(kernel !=
nullptr);
2777 assert(program !=
nullptr);
2780 std::unique_ptr<_pi_kernel> retKernel{
nullptr};
2785 hipFunction_t hipFunc;
2786 retErr = PI_CHECK_ERROR(
2787 hipModuleGetFunction(&hipFunc, program->
get(), kernel_name));
2789 std::string kernel_name_woffset = std::string(kernel_name) +
"_with_offset";
2790 hipFunction_t hipFuncWithOffsetParam;
2791 hipError_t offsetRes = hipModuleGetFunction(
2792 &hipFuncWithOffsetParam, program->
get(), kernel_name_woffset.c_str());
2795 if (offsetRes == hipErrorNotFound) {
2796 hipFuncWithOffsetParam =
nullptr;
2798 retErr = PI_CHECK_ERROR(offsetRes);
2801 retKernel = std::unique_ptr<_pi_kernel>(
2802 new _pi_kernel{hipFunc, hipFuncWithOffsetParam, kernel_name, program,
2807 retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2810 *kernel = retKernel.release();
2815 size_t arg_size,
const void *arg_value) {
2817 assert(kernel !=
nullptr);
2821 kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2823 kernel->set_kernel_local_arg(arg_index, arg_size);
2832 const pi_mem *arg_value) {
2834 assert(kernel !=
nullptr);
2835 assert(arg_value !=
nullptr);
2839 pi_mem arg_mem = *arg_value;
2843 hipArray_Format Format;
2845 getArrayDesc(array, Format, NumChannels);
2846 if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 &&
2847 Format != HIP_AD_FORMAT_SIGNED_INT32 &&
2848 Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) {
2850 "PI HIP kernels only support images with channel types int32, "
2851 "uint32, float, and half.");
2854 kernel->set_kernel_arg(arg_index,
sizeof(hipSurf), (
void *)&hipSurf);
2859 kernel->set_kernel_arg(arg_index,
sizeof(
void *), (
void *)&hipPtr);
2870 assert(kernel !=
nullptr);
2871 assert(arg_value !=
nullptr);
2875 pi_uint32 samplerProps = (*arg_value)->props_;
2876 kernel->set_kernel_arg(arg_index,
sizeof(
pi_uint32), (
void *)&samplerProps);
2885 const size_t *global_work_offset,
const size_t *global_work_size,
2886 const size_t *local_work_size,
pi_uint32 num_events_in_wait_list,
2890 assert(command_queue !=
nullptr);
2891 assert(command_queue->
get_context() == kernel->get_context());
2892 assert(kernel !=
nullptr);
2893 assert(global_work_offset !=
nullptr);
2894 assert(work_dim > 0);
2895 assert(work_dim < 4);
2897 if (*global_work_size == 0) {
2899 command_queue, num_events_in_wait_list, event_wait_list, event);
2904 size_t threadsPerBlock[3] = {32u, 1u, 1u};
2905 size_t maxWorkGroupSize = 0u;
2906 size_t maxThreadsPerBlock[3] = {};
2907 bool providedLocalWorkGroupSize = (local_work_size !=
nullptr);
2912 sizeof(maxThreadsPerBlock), maxThreadsPerBlock,
nullptr);
2913 assert(retError == PI_SUCCESS);
2918 sizeof(maxWorkGroupSize), &maxWorkGroupSize,
nullptr);
2919 assert(retError == PI_SUCCESS);
2923 if (providedLocalWorkGroupSize) {
2924 auto isValid = [&](
int dim) {
2925 if (local_work_size[dim] > maxThreadsPerBlock[dim])
2926 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2930 if (0u == local_work_size[dim])
2931 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2932 if (0u != (global_work_size[dim] % local_work_size[dim]))
2933 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2934 threadsPerBlock[dim] = local_work_size[dim];
2938 for (
size_t dim = 0; dim < work_dim; dim++) {
2939 auto err = isValid(dim);
2940 if (err != PI_SUCCESS)
2944 simpleGuessLocalWorkSize(threadsPerBlock, global_work_size,
2945 maxThreadsPerBlock, kernel);
2949 if (maxWorkGroupSize <
2950 size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
2951 return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2954 size_t blocksPerGrid[3] = {1u, 1u, 1u};
2956 for (
size_t i = 0; i < work_dim; i++) {
2958 (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
2962 std::unique_ptr<_pi_event> retImplEv{
nullptr};
2965 ScopedContext active(command_queue->
get_context());
2970 num_events_in_wait_list, event_wait_list, guard, &stream_token);
2971 hipFunction_t hipFunc = kernel->get();
2973 retError = enqueueEventsWait(command_queue, hipStream,
2974 num_events_in_wait_list, event_wait_list);
2977 if (kernel->get_with_offset_parameter()) {
2978 std::uint32_t hip_implicit_offset[3] = {0, 0, 0};
2979 if (global_work_offset) {
2980 for (
size_t i = 0; i < work_dim; i++) {
2981 hip_implicit_offset[i] =
2982 static_cast<std::uint32_t
>(global_work_offset[i]);
2983 if (global_work_offset[i] != 0) {
2984 hipFunc = kernel->get_with_offset_parameter();
2988 kernel->set_implicit_offset_arg(
sizeof(hip_implicit_offset),
2989 hip_implicit_offset);
2992 auto argIndices = kernel->get_arg_indices();
2995 retImplEv = std::unique_ptr<_pi_event>(
2997 hipStream, stream_token));
3002 static const char *local_mem_sz_ptr =
3003 std::getenv(
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
3005 if (local_mem_sz_ptr) {
3006 int device_max_local_mem = 0;
3007 retError = PI_CHECK_ERROR(hipDeviceGetAttribute(
3008 &device_max_local_mem, hipDeviceAttributeMaxSharedMemoryPerBlock,
3011 static const int env_val = std::atoi(local_mem_sz_ptr);
3012 if (env_val <= 0 || env_val > device_max_local_mem) {
3014 "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
3015 PI_ERROR_PLUGIN_SPECIFIC_ERROR);
3016 return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3018 retError = PI_CHECK_ERROR(hipFuncSetAttribute(
3019 hipFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val));
3022 retError = PI_CHECK_ERROR(hipModuleLaunchKernel(
3023 hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
3024 threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2],
3025 kernel->get_local_size(), hipStream, argIndices.data(),
nullptr));
3027 kernel->clear_local_size();
3030 retError = retImplEv->record();
3031 *
event = retImplEv.release();
3042 size_t cb_args,
pi_uint32 num_mem_objects,
3043 const pi_mem *mem_list,
const void **args_mem_loc,
3050 (void)num_mem_objects;
3053 (void)num_events_in_wait_list;
3054 (void)event_wait_list;
3069 assert(ret_mem !=
nullptr);
3079 "hip_piMemImageCreate only supports RGBA channel order");
3085 HIP_ARRAY3D_DESCRIPTOR array_desc;
3086 array_desc.NumChannels = 4;
3087 array_desc.Flags = 0;
3090 array_desc.Height = 0;
3091 array_desc.Depth = 0;
3094 array_desc.Depth = 0;
3101 size_t pixel_type_size_bytes;
3106 array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8;
3107 pixel_type_size_bytes = 1;
3110 array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8;
3111 pixel_type_size_bytes = 1;
3115 array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16;
3116 pixel_type_size_bytes = 2;
3119 array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16;
3120 pixel_type_size_bytes = 2;
3123 array_desc.Format = HIP_AD_FORMAT_HALF;
3124 pixel_type_size_bytes = 2;
3127 array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32;
3128 pixel_type_size_bytes = 4;
3131 array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32;
3132 pixel_type_size_bytes = 4;
3135 array_desc.Format = HIP_AD_FORMAT_FLOAT;
3136 pixel_type_size_bytes = 4;
3140 "hip_piMemImageCreate given unsupported image_channel_data_type");
3144 size_t pixel_size_bytes =
3145 pixel_type_size_bytes * 4;
3146 size_t image_size_bytes = pixel_size_bytes * image_desc->
image_width *
3149 ScopedContext active(context);
3150 hipArray *image_array;
3151 retErr = PI_CHECK_ERROR(hipArray3DCreate(
3152 reinterpret_cast<hipCUarray *
>(&image_array), &array_desc));
3155 if (performInitialCopy) {
3158 retErr = PI_CHECK_ERROR(
3159 hipMemcpyHtoA(image_array, 0,
host_ptr, image_size_bytes));
3161 hip_Memcpy2D cpy_desc;
3162 memset(&cpy_desc, 0,
sizeof(cpy_desc));
3163 cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost;
3165 cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray;
3166 cpy_desc.dstArray =
reinterpret_cast<hipCUarray
>(image_array);
3167 cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->
image_width;
3169 retErr = PI_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc));
3171 HIP_MEMCPY3D cpy_desc;
3172 memset(&cpy_desc, 0,
sizeof(cpy_desc));
3173 cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost;
3175 cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray;
3176 cpy_desc.dstArray =
reinterpret_cast<hipCUarray
>(image_array);
3177 cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->
image_width;
3180 retErr = PI_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc));
3191 hipResourceDesc image_res_desc;
3192 image_res_desc.res.array.array = image_array;
3193 image_res_desc.resType = hipResourceTypeArray;
3195 hipSurfaceObject_t surface;
3196 retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc));
3198 auto piMemObj = std::unique_ptr<_pi_mem>(
new _pi_mem{
3201 if (piMemObj ==
nullptr) {
3202 return PI_ERROR_OUT_OF_HOST_MEMORY;
3205 *ret_mem = piMemObj.release();
3207 PI_CHECK_ERROR(hipFreeArray(image_array));
3210 PI_CHECK_ERROR(hipFreeArray(image_array));
3211 return PI_ERROR_UNKNOWN;
3218 size_t param_value_size,
void *param_value,
3219 size_t *param_value_size_ret) {
3222 (void)param_value_size;
3224 (void)param_value_size_ret;
3231 assert(mem !=
nullptr);
3241 const char **strings,
3242 const size_t *lengths,
3251 return PI_ERROR_INVALID_OPERATION;
3259 const pi_device *device_list,
const char *options,
3264 assert(program !=
nullptr);
3265 assert(num_devices == 1 || num_devices == 0);
3266 assert(device_list !=
nullptr || num_devices == 0);
3268 assert(user_data ==
nullptr);
3302 const size_t *lengths,
const unsigned char **binaries,
3305 (void)num_metadata_entries;
3307 (void)binary_status;
3309 assert(context !=
nullptr);
3310 assert(binaries !=
nullptr);
3311 assert(program !=
nullptr);
3312 assert(device_list !=
nullptr);
3313 assert(num_devices == 1 &&
"HIP contexts are for a single device");
3315 "Mismatch between devices context and passed context when creating "
3316 "program from binary");
3320 std::unique_ptr<_pi_program> retProgram{
new _pi_program{context}};
3325 const bool has_length = (lengths !=
nullptr);
3326 size_t length = has_length
3328 : strlen(
reinterpret_cast<const char *
>(binaries[0])) + 1;
3330 assert(length != 0);
3332 retProgram->set_binary(
reinterpret_cast<const char *
>(binaries[0]), length);
3334 *program = retProgram.release();
3340 size_t param_value_size,
void *param_value,
3341 size_t *param_value_size_ret) {
3342 assert(program !=
nullptr);
3344 switch (param_name) {
3346 return getInfo(param_value_size, param_value, param_value_size_ret,
3349 return getInfo(param_value_size, param_value, param_value_size_ret,
3352 return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3354 return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3357 return getInfo(param_value_size, param_value, param_value_size_ret,
3360 return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3363 return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3366 return getInfo(param_value_size, param_value, param_value_size_ret,
3377 const pi_device *device_list,
const char *options,
3387 (void)num_input_programs;
3388 (void)input_programs;
3393 "hip_piProgramLink: linking not supported with hip backend");
3403 const char *options,
pi_uint32 num_input_headers,
3404 const pi_program *input_headers,
const char **header_include_names,
3406 (void)input_headers;
3407 (void)header_include_names;
3409 assert(program !=
nullptr);
3410 assert(num_devices == 1 || num_devices == 0);
3411 assert(device_list !=
nullptr || num_devices == 0);
3413 assert(user_data ==
nullptr);
3414 assert(num_input_headers == 0);
3430 size_t param_value_size,
void *param_value,
3431 size_t *param_value_size_ret) {
3434 assert(program !=
nullptr);
3436 switch (param_name) {
3438 return getInfo(param_value_size, param_value, param_value_size_ret,
3442 return getInfo(param_value_size, param_value, param_value_size_ret,
3446 param_value_size_ret, program->
infoLog_);
3455 assert(program !=
nullptr);
3465 assert(program !=
nullptr);
3470 "Reference count overflow detected in hip_piProgramRelease.");
3475 std::unique_ptr<_pi_program> program_ptr{program};
3477 pi_result result = PI_ERROR_INVALID_PROGRAM;
3481 auto hipModule = program->
get();
3482 result = PI_CHECK_ERROR(hipModuleUnload(hipModule));
3484 result = PI_ERROR_OUT_OF_RESOURCES;
3518 bool ownNativeHandle,
3522 (void)ownNativeHandle;
3526 "Creation of PI program from native handle not implemented");
3531 size_t param_value_size,
void *param_value,
3532 size_t *param_value_size_ret) {
3534 if (kernel !=
nullptr) {
3536 switch (param_name) {
3538 return getInfo(param_value_size, param_value, param_value_size_ret,
3539 kernel->get_name());
3541 return getInfo(param_value_size, param_value, param_value_size_ret,
3542 kernel->get_num_args());
3544 return getInfo(param_value_size, param_value, param_value_size_ret,
3545 kernel->get_reference_count());
3547 return getInfo(param_value_size, param_value, param_value_size_ret,
3548 kernel->get_context());
3551 return getInfo(param_value_size, param_value, param_value_size_ret,
3552 kernel->get_program());
3555 return getInfo(param_value_size, param_value, param_value_size_ret,
"");
3563 return PI_ERROR_INVALID_KERNEL;
3568 size_t param_value_size,
void *param_value,
3569 size_t *param_value_size_ret) {
3573 if (kernel !=
nullptr) {
3575 switch (param_name) {
3577 int max_threads = 0;
3579 hipFuncGetAttribute(&max_threads,
3580 HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3581 kernel->get()) == hipSuccess);
3582 return getInfo(param_value_size, param_value, param_value_size_ret,
3583 size_t(max_threads));
3592 size_t group_size[3] = {0, 0, 0};
3594 param_value_size_ret, group_size);
3600 hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
3601 kernel->get()) == hipSuccess);
3602 return getInfo(param_value_size, param_value, param_value_size_ret,
3609 hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
3610 device->get()) == hipSuccess);
3611 return getInfo(param_value_size, param_value, param_value_size_ret,
3612 static_cast<size_t>(warpSize));
3618 hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
3619 kernel->get()) == hipSuccess);
3620 return getInfo(param_value_size, param_value, param_value_size_ret,
3625 "piKernelGetGroupInfo not implemented\n");
3634 return PI_ERROR_INVALID_KERNEL;
3639 size_t input_value_size,
const void *input_value,
size_t param_value_size,
3640 void *param_value,
size_t *param_value_size_ret) {
3641 (void)input_value_size;
3644 if (kernel !=
nullptr) {
3645 switch (param_name) {
3650 hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
3651 device->get()) == hipSuccess);
3652 return getInfo(param_value_size, param_value, param_value_size_ret,
3653 static_cast<uint32_t
>(warpSize));
3657 int max_threads = 0;
3659 hipFuncGetAttribute(&max_threads,
3660 HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3661 kernel->get()) == hipSuccess);
3664 0,
nullptr,
sizeof(uint32_t), &warpSize,
3666 int maxWarps = (max_threads + warpSize - 1) / warpSize;
3667 return getInfo(param_value_size, param_value, param_value_size_ret,
3668 static_cast<uint32_t
>(maxWarps));
3673 return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3680 return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3686 return PI_ERROR_INVALID_KERNEL;
3690 assert(kernel !=
nullptr);
3691 assert(kernel->get_reference_count() > 0u);
3693 kernel->increment_reference_count();
3698 assert(kernel !=
nullptr);
3702 assert(kernel->get_reference_count() != 0 &&
3703 "Reference count overflow detected in hip_piKernelRelease.");
3706 if (kernel->decrement_reference_count() == 0) {
3718 size_t param_value_size,
3719 const void *param_value) {
3722 (void)param_value_size;
3729 size_t,
const void *) {
3737 size_t arg_size,
const void *arg_value) {
3738 kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3753 size_t param_value_size,
void *param_value,
3754 size_t *param_value_size_ret) {
3755 assert(event !=
nullptr);
3757 switch (param_name) {
3759 return getInfo(param_value_size, param_value, param_value_size_ret,
3760 event->get_queue());
3762 return getInfo(param_value_size, param_value, param_value_size_ret,
3763 event->get_command_type());
3765 return getInfo(param_value_size, param_value, param_value_size_ret,
3766 event->get_reference_count());
3768 return getInfo(param_value_size, param_value, param_value_size_ret,
3772 return getInfo(param_value_size, param_value, param_value_size_ret,
3773 event->get_context());
3778 return PI_ERROR_INVALID_EVENT;
3785 size_t param_value_size,
3787 size_t *param_value_size_ret) {
3789 assert(event !=
nullptr);
3791 pi_queue queue =
event->get_queue();
3792 if (queue ==
nullptr ||
3794 return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3797 switch (param_name) {
3801 return getInfo<pi_uint64>(param_value_size, param_value,
3802 param_value_size_ret, event->get_queued_time());
3804 return getInfo<pi_uint64>(param_value_size, param_value,
3805 param_value_size_ret, event->get_start_time());
3807 return getInfo<pi_uint64>(param_value_size, param_value,
3808 param_value_size_ret, event->get_end_time());
3817 pi_int32 command_exec_callback_type,
3820 (void)command_exec_callback_type;
3830 (void)execution_status;
3833 return PI_ERROR_INVALID_VALUE;
3837 assert(event !=
nullptr);
3839 const auto refCount =
event->increment_reference_count();
3842 refCount != 0,
"Reference count overflow detected in hip_piEventRetain.");
3848 assert(event !=
nullptr);
3853 event->get_reference_count() != 0,
3854 "Reference count overflow detected in hip_piEventRelease.");
3857 if (event->decrement_reference_count() == 0) {
3858 std::unique_ptr<_pi_event> event_ptr{
event};
3859 pi_result result = PI_ERROR_INVALID_EVENT;
3861 ScopedContext active(event->get_context());
3862 result =
event->release();
3864 result = PI_ERROR_OUT_OF_RESOURCES;
3884 command_queue, num_events_in_wait_list, event_wait_list, event);
3896 if (!command_queue) {
3897 return PI_ERROR_INVALID_QUEUE;
3903 ScopedContext active(command_queue->
get_context());
3907 num_events_in_wait_list, event_wait_list, guard, &stream_token);
3909 std::lock_guard<std::mutex> guard(command_queue->
barrier_mutex_);
3913 if (num_events_in_wait_list == 0) {
3920 if (hipStream != s) {
3921 PI_CHECK_ERROR(hipEventRecord(tmp_event, s));
3922 PI_CHECK_ERROR(hipStreamWaitEvent(hipStream, tmp_event, 0));
3926 forLatestEvents(event_wait_list, num_events_in_wait_list,
3928 if (event->get_queue()->has_been_synchronized(
3929 event->get_compute_stream_token())) {
3932 return PI_CHECK_ERROR(
3933 hipStreamWaitEvent(hipStream, event->get(), 0));
3938 result = PI_CHECK_ERROR(
3940 for (
unsigned int i = 0;
3944 for (
unsigned int i = 0;
3949 if (result != PI_SUCCESS) {
3955 hipStream, stream_token);
3964 return PI_ERROR_UNKNOWN;
3990 bool ownNativeHandle,
3994 (void)ownNativeHandle;
3998 "Creation of PI event from native handle not implemented");
4014 std::unique_ptr<_pi_sampler> retImplSampl{
new _pi_sampler(context)};
4016 bool propSeen[3] = {
false,
false,
false};
4017 for (
size_t i = 0; sampler_properties[i] != 0; i += 2) {
4018 switch (sampler_properties[i]) {
4021 return PI_ERROR_INVALID_VALUE;
4024 retImplSampl->props_ |= sampler_properties[i + 1];
4028 return PI_ERROR_INVALID_VALUE;
4031 retImplSampl->props_ |=
4036 return PI_ERROR_INVALID_VALUE;
4039 retImplSampl->props_ |=
4043 return PI_ERROR_INVALID_VALUE;
4048 retImplSampl->props_ |=
PI_TRUE;
4052 retImplSampl->props_ |=
4057 *result_sampler = retImplSampl.release();
4071 size_t param_value_size,
void *param_value,
4072 size_t *param_value_size_ret) {
4073 assert(sampler !=
nullptr);
4075 switch (param_name) {
4077 return getInfo(param_value_size, param_value, param_value_size_ret,
4080 return getInfo(param_value_size, param_value, param_value_size_ret,
4084 return getInfo(param_value_size, param_value, param_value_size_ret,
4090 return getInfo(param_value_size, param_value, param_value_size_ret,
4097 return getInfo(param_value_size, param_value, param_value_size_ret,
4112 assert(sampler !=
nullptr);
4124 assert(sampler !=
nullptr);
4130 "Reference count overflow detected in hip_piSamplerRelease.");
4148 size_t src_row_pitch,
size_t src_slice_pitch,
void *dst_ptr,
4150 size_t dst_row_pitch,
size_t dst_slice_pitch) {
4152 assert(region !=
nullptr);
4153 assert(src_offset !=
nullptr);
4154 assert(dst_offset !=
nullptr);
4156 assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost);
4157 assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost);
4159 src_row_pitch = (!src_row_pitch) ? region->
width_bytes : src_row_pitch;
4160 src_slice_pitch = (!src_slice_pitch) ? (region->
height_scalar * src_row_pitch)
4162 dst_row_pitch = (!dst_row_pitch) ? region->
width_bytes : dst_row_pitch;
4163 dst_slice_pitch = (!dst_slice_pitch) ? (region->
height_scalar * dst_row_pitch)
4166 HIP_MEMCPY3D params;
4172 params.srcMemoryType = src_type;
4173 params.srcDevice = src_type == hipMemoryTypeDevice
4174 ? *
static_cast<const hipDeviceptr_t *
>(src_ptr)
4176 params.srcHost = src_type == hipMemoryTypeHost ? src_ptr :
nullptr;
4177 params.srcXInBytes = src_offset->
x_bytes;
4178 params.srcY = src_offset->
y_scalar;
4179 params.srcZ = src_offset->
z_scalar;
4180 params.srcPitch = src_row_pitch;
4181 params.srcHeight = src_slice_pitch / src_row_pitch;
4183 params.dstMemoryType = dst_type;
4184 params.dstDevice = dst_type == hipMemoryTypeDevice
4185 ? *
reinterpret_cast<hipDeviceptr_t *
>(dst_ptr)
4187 params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr :
nullptr;
4188 params.dstXInBytes = dst_offset->
x_bytes;
4189 params.dstY = dst_offset->
y_scalar;
4190 params.dstZ = dst_offset->
z_scalar;
4191 params.dstPitch = dst_row_pitch;
4192 params.dstHeight = dst_slice_pitch / dst_row_pitch;
4194 return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(¶ms, hip_stream));
4203 size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
4204 void *ptr,
pi_uint32 num_events_in_wait_list,
4207 assert(buffer !=
nullptr);
4208 assert(command_queue !=
nullptr);
4212 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4215 ScopedContext active(command_queue->
get_context());
4218 retErr = enqueueEventsWait(command_queue, hipStream,
4219 num_events_in_wait_list, event_wait_list);
4228 hipStream, region, &devPtr, hipMemoryTypeDevice, buffer_offset,
4229 buffer_row_pitch, buffer_slice_pitch, ptr, hipMemoryTypeHost,
4230 host_offset, host_row_pitch, host_slice_pitch);
4233 retErr = retImplEv->record();
4236 if (blocking_read) {
4237 retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4241 *
event = retImplEv.release();
4254 size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
4255 const void *ptr,
pi_uint32 num_events_in_wait_list,
4258 assert(buffer !=
nullptr);
4259 assert(command_queue !=
nullptr);
4263 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4266 ScopedContext active(command_queue->
get_context());
4268 retErr = enqueueEventsWait(command_queue, hipStream,
4269 num_events_in_wait_list, event_wait_list);
4278 hipStream, region, ptr, hipMemoryTypeHost, host_offset, host_row_pitch,
4279 host_slice_pitch, &devPtr, hipMemoryTypeDevice, buffer_offset,
4280 buffer_row_pitch, buffer_slice_pitch);
4283 retErr = retImplEv->record();
4286 if (blocking_write) {
4287 retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4291 *
event = retImplEv.release();
4301 pi_mem dst_buffer,
size_t src_offset,
4302 size_t dst_offset,
size_t size,
4306 if (!command_queue) {
4307 return PI_ERROR_INVALID_QUEUE;
4310 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4313 ScopedContext active(command_queue->
get_context());
4317 if (event_wait_list) {
4318 result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4325 result = retImplEv->start();
4331 result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream));
4334 result = retImplEv->record();
4335 *
event = retImplEv.release();
4342 return PI_ERROR_UNKNOWN;
4350 size_t dst_row_pitch,
size_t dst_slice_pitch,
4354 assert(src_buffer !=
nullptr);
4355 assert(dst_buffer !=
nullptr);
4356 assert(command_queue !=
nullptr);
4361 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4364 ScopedContext active(command_queue->
get_context());
4366 retErr = enqueueEventsWait(command_queue, hipStream,
4367 num_events_in_wait_list, event_wait_list);
4376 hipStream, region, &srcPtr, hipMemoryTypeDevice, src_origin,
4377 src_row_pitch, src_slice_pitch, &dstPtr, hipMemoryTypeDevice,
4378 dst_origin, dst_row_pitch, dst_slice_pitch);
4381 retImplEv->record();
4382 *
event = retImplEv.release();
4392 const void *pattern,
size_t pattern_size,
4393 size_t offset,
size_t size,
4397 assert(command_queue !=
nullptr);
4399 auto args_are_multiples_of_pattern_size =
4400 (offset % pattern_size == 0) || (size % pattern_size == 0);
4402 auto pattern_is_valid = (pattern !=
nullptr);
4404 auto pattern_size_is_valid =
4405 ((pattern_size & (pattern_size - 1)) == 0) &&
4406 (pattern_size > 0) && (pattern_size <= 128);
4408 assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4409 pattern_size_is_valid);
4410 (void)args_are_multiples_of_pattern_size;
4411 (void)pattern_is_valid;
4412 (void)pattern_size_is_valid;
4414 std::unique_ptr<_pi_event> retImplEv{
nullptr};
4417 ScopedContext active(command_queue->
get_context());
4421 if (event_wait_list) {
4422 result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4429 result = retImplEv->start();
4433 auto N = size / pattern_size;
4436 switch (pattern_size) {
4438 auto value = *
static_cast<const uint8_t *
>(pattern);
4439 result = PI_CHECK_ERROR(hipMemsetD8Async(dstDevice, value, N, stream));
4443 auto value = *
static_cast<const uint16_t *
>(pattern);
4444 result = PI_CHECK_ERROR(hipMemsetD16Async(dstDevice, value, N, stream));
4448 auto value = *
static_cast<const uint32_t *
>(pattern);
4449 result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream));
4465 auto number_of_steps = pattern_size /
sizeof(uint8_t);
4466 auto pitch = number_of_steps *
sizeof(uint8_t);
4467 auto height = size / number_of_steps;
4468 auto count_32 = size /
sizeof(uint32_t);
4471 auto value = *(
static_cast<const uint32_t *
>(pattern));
4473 PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream));
4474 for (
auto step = 4u; step < number_of_steps; ++step) {
4476 value = *(
static_cast<const uint8_t *
>(pattern) + step);
4479 auto offset_ptr =
reinterpret_cast<void *
>(
4480 reinterpret_cast<uint8_t *
>(dstDevice) + (step *
sizeof(uint8_t)));
4483 result = PI_CHECK_ERROR(hipMemset2DAsync(
4484 offset_ptr, pitch, value,
sizeof(uint8_t), height, stream));
4491 result = retImplEv->record();
4492 *
event = retImplEv.release();
4499 return PI_ERROR_UNKNOWN;
4504 switch (array_format) {
4505 case HIP_AD_FORMAT_UNSIGNED_INT8:
4506 case HIP_AD_FORMAT_SIGNED_INT8:
4508 case HIP_AD_FORMAT_UNSIGNED_INT16:
4509 case HIP_AD_FORMAT_SIGNED_INT16:
4510 case HIP_AD_FORMAT_HALF:
4512 case HIP_AD_FORMAT_UNSIGNED_INT32:
4513 case HIP_AD_FORMAT_SIGNED_INT32:
4514 case HIP_AD_FORMAT_FLOAT:
4530 hipStream_t hip_stream,
pi_mem_type img_type,
const size_t *region,
4531 const void *src_ptr,
const hipMemoryType src_type,
const size_t *src_offset,
4532 void *dst_ptr,
const hipMemoryType dst_type,
const size_t *dst_offset) {
4533 assert(region !=
nullptr);
4535 assert(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost);
4536 assert(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost);
4539 hip_Memcpy2D cpyDesc;
4540 memset(&cpyDesc, 0,
sizeof(cpyDesc));
4541 cpyDesc.srcMemoryType = src_type;
4542 if (src_type == hipMemoryTypeArray) {
4544 reinterpret_cast<hipCUarray
>(
const_cast<void *
>(src_ptr));
4545 cpyDesc.srcXInBytes = src_offset[0];
4546 cpyDesc.srcY = src_offset[1];
4548 cpyDesc.srcHost = src_ptr;
4550 cpyDesc.dstMemoryType = dst_type;
4551 if (dst_type == hipMemoryTypeArray) {
4553 reinterpret_cast<hipCUarray
>(
const_cast<void *
>(dst_ptr));
4554 cpyDesc.dstXInBytes = dst_offset[0];
4555 cpyDesc.dstY = dst_offset[1];
4557 cpyDesc.dstHost = dst_ptr;
4559 cpyDesc.WidthInBytes = region[0];
4560 cpyDesc.Height = region[1];
4561 return PI_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream));
4566 HIP_MEMCPY3D cpyDesc;
4567 memset(&cpyDesc, 0,
sizeof(cpyDesc));
4568 cpyDesc.srcMemoryType = src_type;
4569 if (src_type == hipMemoryTypeArray) {
4571 reinterpret_cast<hipCUarray
>(
const_cast<void *
>(src_ptr));
4572 cpyDesc.srcXInBytes = src_offset[0];
4573 cpyDesc.srcY = src_offset[1];
4574 cpyDesc.srcZ = src_offset[2];
4576 cpyDesc.srcHost = src_ptr;
4578 cpyDesc.dstMemoryType = dst_type;
4579 if (dst_type == hipMemoryTypeArray) {
4580 cpyDesc.dstArray =
reinterpret_cast<hipCUarray
>(dst_ptr);
4581 cpyDesc.dstXInBytes = dst_offset[0];
4582 cpyDesc.dstY = dst_offset[1];
4583 cpyDesc.dstZ = dst_offset[2];
4585 cpyDesc.dstHost = dst_ptr;
4587 cpyDesc.WidthInBytes = region[0];
4588 cpyDesc.Height = region[1];
4589 cpyDesc.Depth = region[2];
4590 return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream));
4591 return PI_ERROR_UNKNOWN;
4594 return PI_ERROR_INVALID_VALUE;
4598 pi_bool blocking_read,
const size_t *origin,
4599 const size_t *region,
size_t row_pitch,
4600 size_t slice_pitch,
void *ptr,
4607 assert(command_queue !=
nullptr);
4608 assert(image !=
nullptr);
4614 ScopedContext active(command_queue->
get_context());
4617 if (event_wait_list) {
4618 retErr = enqueueEventsWait(command_queue, hipStream,
4619 num_events_in_wait_list, event_wait_list);
4622 hipArray *array = image->mem_.surface_mem_.get_array();
4624 hipArray_Format Format;
4626 getArrayDesc(array, Format, NumChannels);
4630 size_t byteOffsetX = origin[0] * elementByteSize * NumChannels;
4631 size_t bytesToCopy = elementByteSize * NumChannels * region[0];
4633 pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4635 size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4636 size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4639 array, hipMemoryTypeArray, srcOffset,
4640 ptr, hipMemoryTypeHost,
nullptr);
4642 if (retErr != PI_SUCCESS) {
4648 command_queue, hipStream);
4649 new_event->record();
4653 if (blocking_read) {
4654 retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4659 return PI_ERROR_UNKNOWN;
4667 const size_t *origin,
const size_t *region,
4668 size_t input_row_pitch,
4669 size_t input_slice_pitch,
const void *ptr,
4673 (void)blocking_write;
4674 (void)input_row_pitch;
4675 (void)input_slice_pitch;
4676 assert(command_queue !=
nullptr);
4677 assert(image !=
nullptr);
4683 ScopedContext active(command_queue->
get_context());
4686 if (event_wait_list) {
4687 retErr = enqueueEventsWait(command_queue, hipStream,
4688 num_events_in_wait_list, event_wait_list);
4691 hipArray *array = image->mem_.surface_mem_.get_array();
4693 hipArray_Format Format;
4695 getArrayDesc(array, Format, NumChannels);
4699 size_t byteOffsetX = origin[0] * elementByteSize * NumChannels;
4700 size_t bytesToCopy = elementByteSize * NumChannels * region[0];
4702 pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4704 size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4705 size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4708 ptr, hipMemoryTypeHost,
nullptr, array,
4709 hipMemoryTypeArray, dstOffset);
4711 if (retErr != PI_SUCCESS) {
4717 command_queue, hipStream);
4718 new_event->record();
4724 return PI_ERROR_UNKNOWN;
4733 pi_mem dst_image,
const size_t *src_origin,
4734 const size_t *dst_origin,
4735 const size_t *region,
4748 ScopedContext active(command_queue->
get_context());
4750 if (event_wait_list) {
4751 retErr = enqueueEventsWait(command_queue, hipStream,
4752 num_events_in_wait_list, event_wait_list);
4756 hipArray_Format srcFormat;
4757 size_t srcNumChannels;
4758 getArrayDesc(srcArray, srcFormat, srcNumChannels);
4761 hipArray_Format dstFormat;
4762 size_t dstNumChannels;
4763 getArrayDesc(dstArray, dstFormat, dstNumChannels);
4765 assert(srcFormat == dstFormat);
4766 assert(srcNumChannels == dstNumChannels);
4770 size_t dstByteOffsetX = dst_origin[0] * elementByteSize * srcNumChannels;
4771 size_t srcByteOffsetX = src_origin[0] * elementByteSize * dstNumChannels;
4772 size_t bytesToCopy = elementByteSize * srcNumChannels * region[0];
4776 size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4777 size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4778 size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4781 hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray,
4782 srcOffset, dstArray, hipMemoryTypeArray, dstOffset);
4784 if (retErr != PI_SUCCESS) {
4790 command_queue, hipStream);
4791 new_event->record();
4797 return PI_ERROR_UNKNOWN;
4806 const void *fill_color,
4807 const size_t *origin,
const size_t *region,
4811 (void)command_queue;
4816 (void)num_events_in_wait_list;
4817 (void)event_wait_list;
4836 assert(ret_map !=
nullptr);
4837 assert(command_queue !=
nullptr);
4838 assert(buffer !=
nullptr);
4841 pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4854 ret_err = PI_SUCCESS;
4860 command_queue, buffer, blocking_map, offset, size, hostPtr,
4861 num_events_in_wait_list, event_wait_list, event);
4863 ScopedContext active(command_queue->
get_context());
4867 event_wait_list,
nullptr);
4897 assert(command_queue !=
nullptr);
4898 assert(mapped_ptr !=
nullptr);
4899 assert(memobj !=
nullptr);
4913 command_queue, memobj,
true,
4916 num_events_in_wait_list, event_wait_list, event);
4918 ScopedContext active(command_queue->
get_context());
4922 event_wait_list,
nullptr);
4947 assert(result_ptr !=
nullptr);
4948 assert(context !=
nullptr);
4949 assert(properties ==
nullptr || *properties == 0);
4952 ScopedContext active(context);
4953 result = PI_CHECK_ERROR(hipHostMalloc(result_ptr, size));
4959 (result == PI_SUCCESS &&
4960 reinterpret_cast<std::uintptr_t
>(*result_ptr) %
alignment == 0));
4970 assert(result_ptr !=
nullptr);
4971 assert(context !=
nullptr);
4972 assert(device !=
nullptr);
4973 assert(properties ==
nullptr || *properties == 0);
4976 ScopedContext active(context);
4977 result = PI_CHECK_ERROR(hipMalloc(result_ptr, size));
4983 (result == PI_SUCCESS &&
4984 reinterpret_cast<std::uintptr_t
>(*result_ptr) %
alignment == 0));
4994 assert(result_ptr !=
nullptr);
4995 assert(context !=
nullptr);
4996 assert(device !=
nullptr);
4997 assert(properties ==
nullptr || *properties == 0);
5000 ScopedContext active(context);
5002 PI_CHECK_ERROR(hipMallocManaged(result_ptr, size, hipMemAttachGlobal));
5008 (result == PI_SUCCESS &&
5009 reinterpret_cast<std::uintptr_t
>(*result_ptr) %
alignment == 0));
5017 assert(context !=
nullptr);
5020 ScopedContext active(context);
5022 hipPointerAttribute_t hipPointerAttributeType;
5024 PI_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5025 type = hipPointerAttributeType.memoryType;
5026 assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost);
5027 if (type == hipMemoryTypeDevice) {
5028 result = PI_CHECK_ERROR(hipFree(ptr));
5030 if (type == hipMemoryTypeHost) {
5031 result = PI_CHECK_ERROR(hipFreeHost(ptr));
5045 assert(queue !=
nullptr);
5046 assert(ptr !=
nullptr);
5048 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5055 num_events_in_waitlist, events_waitlist, guard, &stream_token);
5056 result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5063 result = PI_CHECK_ERROR(
5064 hipMemsetD8Async(
reinterpret_cast<hipDeviceptr_t
>(ptr),
5065 (
unsigned char)value & 0xFF, count, hipStream));
5067 result = event_ptr->record();
5068 *
event = event_ptr.release();
5078 void *dst_ptr,
const void *src_ptr,
5083 assert(queue !=
nullptr);
5084 assert(dst_ptr !=
nullptr);
5085 assert(src_ptr !=
nullptr);
5088 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5093 result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5100 result = PI_CHECK_ERROR(
5101 hipMemcpyAsync(dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream));
5103 result = event_ptr->record();
5106 result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
5109 *
event = event_ptr.release();
5125 return PI_ERROR_INVALID_VALUE;
5126 assert(queue !=
nullptr);
5127 assert(ptr !=
nullptr);
5129 std::unique_ptr<_pi_event> event_ptr{
nullptr};
5134 result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5141 result = PI_CHECK_ERROR(hipMemPrefetchAsync(
5144 result = event_ptr->record();
5145 *
event = event_ptr.release();
5161 assert(queue !=
nullptr);
5162 assert(ptr !=
nullptr);
5173 const void *,
size_t,
size_t,
pi_uint32,
5203 void *dst_ptr,
size_t dst_pitch,
5204 const void *src_ptr,
size_t src_pitch,
5205 size_t width,
size_t height,
5209 assert(queue !=
nullptr);
5216 result = enqueueEventsWait(queue, hipStream, num_events_in_wait_list,
5224 result = PI_CHECK_ERROR(hipMemcpy2DAsync(dst_ptr, dst_pitch, src_ptr,
5225 src_pitch, width, height,
5226 hipMemcpyDefault, hipStream));
5232 result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
5259 size_t param_value_size,
5261 size_t *param_value_size_ret) {
5263 assert(context !=
nullptr);
5264 assert(ptr !=
nullptr);
5266 hipPointerAttribute_t hipPointerAttributeType;
5269 ScopedContext active(context);
5270 switch (param_name) {
5274 hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, ptr);
5275 if (ret == hipErrorInvalidValue) {
5277 return getInfo(param_value_size, param_value, param_value_size_ret,
5280 result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5281 value = hipPointerAttributeType.isManaged;
5284 return getInfo(param_value_size, param_value, param_value_size_ret,
5287 result = PI_CHECK_ERROR(
5288 hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5289 value = hipPointerAttributeType.memoryType;
5290 assert(value == hipMemoryTypeDevice or value == hipMemoryTypeHost);
5291 if (value == hipMemoryTypeDevice) {
5293 return getInfo(param_value_size, param_value, param_value_size_ret,
5296 if (value == hipMemoryTypeHost) {
5298 return getInfo(param_value_size, param_value, param_value_size_ret,
5302 __builtin_unreachable();
5303 return getInfo(param_value_size, param_value, param_value_size_ret,
5307 return PI_ERROR_INVALID_VALUE;
5310 return PI_ERROR_INVALID_VALUE;
5315 result = PI_CHECK_ERROR(
5316 hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5317 int device_idx = hipPointerAttributeType.device;
5321 std::vector<pi_platform> platforms;
5322 platforms.resize(device_idx + 1);
5326 pi_device device = platforms[device_idx]->devices_[0].get();
5327 return getInfo(param_value_size, param_value, param_value_size_ret,
5340 pi_bool blocking_write,
size_t count,
size_t offset,
const void *src,
5346 (void)blocking_write;
5350 (void)num_events_in_wait_list;
5351 (void)event_wait_list;
5355 "hip_piextEnqueueDeviceGlobalVariableWrite not implemented");
5361 size_t count,
size_t offset,
void *dst,
pi_uint32 num_events_in_wait_list,
5366 (void)blocking_read;
5370 (void)num_events_in_wait_list;
5371 (void)event_wait_list;
5375 "hip_piextEnqueueDeviceGlobalVariableRead not implemented");
5387 (void)PluginParameter;
5392 uint64_t *HostTime) {
5393 if (!DeviceTime && !HostTime)
5398 ScopedContext active(Device->get_context());
5401 PI_CHECK_ERROR(hipEventCreateWithFlags(&event, hipEventDefault));
5402 PI_CHECK_ERROR(hipEventRecord(event));
5405 using namespace std::chrono;
5407 duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
5412 PI_CHECK_ERROR(hipEventSynchronize(event));
5414 float elapsedTime = 0.0f;
5417 *DeviceTime = (uint64_t)(elapsedTime * (
double)1e6);
5429 size_t PluginVersionSize =
sizeof(PluginInit->
PluginVersion);
5431 return PI_ERROR_INVALID_VALUE;
5440 #define _PI_CL(pi_api, hip_api) \
5441 (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&hip_api);
5577 #define __SYCL_PLUGIN_DLL_NAME "pi_hip.dll"
5578 #include "../common_win_pi_trace/common_win_pi_trace.hpp"
5579 #undef __SYCL_PLUGIN_DLL_NAME