DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_hip.cpp
1 //==---------- pi_hip.cpp - HIP Plugin ------------------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
13 
14 #include <pi_hip.hpp>
15 #include <sycl/detail/defines.hpp>
17 #include <sycl/detail/pi.hpp>
18 
19 #include <algorithm>
20 #include <cassert>
21 #include <chrono>
22 #include <hip/hip_runtime.h>
23 #include <limits>
24 #include <memory>
25 #include <mutex>
26 #include <regex>
27 #include <string.h>
28 
29 namespace {
30 // Hipify doesn't support cuArrayGetDescriptor. On AMD the hipArray can simply
31 // be indexed, but on NVIDIA it is an opaque type and needs to go through
32 // cuArrayGetDescriptor, so implement a utility function to get the array
33 // properties.
34 inline void getArrayDesc(hipArray *array, hipArray_Format &format,
35  size_t &channels) {
36 #if defined(__HIP_PLATFORM_AMD__)
37  format = array->Format;
38  channels = array->NumChannels;
39 #elif defined(__HIP_PLATFORM_NVIDIA__)
40  CUDA_ARRAY_DESCRIPTOR arrayDesc;
41  cuArrayGetDescriptor(&arrayDesc, (CUarray)array);
42 
43  format = arrayDesc.Format;
44  channels = arrayDesc.NumChannels;
45 #else
46 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
47 #endif
48 }
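
// Illustrative usage sketch (not part of the plugin): querying the element
// layout of an existing hipArray with the helper above. `array` here is a
// hypothetical, previously created allocation.
//
//   hipArray_Format format;
//   size_t channels;
//   getArrayDesc(array, format, channels);
//   // e.g. HIP_AD_FORMAT_FLOAT with 4 channels describes a float4 element.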
49 
50 // NVIDIA HIP headers guard hipArray3DCreate behind __CUDACC__. This does not
51 // seem to be required, and we're not using nvcc to build the HIP PI plugin, so
52 // add the translation function here.
53 #if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__)
54 inline static hipError_t
55 hipArray3DCreate(hiparray *pHandle,
56  const HIP_ARRAY3D_DESCRIPTOR *pAllocateArray) {
57  return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
58 }
59 #endif
60 
61 // hipArray gets turned into cudaArray when using the HIP NVIDIA platform. Some
62 // CUDA APIs use cudaArray* and others use CUarray; these represent the same
63 // type, but when building, cudaArray appears as an opaque type, so it needs to
64 // be explicitly cast to CUarray. In order for this to work for both AMD and
65 // NVIDIA, we introduce a second hipArray type that will be CUarray for NVIDIA
66 // and hipArray* for AMD, so that we can place the explicit casts where
67 // necessary for NVIDIA and they will be no-ops for AMD.
68 #if defined(__HIP_PLATFORM_NVIDIA__)
69 typedef CUarray hipCUarray;
70 #elif defined(__HIP_PLATFORM_AMD__)
71 typedef hipArray *hipCUarray;
72 #else
73 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
74 #endif
75 
76 // Add missing HIP to CUDA defines
77 #if defined(__HIP_PLATFORM_NVIDIA__)
78 #define hipMemoryType CUmemorytype
79 #define hipMemoryTypeHost CU_MEMORYTYPE_HOST
80 #define hipMemoryTypeDevice CU_MEMORYTYPE_DEVICE
81 #define hipMemoryTypeArray CU_MEMORYTYPE_ARRAY
82 #define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED
83 #endif
84 
85 std::string getHipVersionString() {
86  int driver_version = 0;
87  if (hipDriverGetVersion(&driver_version) != hipSuccess) {
88  return "";
89  }
90  // The version is returned as (1000 major + 10 minor).
91  std::stringstream stream;
92  stream << "HIP " << driver_version / 1000 << "."
93  << driver_version % 1000 / 10;
94  return stream.str();
95 }
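
// For example, assuming the driver reports 5020 (i.e. HIP 5.2 under the
// encoding above), this returns the string "HIP 5.2"; if hipDriverGetVersion
// fails, an empty string is returned.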
96 
97 pi_result map_error(hipError_t result) {
98  switch (result) {
99  case hipSuccess:
100  return PI_SUCCESS;
101  case hipErrorInvalidContext:
102  return PI_ERROR_INVALID_CONTEXT;
103  case hipErrorInvalidDevice:
104  return PI_ERROR_INVALID_DEVICE;
105  case hipErrorInvalidValue:
106  return PI_ERROR_INVALID_VALUE;
107  case hipErrorOutOfMemory:
108  return PI_ERROR_OUT_OF_HOST_MEMORY;
109  case hipErrorLaunchOutOfResources:
110  return PI_ERROR_OUT_OF_RESOURCES;
111  default:
112  return PI_ERROR_UNKNOWN;
113  }
114 }
115 
116 // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR
117 constexpr size_t MaxMessageSize = 256;
118 thread_local pi_result ErrorMessageCode = PI_SUCCESS;
119 thread_local char ErrorMessage[MaxMessageSize];
120 
121 // Utility function for setting a message and warning
122 [[maybe_unused]] static void setErrorMessage(const char *message,
123  pi_result error_code) {
124  assert(strlen(message) < MaxMessageSize);
125  strcpy(ErrorMessage, message);
126  ErrorMessageCode = error_code;
127 }
128 
129 // Returns plugin specific error and warning messages
130 pi_result hip_piPluginGetLastError(char **message) {
131  *message = &ErrorMessage[0];
132  return ErrorMessageCode;
133 }
134 
135 // Iterates over the event wait list, returns correct pi_result error codes.
136 // Invokes the callback for the latest event of each queue in the wait list.
137 // The callback must take a single pi_event argument and return a pi_result.
138 template <typename Func>
139 pi_result forLatestEvents(const pi_event *event_wait_list,
140  std::size_t num_events_in_wait_list, Func &&f) {
141 
142  if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
143  return PI_ERROR_INVALID_EVENT_WAIT_LIST;
144  }
145 
146  // Fast path if we only have a single event
147  if (num_events_in_wait_list == 1) {
148  return f(event_wait_list[0]);
149  }
150 
151  std::vector<pi_event> events{event_wait_list,
152  event_wait_list + num_events_in_wait_list};
153  std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
154  // Tiered sort creating sublists of streams (smallest value first) in which
155  // the corresponding events are sorted into a sequence of newest first.
156  return e0->get_stream() < e1->get_stream() ||
157  (e0->get_stream() == e1->get_stream() &&
158  e0->get_event_id() > e1->get_event_id());
159  });
160 
161  bool first = true;
162  hipStream_t lastSeenStream = 0;
163  for (pi_event event : events) {
164  if (!event || (!first && event->get_stream() == lastSeenStream)) {
165  continue;
166  }
167 
168  first = false;
169  lastSeenStream = event->get_stream();
170 
171  auto result = f(event);
172  if (result != PI_SUCCESS) {
173  return result;
174  }
175  }
176 
177  return PI_SUCCESS;
178 }
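
// Worked example (hypothetical values): given a wait list with events
// {A:id=3, B:id=1, A:id=7} where A and B are streams, the sort above yields
// {A:7, A:3, B:1} and the callback is invoked only for A:7 and B:1, i.e. the
// latest event of each stream.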
179 
187 pi_result check_error(hipError_t result, const char *function, int line,
188  const char *file) {
189  if (result == hipSuccess) {
190  return PI_SUCCESS;
191  }
192 
193  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) {
194  const char *errorString = nullptr;
195  const char *errorName = nullptr;
196  errorName = hipGetErrorName(result);
197  errorString = hipGetErrorString(result);
198  std::stringstream ss;
199  ss << "\nPI HIP ERROR:"
200  << "\n\tValue: " << result
201  << "\n\tName: " << errorName
202  << "\n\tDescription: " << errorString
203  << "\n\tFunction: " << function << "\n\tSource Location: " << file
204  << ":" << line << "\n"
205  << std::endl;
206  std::cerr << ss.str();
207  }
208 
209  if (std::getenv("PI_HIP_ABORT") != nullptr) {
210  std::abort();
211  }
212 
213  throw map_error(result);
214 }
215 
217 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
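
// Illustrative sketch: PI_CHECK_ERROR returns PI_SUCCESS for hipSuccess and
// otherwise throws the mapped pi_result, so callers typically wrap it in a
// try/catch as the entry points below do. `stream` here is hypothetical.
//
//   try {
//     PI_CHECK_ERROR(hipStreamSynchronize(stream));
//   } catch (pi_result err) {
//     return err;
//   }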
218 
225 class ScopedContext {
226  pi_context placedContext_;
227  hipCtx_t original_;
228  bool needToRecover_;
229 
230 public:
231  ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} {
232 
233  if (!placedContext_) {
234  throw PI_ERROR_INVALID_CONTEXT;
235  }
236 
237  hipCtx_t desired = placedContext_->get();
238  PI_CHECK_ERROR(hipCtxGetCurrent(&original_));
239  if (original_ != desired) {
240  // Sets the desired context as the active one for the thread
241  PI_CHECK_ERROR(hipCtxSetCurrent(desired));
242  if (original_ == nullptr) {
243  // No context is installed on the current thread
244  // This is the most common case. We can activate the context in the
245  // thread and leave it there until all the PI context referring to the
246  // same underlying HIP context are destroyed. This emulates
247  // the behaviour of the HIP runtime api, and avoids costly context
248  // switches. No action is required on this side of the if.
249  } else {
250  needToRecover_ = true;
251  }
252  }
253  }
254 
255  ~ScopedContext() {
256  if (needToRecover_) {
257  PI_CHECK_ERROR(hipCtxSetCurrent(original_));
258  }
259  }
260 };
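
// Illustrative sketch: activating a PI context for the duration of a scope.
// `context` is a hypothetical, valid pi_context.
//
//   {
//     ScopedContext active(context);
//     PI_CHECK_ERROR(hipDeviceSynchronize());
//   } // If a different HIP context was current before, it is restored here.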
261 
263 template <typename T, typename Assign>
264 pi_result getInfoImpl(size_t param_value_size, void *param_value,
265  size_t *param_value_size_ret, T value, size_t value_size,
266  Assign &&assign_func) {
267 
268  if (param_value != nullptr) {
269 
270  if (param_value_size < value_size) {
271  return PI_ERROR_INVALID_VALUE;
272  }
273 
274  assign_func(param_value, value, value_size);
275  }
276 
277  if (param_value_size_ret != nullptr) {
278  *param_value_size_ret = value_size;
279  }
280 
281  return PI_SUCCESS;
282 }
283 
284 template <typename T>
285 pi_result getInfo(size_t param_value_size, void *param_value,
286  size_t *param_value_size_ret, T value) {
287 
288  auto assignment = [](void *param_value, T value, size_t value_size) {
289  (void)value_size;
290  *static_cast<T *>(param_value) = value;
291  };
292 
293  return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
294  sizeof(T), std::move(assignment));
295 }
296 
297 template <typename T>
298 pi_result getInfoArray(size_t array_length, size_t param_value_size,
299  void *param_value, size_t *param_value_size_ret,
300  T *value) {
301 
302  auto assignment = [](void *param_value, T *value, size_t value_size) {
303  memcpy(param_value, static_cast<const void *>(value), value_size);
304  };
305 
306  return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
307  array_length * sizeof(T), std::move(assignment));
308 }
309 
310 template <>
311 pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
312  size_t *param_value_size_ret,
313  const char *value) {
314  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
315  param_value_size_ret, value);
316 }
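
// Illustrative sketch of the OpenCL-style two-call query pattern these helpers
// implement; `value` and `buffer` are hypothetical.
//
//   size_t size = 0;
//   getInfo(0, nullptr, &size, value);      // first call: query the size only
//   getInfo(size, buffer, nullptr, value);  // second call: copy out the value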
317 
318 int getAttribute(pi_device device, hipDeviceAttribute_t attribute) {
319  int value;
320  sycl::detail::pi::assertion(
321  hipDeviceGetAttribute(&value, attribute, device->get()) == hipSuccess);
322  return value;
323 }
325 
326 void simpleGuessLocalWorkSize(size_t *threadsPerBlock,
327  const size_t *global_work_size,
328  const size_t maxThreadsPerBlock[3],
329  pi_kernel kernel) {
330  assert(threadsPerBlock != nullptr);
331  assert(global_work_size != nullptr);
332  assert(kernel != nullptr);
333  // int recommendedBlockSize, minGrid;
334 
335  // PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
336  // &minGrid, &recommendedBlockSize, kernel->get(),
337  // 0, 0));
338 
339  //(void)minGrid; // Not used, avoid warnings
340 
341  threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]);
342 
343  // Find a local work group size that is a divisor of the global
344  // work group size to produce uniform work groups.
345  while (0u != (global_work_size[0] % threadsPerBlock[0])) {
346  --threadsPerBlock[0];
347  }
348 }
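
// Worked example (hypothetical sizes): with global_work_size[0] == 1000 and
// maxThreadsPerBlock[0] == 256, the loop starts at 256 and decrements to 250,
// the largest divisor of 1000 that does not exceed the block-size limit.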
349 
350 pi_result enqueueEventsWait(pi_queue command_queue, hipStream_t stream,
351  pi_uint32 num_events_in_wait_list,
352  const pi_event *event_wait_list) {
353  if (!event_wait_list) {
354  return PI_SUCCESS;
355  }
356  try {
357  ScopedContext active(command_queue->get_context());
358 
359  auto result = forLatestEvents(
360  event_wait_list, num_events_in_wait_list,
361  [stream](pi_event event) -> pi_result {
362  if (event->get_stream() == stream) {
363  return PI_SUCCESS;
364  } else {
365  return PI_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0));
366  }
367  });
368 
369  if (result != PI_SUCCESS) {
370  return result;
371  }
372  return PI_SUCCESS;
373  } catch (pi_result err) {
374  return err;
375  } catch (...) {
376  return PI_ERROR_UNKNOWN;
377  }
378 }
379 
380 } // anonymous namespace
381 
383 namespace sycl {
384 __SYCL_INLINE_VER_NAMESPACE(_V1) {
385 namespace detail {
386 namespace pi {
387 
388 // Report error and no return (keeps compiler from printing warnings).
389 // TODO: Probably change that to throw a catchable exception,
390 // but for now it is useful to see every failure.
391 //
392 [[noreturn]] void die(const char *Message) {
393  std::cerr << "pi_die: " << Message << std::endl;
394  std::terminate();
395 }
396 
397 // Reports error messages
398 void hipPrint(const char *Message) {
399  std::cerr << "pi_print: " << Message << std::endl;
400 }
401 
402 void assertion(bool Condition, const char *Message) {
403  if (!Condition)
404  die(Message);
405 }
406 
407 } // namespace pi
408 } // namespace detail
409 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
410 } // namespace sycl
411 
412 //--------------
413 // PI object implementation
414 
415 extern "C" {
416 
417 // Required in a number of functions, so forward declare here
418 pi_result hip_piEnqueueEventsWait(pi_queue command_queue,
419  pi_uint32 num_events_in_wait_list,
420  const pi_event *event_wait_list,
421  pi_event *event);
422 pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
423  pi_uint32 num_events_in_wait_list,
424  const pi_event *event_wait_list,
425  pi_event *event);
428 
429 } // extern "C"
430 
432 
433 void _pi_queue::compute_stream_wait_for_barrier(hipStream_t stream,
434  pi_uint32 stream_i) {
435  if (barrier_event_ && !compute_applied_barrier_[stream_i]) {
436  PI_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0));
437  compute_applied_barrier_[stream_i] = true;
438  }
439 }
440 
441 void _pi_queue::transfer_stream_wait_for_barrier(hipStream_t stream,
442  pi_uint32 stream_i) {
443  if (barrier_event_ && !transfer_applied_barrier_[stream_i]) {
444  PI_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0));
445  transfer_applied_barrier_[stream_i] = true;
446  }
447 }
448 
449 hipStream_t _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) {
450  pi_uint32 stream_i;
451  pi_uint32 token;
452  while (true) {
453  if (num_compute_streams_ < compute_streams_.size()) {
454  // the check above is for performance - so as not to lock mutex every time
455  std::lock_guard<std::mutex> guard(compute_stream_mutex_);
456  // The second check is done after mutex is locked so other threads can not
457  // change num_compute_streams_ after that
458  if (num_compute_streams_ < compute_streams_.size()) {
459  PI_CHECK_ERROR(hipStreamCreateWithFlags(
460  &compute_streams_[num_compute_streams_++], hipStreamNonBlocking));
461  }
462  }
463  token = compute_stream_idx_++;
464  stream_i = token % compute_streams_.size();
465  // if a stream has been reused before it was next selected round-robin
466  // fashion, we want to delay its next use and instead select another one
467  // that is more likely to have completed all the enqueued work.
468  if (delay_compute_[stream_i]) {
469  delay_compute_[stream_i] = false;
470  } else {
471  break;
472  }
473  }
474  if (stream_token) {
475  *stream_token = token;
476  }
477  hipStream_t res = compute_streams_[stream_i];
478  compute_stream_wait_for_barrier(res, stream_i);
479  return res;
480 }
481 
482 hipStream_t _pi_queue::get_next_compute_stream(
483  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
484  _pi_stream_guard &guard, pi_uint32 *stream_token) {
485  for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
486  pi_uint32 token = event_wait_list[i]->get_compute_stream_token();
487  if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) {
488  std::unique_lock<std::mutex> compute_sync_guard(
490  // redo the check after lock to avoid data races on
491  // last_sync_compute_streams_
492  if (can_reuse_stream(token)) {
493  pi_uint32 stream_i = token % delay_compute_.size();
494  delay_compute_[stream_i] = true;
495  if (stream_token) {
496  *stream_token = token;
497  }
498  guard = _pi_stream_guard{std::move(compute_sync_guard)};
499  hipStream_t res = event_wait_list[i]->get_stream();
500  compute_stream_wait_for_barrier(res, stream_i);
501  return res;
502  }
503  }
504  }
505  guard = {};
506  return get_next_compute_stream(stream_token);
507 }
508 
509 hipStream_t _pi_queue::get_next_transfer_stream() {
510  if (transfer_streams_.empty()) { // for example in in-order queue
511  return get_next_compute_stream();
512  }
513  if (num_transfer_streams_ < transfer_streams_.size()) {
514  // the check above is for performance - so as not to lock mutex every time
515  std::lock_guard<std::mutex> guard(transfer_stream_mutex_);
516  // The second check is done after mutex is locked so other threads can not
517  // change num_transfer_streams_ after that
518  if (num_transfer_streams_ < transfer_streams_.size()) {
519  PI_CHECK_ERROR(hipStreamCreateWithFlags(
520  &transfer_streams_[num_transfer_streams_++], hipStreamNonBlocking));
521  }
522  }
523  pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size();
524  hipStream_t res = transfer_streams_[stream_i];
525  transfer_stream_wait_for_barrier(res, stream_i);
526  return res;
527 }
528 
529 _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue,
530  hipStream_t stream, pi_uint32 stream_token)
531  : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false},
532  isRecorded_{false}, isStarted_{false},
533  streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr},
534  evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} {
535 
536  assert(type != PI_COMMAND_TYPE_USER);
537 
538  bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE;
539 
540  PI_CHECK_ERROR(hipEventCreateWithFlags(
541  &evEnd_, profilingEnabled ? hipEventDefault : hipEventDisableTiming));
542 
543  if (profilingEnabled) {
544  PI_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault));
545  PI_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault));
546  }
547 
548  if (queue_ != nullptr) {
549  hip_piQueueRetain(queue_);
550  }
551  hip_piContextRetain(context_);
552 }
553 
554 _pi_event::~_pi_event() {
555  if (queue_ != nullptr) {
556  hip_piQueueRelease(queue_);
557  }
558  hip_piContextRelease(context_);
559 }
560 
561 pi_result _pi_event::start() {
562  assert(!is_started());
563  pi_result result = PI_SUCCESS;
564 
565  try {
566  if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) {
567  // NOTE: This relies on the default stream to be unused.
568  PI_CHECK_ERROR(hipEventRecord(evQueued_, 0));
569  PI_CHECK_ERROR(hipEventRecord(evStart_, queue_->get()));
570  }
571  } catch (pi_result error) {
572  result = error;
573  }
574 
575  isStarted_ = true;
576  return result;
577 }
578 
579 bool _pi_event::is_completed() const noexcept {
580  if (!isRecorded_) {
581  return false;
582  }
583  if (!hasBeenWaitedOn_) {
584  const hipError_t ret = hipEventQuery(evEnd_);
585  if (ret != hipSuccess && ret != hipErrorNotReady) {
586  PI_CHECK_ERROR(ret);
587  return false;
588  }
589  if (ret == hipErrorNotReady) {
590  return false;
591  }
592  }
593  return true;
594 }
595 
597  float miliSeconds = 0.0f;
598  assert(is_started());
599 
600  PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_));
601  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
602 }
603 
605  float miliSeconds = 0.0f;
606  assert(is_started());
607 
608  PI_CHECK_ERROR(
609  hipEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evStart_));
610  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
611 }
612 
614  float miliSeconds = 0.0f;
615  assert(is_started() && is_recorded());
616 
617  PI_CHECK_ERROR(
618  hipEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evEnd_));
619  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
620 }
621 
622 pi_result _pi_event::record() {
623 
624  if (is_recorded() || !is_started()) {
625  return PI_ERROR_INVALID_EVENT;
626  }
627 
628  pi_result result = PI_ERROR_INVALID_OPERATION;
629 
630  if (!queue_) {
631  return PI_ERROR_INVALID_QUEUE;
632  }
633 
634  try {
635  eventId_ = queue_->get_next_event_id();
636  if (eventId_ == 0) {
638  "Unrecoverable program state reached in event identifier overflow");
639  }
640  result = PI_CHECK_ERROR(hipEventRecord(evEnd_, stream_));
641  } catch (pi_result error) {
642  result = error;
643  }
644 
645  if (result == PI_SUCCESS) {
646  isRecorded_ = true;
647  }
648 
649  return result;
650 }
651 
652 pi_result _pi_event::wait() {
653  pi_result retErr;
654  try {
655  retErr = PI_CHECK_ERROR(hipEventSynchronize(evEnd_));
656  hasBeenWaitedOn_ = true;
657  } catch (pi_result error) {
658  retErr = error;
659  }
660 
661  return retErr;
662 }
663 
664 pi_result _pi_event::release() {
665  assert(queue_ != nullptr);
666  PI_CHECK_ERROR(hipEventDestroy(evEnd_));
667 
668  if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) {
669  PI_CHECK_ERROR(hipEventDestroy(evQueued_));
670  PI_CHECK_ERROR(hipEventDestroy(evStart_));
671  }
672 
673  return PI_SUCCESS;
674 }
675 
676 // Makes all future work submitted to queue wait for all work captured in event.
677 pi_result enqueueEventWait(pi_queue queue, pi_event event) {
678  // For native events, the hipStreamWaitEvent call is used.
679  // This makes all future work submitted to stream wait for all
680  // work captured in event.
681  queue->for_each_stream([e = event->get()](hipStream_t s) {
682  PI_CHECK_ERROR(hipStreamWaitEvent(s, e, 0));
683  });
684  return PI_SUCCESS;
685 }
686 
687 _pi_program::_pi_program(pi_context ctxt)
688  : module_{nullptr}, binary_{},
689  binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} {
690  hip_piContextRetain(context_);
691 }
692 
693 _pi_program::~_pi_program() { hip_piContextRelease(context_); }
694 
695 pi_result _pi_program::set_binary(const char *source, size_t length) {
696  assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
697  "Re-setting program binary data which has already been set");
698  binary_ = source;
699  binarySizeInBytes_ = length;
700  return PI_SUCCESS;
701 }
702 
703 pi_result _pi_program::build_program(const char *build_options) {
704 
705  this->buildOptions_ = build_options;
706 
707  constexpr const unsigned int numberOfOptions = 4u;
708 
709  hipJitOption options[numberOfOptions];
710  void *optionVals[numberOfOptions];
711 
712  // Pass a buffer for info messages
713  options[0] = hipJitOptionInfoLogBuffer;
714  optionVals[0] = (void *)infoLog_;
715  // Pass the size of the info buffer
716  options[1] = hipJitOptionInfoLogBufferSizeBytes;
717  optionVals[1] = (void *)(long)MAX_LOG_SIZE;
718  // Pass a buffer for error message
719  options[2] = hipJitOptionErrorLogBuffer;
720  optionVals[2] = (void *)errorLog_;
721  // Pass the size of the error buffer
722  options[3] = hipJitOptionErrorLogBufferSizeBytes;
723  optionVals[3] = (void *)(long)MAX_LOG_SIZE;
724 
725  auto result = PI_CHECK_ERROR(
726  hipModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
727  numberOfOptions, options, optionVals));
728 
729  const auto success = (result == PI_SUCCESS);
730 
731  buildStatus_ =
732  success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR;
733 
734  // If no exception, result is correct
735  return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
736 }
737 
743 std::string getKernelNames(pi_program program) {
744  (void)program;
745  sycl::detail::pi::die("getKernelNames not implemented");
746  return {};
747 }
748 
753 template <typename T> class ReleaseGuard {
754 private:
755  T Captive;
756 
757  static pi_result callRelease(pi_device Captive) {
758  return hip_piDeviceRelease(Captive);
759  }
760 
761  static pi_result callRelease(pi_context Captive) {
762  return hip_piContextRelease(Captive);
763  }
764 
765  static pi_result callRelease(pi_mem Captive) {
766  return hip_piMemRelease(Captive);
767  }
768 
769  static pi_result callRelease(pi_program Captive) {
770  return hip_piProgramRelease(Captive);
771  }
772 
773  static pi_result callRelease(pi_kernel Captive) {
774  return hip_piKernelRelease(Captive);
775  }
776 
777  static pi_result callRelease(pi_queue Captive) {
778  return hip_piQueueRelease(Captive);
779  }
780 
781  static pi_result callRelease(pi_event Captive) {
782  return hip_piEventRelease(Captive);
783  }
784 
785 public:
786  ReleaseGuard() = delete;
788  explicit ReleaseGuard(T Obj) : Captive(Obj) {}
789  ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) {
790  Other.Captive = nullptr;
791  }
792 
793  ReleaseGuard(const ReleaseGuard &) = delete;
794 
797  ~ReleaseGuard() {
798  if (Captive != nullptr) {
799  pi_result ret = callRelease(Captive);
800  if (ret != PI_SUCCESS) {
801  // A reported HIP error is either an implementation or an asynchronous
802  // HIP error for which it is unclear if the function that reported it
803  // succeeded or not. Either way, the state of the program is compromised
804  // and likely unrecoverable.
806  "Unrecoverable program state reached in hip_piMemRelease");
807  }
808  }
809  }
810 
811  ReleaseGuard &operator=(const ReleaseGuard &) = delete;
812 
813  ReleaseGuard &operator=(ReleaseGuard &&Other) {
814  Captive = Other.Captive;
815  Other.Captive = nullptr;
816  return *this;
817  }
818 
821  void dismiss() { Captive = nullptr; }
822 };
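
// Illustrative sketch: guarding a retained object while constructing something
// that may throw, then dismissing the guard once ownership is handed off.
// `context` is a hypothetical pi_context that was retained by the caller.
//
//   ReleaseGuard<pi_context> guard(context);
//   // ... work that may throw; the guard releases `context` on unwind ...
//   guard.dismiss(); // success: keep the reference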
823 
824 //-- PI API implementation
825 extern "C" {
826 
835 pi_result hip_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
836  pi_uint32 *num_platforms) {
837 
838  try {
839  static std::once_flag initFlag;
840  static pi_uint32 numPlatforms = 1;
841  static std::vector<_pi_platform> platformIds;
842 
843  if (num_entries == 0 and platforms != nullptr) {
844  return PI_ERROR_INVALID_VALUE;
845  }
846  if (platforms == nullptr and num_platforms == nullptr) {
847  return PI_ERROR_INVALID_VALUE;
848  }
849 
850  pi_result err = PI_SUCCESS;
851 
852  std::call_once(
853  initFlag,
854  [](pi_result &err) {
855  if (hipInit(0) != hipSuccess) {
856  numPlatforms = 0;
857  return;
858  }
859  int numDevices = 0;
860  hipError_t hipErrorCode = hipGetDeviceCount(&numDevices);
861  if (hipErrorCode == hipErrorNoDevice) {
862  numPlatforms = 0;
863  return;
864  }
865  err = PI_CHECK_ERROR(hipErrorCode);
866  if (numDevices == 0) {
867  numPlatforms = 0;
868  return;
869  }
870  try {
871  numPlatforms = numDevices;
872  platformIds.resize(numDevices);
873 
874  for (int i = 0; i < numDevices; ++i) {
875  hipDevice_t device;
876  err = PI_CHECK_ERROR(hipDeviceGet(&device, i));
877  platformIds[i].devices_.emplace_back(
878  new _pi_device{device, &platformIds[i]});
879  }
880  } catch (const std::bad_alloc &) {
881  // Signal out-of-memory situation
882  for (int i = 0; i < numDevices; ++i) {
883  platformIds[i].devices_.clear();
884  }
885  platformIds.clear();
886  err = PI_ERROR_OUT_OF_HOST_MEMORY;
887  } catch (...) {
888  // Clear and rethrow to allow retry
889  for (int i = 0; i < numDevices; ++i) {
890  platformIds[i].devices_.clear();
891  }
892  platformIds.clear();
893  throw;
894  }
895  },
896  err);
897 
898  if (num_platforms != nullptr) {
899  *num_platforms = numPlatforms;
900  }
901 
902  if (platforms != nullptr) {
903  for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
904  platforms[i] = &platformIds[i];
905  }
906  }
907 
908  return err;
909  } catch (pi_result err) {
910  return err;
911  } catch (...) {
912  return PI_ERROR_OUT_OF_RESOURCES;
913  }
914 }
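
// Illustrative sketch of how the SYCL runtime typically drives this entry
// point (the usual two-call idiom):
//
//   pi_uint32 count = 0;
//   hip_piPlatformsGet(0, nullptr, &count);               // query the count
//   std::vector<pi_platform> platforms(count);
//   hip_piPlatformsGet(count, platforms.data(), nullptr); // fetch the handles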
915 
916 pi_result hip_piPlatformGetInfo(pi_platform platform,
917  pi_platform_info param_name,
918  size_t param_value_size, void *param_value,
919  size_t *param_value_size_ret) {
920  assert(platform != nullptr);
921 
922  switch (param_name) {
923  case PI_PLATFORM_INFO_NAME:
924  return getInfo(param_value_size, param_value, param_value_size_ret,
925  "AMD HIP BACKEND");
926  case PI_PLATFORM_INFO_VENDOR:
927  return getInfo(param_value_size, param_value, param_value_size_ret,
928  "AMD Corporation");
929  case PI_PLATFORM_INFO_PROFILE:
930  return getInfo(param_value_size, param_value, param_value_size_ret,
931  "FULL PROFILE");
932  case PI_PLATFORM_INFO_VERSION: {
933  auto version = getHipVersionString();
934  return getInfo(param_value_size, param_value, param_value_size_ret,
935  version.c_str());
936  }
937  case PI_PLATFORM_INFO_EXTENSIONS: {
938  return getInfo(param_value_size, param_value, param_value_size_ret, "");
939  }
940  default:
941  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
942  }
943  sycl::detail::pi::die("Platform info request not implemented");
944  return {};
945 }
946 
952 pi_result hip_piDevicesGet(pi_platform platform, pi_device_type device_type,
953  pi_uint32 num_entries, pi_device *devices,
954  pi_uint32 *num_devices) {
955 
956  pi_result err = PI_SUCCESS;
957  const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT;
958  const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU;
959  const bool returnDevices = askingForDefault || askingForGPU;
960 
961  size_t numDevices = returnDevices ? platform->devices_.size() : 0;
962 
963  try {
964  if (num_devices) {
965  *num_devices = numDevices;
966  }
967 
968  if (returnDevices && devices) {
969  for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) {
970  devices[i] = platform->devices_[i].get();
971  }
972  }
973 
974  return err;
975  } catch (pi_result err) {
976  return err;
977  } catch (...) {
978  return PI_ERROR_OUT_OF_RESOURCES;
979  }
980 }
981 
984 pi_result hip_piDeviceRetain(pi_device device) {
985  (void)device;
986  return PI_SUCCESS;
987 }
988 
989 pi_result hip_piContextGetInfo(pi_context context, pi_context_info param_name,
990  size_t param_value_size, void *param_value,
991  size_t *param_value_size_ret) {
992 
993  switch (param_name) {
994  case PI_CONTEXT_INFO_NUM_DEVICES:
995  return getInfo(param_value_size, param_value, param_value_size_ret, 1);
996  case PI_CONTEXT_INFO_DEVICES:
997  return getInfo(param_value_size, param_value, param_value_size_ret,
998  context->get_device());
999  case PI_CONTEXT_INFO_REFERENCE_COUNT:
1000  return getInfo(param_value_size, param_value, param_value_size_ret,
1001  context->get_reference_count());
1003  return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1004  true);
1007  // 2D USM operations currently not supported.
1008  return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1009  false);
1014  // These queries should be dealt with in context_impl.cpp by calling the
1015  // queries of each device separately and building the intersection set.
1016  setErrorMessage("These queries should have never come here.",
1017  PI_ERROR_INVALID_ARG_VALUE);
1018  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
1019  }
1020  default:
1021  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
1022  }
1023 
1024  return PI_ERROR_OUT_OF_RESOURCES;
1025 }
1026 
1027 pi_result hip_piContextRetain(pi_context context) {
1028  assert(context != nullptr);
1029  assert(context->get_reference_count() > 0);
1030 
1031  context->increment_reference_count();
1032  return PI_SUCCESS;
1033 }
1034 
1035 pi_result hip_piextContextSetExtendedDeleter(
1036  pi_context context, pi_context_extended_deleter function, void *user_data) {
1037  context->set_extended_deleter(function, user_data);
1038  return PI_SUCCESS;
1039 }
1040 
1043 pi_result hip_piDevicePartition(pi_device device,
1044  const pi_device_partition_property *properties,
1045  pi_uint32 num_devices, pi_device *out_devices,
1046  pi_uint32 *out_num_devices) {
1047  (void)device;
1048  (void)properties;
1049  (void)num_devices;
1050  (void)out_devices;
1051  (void)out_num_devices;
1052 
1053  return PI_ERROR_INVALID_OPERATION;
1054 }
1055 
1058 pi_result hip_piextDeviceSelectBinary(pi_device device,
1059  pi_device_binary *binaries,
1060  pi_uint32 num_binaries,
1061  pi_uint32 *selected_binary) {
1062  (void)device;
1063  if (!binaries) {
1064  sycl::detail::pi::die("No list of device images provided");
1065  }
1066  if (num_binaries < 1) {
1067  sycl::detail::pi::die("No binary images in the list");
1068  }
1069 
1070  // Look for an image for the HIP target, and return the first one that is
1071  // found
1072 #if defined(__HIP_PLATFORM_AMD__)
1073  const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN;
1074 #elif defined(__HIP_PLATFORM_NVIDIA__)
1075  const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
1076 #else
1077 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
1078 #endif
1079 
1080  for (pi_uint32 i = 0; i < num_binaries; i++) {
1081  if (strcmp(binaries[i]->DeviceTargetSpec, binary_type) == 0) {
1082  *selected_binary = i;
1083  return PI_SUCCESS;
1084  }
1085  }
1086 
1087  // No image can be loaded for the given device
1088  return PI_ERROR_INVALID_BINARY;
1089 }
1090 
1091 pi_result hip_piextGetDeviceFunctionPointer(pi_device device,
1092  pi_program program,
1093  const char *func_name,
1094  pi_uint64 *func_pointer_ret) {
1095  // Check if device passed is the same the device bound to the context
1096  assert(device == program->get_context()->get_device());
1097  assert(func_pointer_ret != nullptr);
1098 
1099  hipFunction_t func;
1100  hipError_t ret = hipModuleGetFunction(&func, program->get(), func_name);
1101  *func_pointer_ret = reinterpret_cast<pi_uint64>(func);
1102  pi_result retError = PI_SUCCESS;
1103 
1104  if (ret != hipSuccess && ret != hipErrorNotFound)
1105  retError = PI_CHECK_ERROR(ret);
1106  if (ret == hipErrorNotFound) {
1107  *func_pointer_ret = 0;
1108  retError = PI_ERROR_INVALID_KERNEL_NAME;
1109  }
1110 
1111  return retError;
1112 }
1113 
1116 pi_result hip_piDeviceRelease(pi_device device) {
1117  (void)device;
1118  return PI_SUCCESS;
1119 }
1120 
1121 pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
1122  size_t param_value_size, void *param_value,
1123  size_t *param_value_size_ret) {
1124 
1125  static constexpr pi_uint32 max_work_item_dimensions = 3u;
1126 
1127  assert(device != nullptr);
1128 
1129  switch (param_name) {
1130  case PI_DEVICE_INFO_TYPE: {
1131  return getInfo(param_value_size, param_value, param_value_size_ret,
1133  }
1134  case PI_DEVICE_INFO_VENDOR_ID: {
1135 #if defined(__HIP_PLATFORM_AMD__)
1136  pi_uint32 vendor_id = 4098u;
1137 #elif defined(__HIP_PLATFORM_NVIDIA__)
1138  pi_uint32 vendor_id = 4318u;
1139 #else
1140  pi_uint32 vendor_id = 0u;
1141 #endif
1142 
1143  return getInfo(param_value_size, param_value, param_value_size_ret,
1144  vendor_id);
1145  }
1147  int compute_units = 0;
1149  hipDeviceGetAttribute(&compute_units,
1150  hipDeviceAttributeMultiprocessorCount,
1151  device->get()) == hipSuccess);
1152  sycl::detail::pi::assertion(compute_units >= 0);
1153  return getInfo(param_value_size, param_value, param_value_size_ret,
1154  pi_uint32(compute_units));
1155  }
1157  return getInfo(param_value_size, param_value, param_value_size_ret,
1158  max_work_item_dimensions);
1159  }
1161  size_t return_sizes[max_work_item_dimensions];
1162 
1163  int max_x = 0, max_y = 0, max_z = 0;
1165  hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX,
1166  device->get()) == hipSuccess);
1167  sycl::detail::pi::assertion(max_x >= 0);
1168 
1170  hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY,
1171  device->get()) == hipSuccess);
1172  sycl::detail::pi::assertion(max_y >= 0);
1173 
1175  hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ,
1176  device->get()) == hipSuccess);
1177  sycl::detail::pi::assertion(max_z >= 0);
1178 
1179  return_sizes[0] = size_t(max_x);
1180  return_sizes[1] = size_t(max_y);
1181  return_sizes[2] = size_t(max_z);
1182  return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1183  param_value_size_ret, return_sizes);
1184  }
1185 
1187  size_t return_sizes[max_work_item_dimensions];
1188  int max_x = 0, max_y = 0, max_z = 0;
1190  hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxGridDimX,
1191  device->get()) == hipSuccess);
1192  sycl::detail::pi::assertion(max_x >= 0);
1193 
1195  hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxGridDimY,
1196  device->get()) == hipSuccess);
1197  sycl::detail::pi::assertion(max_y >= 0);
1198 
1200  hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxGridDimZ,
1201  device->get()) == hipSuccess);
1202  sycl::detail::pi::assertion(max_z >= 0);
1203 
1204  return_sizes[0] = size_t(max_x);
1205  return_sizes[1] = size_t(max_y);
1206  return_sizes[2] = size_t(max_z);
1207  return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1208  param_value_size_ret, return_sizes);
1209  }
1210 
1212  int max_work_group_size = 0;
1214  hipDeviceGetAttribute(&max_work_group_size,
1215  hipDeviceAttributeMaxThreadsPerBlock,
1216  device->get()) == hipSuccess);
1217 
1218  sycl::detail::pi::assertion(max_work_group_size >= 0);
1219 
1220  return getInfo(param_value_size, param_value, param_value_size_ret,
1221  size_t(max_work_group_size));
1222  }
1224  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1225  }
1227  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1228  }
1230  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1231  }
1233  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1234  }
1236  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1237  }
1239  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1240  }
1242  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1243  }
1245  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1246  }
1248  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1249  }
1251  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1252  }
1254  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1255  }
1257  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1258  }
1260  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1261  }
1263  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1264  }
1266  // Number of sub-groups = max block size / warp size + possible remainder
1267  int max_threads = 0;
1269  hipDeviceGetAttribute(&max_threads,
1270  hipDeviceAttributeMaxThreadsPerBlock,
1271  device->get()) == hipSuccess);
1272  int warpSize = 0;
1274  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
1275  device->get()) == hipSuccess);
1276  int maxWarps = (max_threads + warpSize - 1) / warpSize;
1277  return getInfo(param_value_size, param_value, param_value_size_ret,
1278  static_cast<uint32_t>(maxWarps));
1279  }
1281  // Volta provides independent thread scheduling
1282  // TODO: Revisit for previous generation GPUs
1283  int major = 0;
1285  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor,
1286  device->get()) == hipSuccess);
1287  bool ifp = (major >= 7);
1288  return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
1289  }
1291  int warpSize = 0;
1293  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
1294  device->get()) == hipSuccess);
1295  size_t sizes[1] = {static_cast<size_t>(warpSize)};
1296  return getInfoArray<size_t>(1, param_value_size, param_value,
1297  param_value_size_ret, sizes);
1298  }
1300  int clock_freq = 0;
1302  hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate,
1303  device->get()) == hipSuccess);
1304  sycl::detail::pi::assertion(clock_freq >= 0);
1305  return getInfo(param_value_size, param_value, param_value_size_ret,
1306  pi_uint32(clock_freq) / 1000u);
1307  }
1309  auto bits = pi_uint32{std::numeric_limits<uintptr_t>::digits};
1310  return getInfo(param_value_size, param_value, param_value_size_ret, bits);
1311  }
1313  // Max size of memory object allocation in bytes.
1314  // The minimum value is max(min(1024 × 1024 ×
1315  // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE),
1316  // 32 × 1024 × 1024) for devices that are not of type
1317  // CL_DEVICE_TYPE_CUSTOM.
1318 
1319  size_t global = 0;
1320  sycl::detail::pi::assertion(hipDeviceTotalMem(&global, device->get()) ==
1321  hipSuccess);
1322 
1323  auto quarter_global = static_cast<pi_uint32>(global / 4u);
1324 
1325  auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
1326  32u * 1024u * 1024u);
1327 
1328  return getInfo(param_value_size, param_value, param_value_size_ret,
1329  pi_uint64{max_alloc});
1330  }
1332  return getInfo(param_value_size, param_value, param_value_size_ret,
1333  PI_TRUE);
1334  }
1336  // This call doesn't match to HIP as it doesn't have images, but instead
1337  // surfaces and textures. No clear call in the HIP API to determine this,
1338  // but some searching found as of SM 2.x 128 are supported.
1339  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1340  }
1342  // This call doesn't match to HIP as it doesn't have images, but instead
1343  // surfaces and textures. No clear call in the HIP API to determine this,
1344  // but some searching found as of SM 2.x 128 are supported.
1345  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1346  }
1347 
1349  // Take the smaller of maximum surface and maximum texture height.
1350  int tex_height = 0;
1352  hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight,
1353  device->get()) == hipSuccess);
1354  sycl::detail::pi::assertion(tex_height >= 0);
1355  int surf_height = 0;
1357  hipDeviceGetAttribute(&surf_height,
1358  hipDeviceAttributeMaxTexture2DHeight,
1359  device->get()) == hipSuccess);
1360  sycl::detail::pi::assertion(surf_height >= 0);
1361 
1362  int min = std::min(tex_height, surf_height);
1363 
1364  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1365  }
1367  // Take the smaller of maximum surface and maximum texture width.
1368  int tex_width = 0;
1370  hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth,
1371  device->get()) == hipSuccess);
1372  sycl::detail::pi::assertion(tex_width >= 0);
1373  int surf_width = 0;
1375  hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth,
1376  device->get()) == hipSuccess);
1377  sycl::detail::pi::assertion(surf_width >= 0);
1378 
1379  int min = std::min(tex_width, surf_width);
1380 
1381  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1382  }
1384  // Take the smaller of maximum surface and maximum texture height.
1385  int tex_height = 0;
1387  hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight,
1388  device->get()) == hipSuccess);
1389  sycl::detail::pi::assertion(tex_height >= 0);
1390  int surf_height = 0;
1392  hipDeviceGetAttribute(&surf_height,
1393  hipDeviceAttributeMaxTexture3DHeight,
1394  device->get()) == hipSuccess);
1395  sycl::detail::pi::assertion(surf_height >= 0);
1396 
1397  int min = std::min(tex_height, surf_height);
1398 
1399  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1400  }
1402  // Take the smaller of maximum surface and maximum texture width.
1403  int tex_width = 0;
1405  hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth,
1406  device->get()) == hipSuccess);
1407  sycl::detail::pi::assertion(tex_width >= 0);
1408  int surf_width = 0;
1410  hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth,
1411  device->get()) == hipSuccess);
1412  sycl::detail::pi::assertion(surf_width >= 0);
1413 
1414  int min = std::min(tex_width, surf_width);
1415 
1416  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1417  }
1419  // Take the smaller of maximum surface and maximum texture depth.
1420  int tex_depth = 0;
1422  hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth,
1423  device->get()) == hipSuccess);
1424  sycl::detail::pi::assertion(tex_depth >= 0);
1425  int surf_depth = 0;
1427  hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth,
1428  device->get()) == hipSuccess);
1429  sycl::detail::pi::assertion(surf_depth >= 0);
1430 
1431  int min = std::min(tex_depth, surf_depth);
1432 
1433  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1434  }
1436  // Take the smaller of maximum surface and maximum texture width.
1437  int tex_width = 0;
1439  hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth,
1440  device->get()) == hipSuccess);
1441  sycl::detail::pi::assertion(tex_width >= 0);
1442  int surf_width = 0;
1444  hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth,
1445  device->get()) == hipSuccess);
1446  sycl::detail::pi::assertion(surf_width >= 0);
1447 
1448  int min = std::min(tex_width, surf_width);
1449 
1450  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1451  }
1453  return getInfo(param_value_size, param_value, param_value_size_ret,
1454  size_t(0));
1455  }
1457  // This call is kind of meaningless for HIP, as samplers don't exist.
1458  // Closest thing is textures, which is 128.
1459  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1460  }
1462  // __global__ function parameters are passed to the device via constant
1463  // memory and are limited to 4 KB.
1464  return getInfo(param_value_size, param_value, param_value_size_ret,
1465  size_t{4000u});
1466  }
1468  int mem_base_addr_align = 0;
1470  hipDeviceGetAttribute(&mem_base_addr_align,
1471  hipDeviceAttributeTextureAlignment,
1472  device->get()) == hipSuccess);
1473  // Multiply by 8 as clGetDeviceInfo returns this value in bits
1474  mem_base_addr_align *= 8;
1475  return getInfo(param_value_size, param_value, param_value_size_ret,
1476  mem_base_addr_align);
1477  }
1479  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1480  }
1485  return getInfo(param_value_size, param_value, param_value_size_ret, config);
1486  }
1490  return getInfo(param_value_size, param_value, param_value_size_ret, config);
1491  }
1493  return getInfo(param_value_size, param_value, param_value_size_ret,
1495  }
1497  // The value is documented for all existing GPUs in the HIP programming
1498  // guidelines, section "H.3.2. Global Memory".
1499  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1500  }
1502  int cache_size = 0;
1504  hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize,
1505  device->get()) == hipSuccess);
1506  sycl::detail::pi::assertion(cache_size >= 0);
1507  // The L2 cache is global to the GPU.
1508  return getInfo(param_value_size, param_value, param_value_size_ret,
1509  pi_uint64(cache_size));
1510  }
1512  size_t bytes = 0;
1513  // Runtime API has easy access to this value, driver API info is scarce.
1514  sycl::detail::pi::assertion(hipDeviceTotalMem(&bytes, device->get()) ==
1515  hipSuccess);
1516  return getInfo(param_value_size, param_value, param_value_size_ret,
1517  pi_uint64{bytes});
1518  }
1520  unsigned int constant_memory = 0;
1521 
1522  // hipDeviceGetAttribute takes a int*, however the size of the constant
1523  // memory on AMD GPU may be larger than what can fit in the positive part
1524  // of a signed integer, so use an unsigned integer and cast the pointer to
1525  // int*.
1527  hipDeviceGetAttribute(reinterpret_cast<int *>(&constant_memory),
1528  hipDeviceAttributeTotalConstantMemory,
1529  device->get()) == hipSuccess);
1530 
1531  return getInfo(param_value_size, param_value, param_value_size_ret,
1532  pi_uint64(constant_memory));
1533  }
1535  // TODO: is there a way to retrieve this from HIP driver API?
1536  // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX
1537  // 1060 3GB
1538  return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
1539  }
1541  return getInfo(param_value_size, param_value, param_value_size_ret,
1543  }
1545  // OpenCL's "local memory" maps most closely to HIP's "shared memory".
1546  // HIP has its own definition of "local memory", which maps to OpenCL's
1547  // "private memory".
1548  int local_mem_size = 0;
1550  hipDeviceGetAttribute(&local_mem_size,
1551  hipDeviceAttributeMaxSharedMemoryPerBlock,
1552  device->get()) == hipSuccess);
1553  sycl::detail::pi::assertion(local_mem_size >= 0);
1554  return getInfo(param_value_size, param_value, param_value_size_ret,
1555  pi_uint64(local_mem_size));
1556  }
1558  int ecc_enabled = 0;
1560  hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled,
1561  device->get()) == hipSuccess);
1562 
1563  sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1));
1564  auto result = static_cast<pi_bool>(ecc_enabled);
1565  return getInfo(param_value_size, param_value, param_value_size_ret, result);
1566  }
1568  int is_integrated = 0;
1570  hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated,
1571  device->get()) == hipSuccess);
1572 
1573  sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1));
1574  auto result = static_cast<pi_bool>(is_integrated);
1575  return getInfo(param_value_size, param_value, param_value_size_ret, result);
1576  }
1578  // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX
1579  // 1060 3GB
1580  return getInfo(param_value_size, param_value, param_value_size_ret,
1581  size_t{1000u});
1582  }
1584  return getInfo(param_value_size, param_value, param_value_size_ret,
1585  PI_TRUE);
1586  }
1587  case PI_DEVICE_INFO_AVAILABLE: {
1588  return getInfo(param_value_size, param_value, param_value_size_ret,
1589  PI_TRUE);
1590  }
1592  return getInfo(param_value_size, param_value, param_value_size_ret,
1593  PI_TRUE);
1594  }
1596  return getInfo(param_value_size, param_value, param_value_size_ret,
1597  PI_TRUE);
1598  }
1600  return getInfo(param_value_size, param_value, param_value_size_ret,
1601  PI_TRUE);
1602  }
1604  auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL;
1605  return getInfo(param_value_size, param_value, param_value_size_ret,
1606  capability);
1607  }
1609  // The mandated minimum capability:
1610  auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE |
1612  return getInfo(param_value_size, param_value, param_value_size_ret,
1613  capability);
1614  }
1616  // The mandated minimum capability:
1617  auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE;
1618  return getInfo(param_value_size, param_value, param_value_size_ret,
1619  capability);
1620  }
1622  // An empty string is returned if no built-in kernels are supported by the
1623  // device.
1624  return getInfo(param_value_size, param_value, param_value_size_ret, "");
1625  }
1626  case PI_DEVICE_INFO_PLATFORM: {
1627  return getInfo(param_value_size, param_value, param_value_size_ret,
1628  device->get_platform());
1629  }
1630  case PI_DEVICE_INFO_NAME: {
1631  static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u;
1632  char name[MAX_DEVICE_NAME_LENGTH];
1633  sycl::detail::pi::assertion(hipDeviceGetName(name, MAX_DEVICE_NAME_LENGTH,
1634  device->get()) == hipSuccess);
1635 
1636  // On AMD GPUs hipDeviceGetName returns an empty string, so return the arch
1637  // name instead; this is also what AMD OpenCL devices return.
1638  if (strlen(name) == 0) {
1639  hipDeviceProp_t props;
1641  hipGetDeviceProperties(&props, device->get()) == hipSuccess);
1642 
1643  return getInfoArray(strlen(props.gcnArchName) + 1, param_value_size,
1644  param_value, param_value_size_ret, props.gcnArchName);
1645  }
1646  return getInfoArray(strlen(name) + 1, param_value_size, param_value,
1647  param_value_size_ret, name);
1648  }
1649  case PI_DEVICE_INFO_VENDOR: {
1650  return getInfo(param_value_size, param_value, param_value_size_ret,
1651  "AMD Corporation");
1652  }
1654  auto version = getHipVersionString();
1655  return getInfo(param_value_size, param_value, param_value_size_ret,
1656  version.c_str());
1657  }
1658  case PI_DEVICE_INFO_PROFILE: {
1659  return getInfo(param_value_size, param_value, param_value_size_ret, "HIP");
1660  }
1662  return getInfo(param_value_size, param_value, param_value_size_ret,
1663  device->get_reference_count());
1664  }
1665  case PI_DEVICE_INFO_VERSION: {
1666  return getInfo(param_value_size, param_value, param_value_size_ret,
1667  "PI 0.0");
1668  }
1670  return getInfo(param_value_size, param_value, param_value_size_ret, "");
1671  }
1673  // TODO: Remove comment when HIP supports native asserts.
1674  // DEVICELIB_ASSERT extension is set so fallback assert
1675  // postprocessing is a NOP. HIP 4.3 docs indicate support for
1676  // native asserts is in progress.
1677  std::string SupportedExtensions = "";
1678  SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT;
1679  SupportedExtensions += " ";
1680 
1681  hipDeviceProp_t props;
1682  sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) ==
1683  hipSuccess);
1684  if (props.arch.hasDoubles) {
1685  SupportedExtensions += "cl_khr_fp64 ";
1686  }
1687 
1688  return getInfo(param_value_size, param_value, param_value_size_ret,
1689  SupportedExtensions.c_str());
1690  }
1692  // The minimum value for the FULL profile is 1 MB.
1693  return getInfo(param_value_size, param_value, param_value_size_ret,
1694  size_t{1024u});
1695  }
1697  return getInfo(param_value_size, param_value, param_value_size_ret,
1698  PI_TRUE);
1699  }
1701  return getInfo(param_value_size, param_value, param_value_size_ret,
1702  nullptr);
1703  }
1705  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1706  }
1708  return getInfo(param_value_size, param_value, param_value_size_ret,
1709  static_cast<pi_device_partition_property>(0u));
1710  }
1712  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1713  }
1715  return getInfo(param_value_size, param_value, param_value_size_ret,
1716  static_cast<pi_device_partition_property>(0u));
1717  }
1718 
1719  // Intel USM extensions
1720 
1722  // from cl_intel_unified_shared_memory: "The host memory access capabilities
1723  // apply to any host allocation."
1724  //
1725  // query if/how the device can access page-locked host memory, possibly
1726  // through PCIe, using the same pointer as the host
1727  pi_bitfield value = {};
1728  // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
1729  // the device shares a unified address space with the host
1730  if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1731  // compute capability 6.x introduces operations that are atomic with
1732  // respect to other CPUs and GPUs in the system
1735  } else {
1736  // on GPU architectures with compute capability lower than 6.x, atomic
1737  // operations from the GPU to CPU memory will not be atomic with respect
1738  // to CPU initiated atomic operations
1740  }
1741  //}
1742  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1743  }
1745  // from cl_intel_unified_shared_memory:
1746  // "The device memory access capabilities apply to any device allocation
1747  // associated with this device."
1748  //
1749  // query how the device can access memory allocated on the device itself (?)
1753  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1754  }
1756  // from cl_intel_unified_shared_memory:
1757  // "The single device shared memory access capabilities apply to any shared
1758  // allocation associated with this device."
1759  //
1760  // query if/how the device can access managed memory associated to it
1761  pi_bitfield value = {};
1762  if (getAttribute(device, hipDeviceAttributeManagedMemory)) {
1763  // the device can allocate managed memory on this system
1765  }
1766  if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) {
1767  // the device can coherently access managed memory concurrently with the
1768  // CPU
1769  value |= PI_USM_CONCURRENT_ACCESS;
1770  if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1771  // compute capability 6.x introduces operations that are atomic with
1772  // respect to other CPUs and GPUs in the system
1774  }
1775  }
1776  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1777  }
1779  // from cl_intel_unified_shared_memory:
1780  // "The cross-device shared memory access capabilities apply to any shared
1781  // allocation associated with this device, or to any shared memory
1782  // allocation on another device that also supports the same cross-device
1783  // shared memory access capability."
1784  //
1785  // query if/how the device can access managed memory associated to other
1786  // devices
1787  pi_bitfield value = {};
1788  if (getAttribute(device, hipDeviceAttributeManagedMemory)) {
1789  // the device can allocate managed memory on this system
1790  value |= PI_USM_ACCESS;
1791  }
1792  if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) {
1793  // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
1794  // attribute can coherently access managed memory concurrently with the
1795  // CPU
1796  value |= PI_USM_CONCURRENT_ACCESS;
1797  }
1798  if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1799  // compute capability 6.x introduces operations that are atomic with
1800  // respect to other CPUs and GPUs in the system
1801  if (value & PI_USM_ACCESS)
1802  value |= PI_USM_ATOMIC_ACCESS;
1803  if (value & PI_USM_CONCURRENT_ACCESS)
1805  }
1806  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1807  }
1809  // from cl_intel_unified_shared_memory:
1810  // "The shared system memory access capabilities apply to any allocations
1811  // made by a system allocator, such as malloc or new."
1812  //
1813  // query if/how the device can access pageable host memory allocated by the
1814  // system allocator
1815  pi_bitfield value = {};
1816  if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) {
1817  // the link between the device and the host does not support native
1818  // atomic operations
1820  }
1821  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1822  }
1823 
1824  case PI_DEVICE_INFO_ATOMIC_64: {
1825  // TODO: Reconsider it when AMD supports SYCL_USE_NATIVE_FP_ATOMICS.
1826  hipDeviceProp_t props;
1827  sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) ==
1828  hipSuccess);
1829  return getInfo(param_value_size, param_value, param_value_size_ret,
1830  props.arch.hasGlobalInt64Atomics &&
1831  props.arch.hasSharedInt64Atomics);
1832  }
1833 
1835  size_t FreeMemory = 0;
1836  size_t TotalMemory = 0;
1837  sycl::detail::pi::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) ==
1838  hipSuccess,
1839  "failed hipMemGetInfo() API.");
1840  return getInfo(param_value_size, param_value, param_value_size_ret,
1841  FreeMemory);
1842  }
1843 
1845  int value = 0;
1847  hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryClockRate,
1848  device->get()) == hipSuccess);
1849  sycl::detail::pi::assertion(value >= 0);
1850  // Convert kilohertz to megahertz when returning.
1851  return getInfo(param_value_size, param_value, param_value_size_ret,
1852  value / 1000);
1853  }
1854 
1856  int value = 0;
1858  hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryBusWidth,
1859  device->get()) == hipSuccess);
1860  sycl::detail::pi::assertion(value >= 0);
1861  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1862  }
1864  return getInfo(param_value_size, param_value, param_value_size_ret,
1865  pi_int32{1});
1866  }
1867 
1868  case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
1869  // SYCL2020 4.6.4.2 minimum mandated capabilities for
1870  // atomic_memory_order_capabilities.
1871  pi_memory_order_capabilities capabilities = PI_MEMORY_ORDER_RELAXED;
1872  return getInfo(param_value_size, param_value, param_value_size_ret,
1873  capabilities);
1874  }
1875  case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
1876  case PI_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: {
1877  // SYCL2020 4.6.4.2 minimum mandated capabilities for
1878  // atomic_fence/memory_scope_capabilities.
1879  // Because scopes are hierarchical, wider scopes support all narrower
1880  // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and
1881  // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382)
1882  pi_memory_scope_capabilities capabilities = PI_MEMORY_SCOPE_WORK_ITEM |
1883  PI_MEMORY_SCOPE_SUB_GROUP |
1884  PI_MEMORY_SCOPE_WORK_GROUP;
1885  return getInfo(param_value_size, param_value, param_value_size_ret,
1886  capabilities);
1887  }
1888  case PI_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: {
1889  // SYCL2020 4.6.4.2 minimum mandated capabilities for
1890  // atomic_fence_order_capabilities.
1891  pi_memory_order_capabilities capabilities =
1892  PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
1893  PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
1894  return getInfo(param_value_size, param_value, param_value_size_ret,
1895  capabilities);
1896  }
1897 
1898  case PI_DEVICE_INFO_DEVICE_ID: {
1899  int value = 0;
1900  sycl::detail::pi::assertion(
1901  hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId,
1902  device->get()) == hipSuccess);
1903  sycl::detail::pi::assertion(value >= 0);
1904  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1905  }
1906 
1907  case PI_DEVICE_INFO_UUID: {
1908 #if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) || \
1909  HIP_VERSION_MAJOR > 5)
1910  hipUUID uuid = {};
1911  // Supported since 5.2+
1912  sycl::detail::pi::assertion(hipDeviceGetUuid(&uuid, device->get()) ==
1913  hipSuccess);
1914  std::array<unsigned char, 16> name;
1915  std::copy(uuid.bytes, uuid.bytes + 16, name.begin());
1916  return getInfoArray(16, param_value_size, param_value, param_value_size_ret,
1917  name.data());
1918 #endif
1919  return PI_ERROR_INVALID_VALUE;
1920  }
1921 
1922  // TODO: Investigate if this information is available on HIP.
1932  setErrorMessage("HIP backend does not support this query",
1933  PI_ERROR_INVALID_ARG_VALUE);
1934  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
1935 
1936  default:
1938  }
1939  sycl::detail::pi::die("Device info request not implemented");
1940  return {};
1941 }
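
// To illustrate how the USM queries handled above are typically consumed, here
// is a minimal, illustrative sketch; it is not part of pi_hip.cpp. The entry
// point and the PI_USM_* flags appear in the listing above, while the helper
// name queryCrossSharedUsm is purely hypothetical.
static bool queryCrossSharedUsm(pi_device device) {
  pi_bitfield caps = 0;
  pi_result err =
      hip_piDeviceGetInfo(device, PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT,
                          sizeof(caps), &caps, nullptr);
  if (err != PI_SUCCESS)
    return false;
  // PI_USM_ACCESS: the device can access the allocation at all;
  // PI_USM_CONCURRENT_ACCESS: access may overlap with host access;
  // PI_USM_ATOMIC_ACCESS: atomics on that memory are supported.
  return (caps & PI_USM_ACCESS) && (caps & PI_USM_ATOMIC_ACCESS);
}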
1942 
1950  pi_native_handle *nativeHandle) {
1951  *nativeHandle = static_cast<pi_native_handle>(device->get());
1952  return PI_SUCCESS;
1953 }
1954 
1965  pi_platform platform,
1966  pi_device *device) {
1967  (void)nativeHandle;
1968  (void)platform;
1969  (void)device;
1971  "Creation of PI device from native handle not implemented");
1972  return {};
1973 }
1974 
1975 /* Context APIs */
1976 
1996  pi_uint32 num_devices, const pi_device *devices,
1997  void (*pfn_notify)(const char *errinfo,
1998  const void *private_info,
1999  size_t cb, void *user_data),
2000  void *user_data, pi_context *retcontext) {
2001 
2002  assert(devices != nullptr);
2003  // TODO: How to implement context callback?
2004  assert(pfn_notify == nullptr);
2005  assert(user_data == nullptr);
2006  assert(num_devices == 1);
2007  // Need input context
2008  assert(retcontext != nullptr);
2009  pi_result errcode_ret = PI_SUCCESS;
2010 
2011  // Parse properties.
2012  bool property_hip_primary = false;
2013  while (properties && (0 != *properties)) {
2014  // Consume property ID.
2015  pi_context_properties id = *properties;
2016  ++properties;
2017  // Consume property value.
2018  pi_context_properties value = *properties;
2019  ++properties;
2020  switch (id) {
2022  assert(value == PI_FALSE || value == PI_TRUE);
2023  property_hip_primary = static_cast<bool>(value);
2024  break;
2025  default:
2026  // Unknown property.
2028  "Unknown piContextCreate property in property list");
2029  return PI_ERROR_INVALID_VALUE;
2030  }
2031  }
2032 
2033  std::unique_ptr<_pi_context> piContextPtr{nullptr};
2034  try {
2035  hipCtx_t current = nullptr;
2036 
2037  if (property_hip_primary) {
2038  // Use the HIP primary context and assume that we want to use it
2039  // immediately as we want to forgo context switches.
2040  hipCtx_t Ctxt;
2041  errcode_ret =
2042  PI_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Ctxt, devices[0]->get()));
2043  piContextPtr = std::unique_ptr<_pi_context>(
2044  new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
2045  errcode_ret = PI_CHECK_ERROR(hipCtxPushCurrent(Ctxt));
2046  } else {
2047  // Create a scoped context.
2048  hipCtx_t newContext;
2049  PI_CHECK_ERROR(hipCtxGetCurrent(&current));
2050  errcode_ret = PI_CHECK_ERROR(
2051  hipCtxCreate(&newContext, hipDeviceMapHost, devices[0]->get()));
2052  piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
2053  _pi_context::kind::user_defined, newContext, *devices});
2054  }
2055 
2056  static std::once_flag initFlag;
2057  std::call_once(
2058  initFlag,
2059  [](pi_result &err) {
2060  // Use default stream to record base event counter
2061  PI_CHECK_ERROR(
2062  hipEventCreateWithFlags(&_pi_platform::evBase_, hipEventDefault));
2063  PI_CHECK_ERROR(hipEventRecord(_pi_platform::evBase_, 0));
2064  },
2065  errcode_ret);
2066 
2067  // For non-primary scoped contexts keep the last active on top of the stack
2068  // as `cuCtxCreate` replaces it implicitly otherwise.
2069  // Primary contexts are kept on top of the stack, so the previous context
2070  // is not queried and therefore not recovered.
2071  if (current != nullptr) {
2072  PI_CHECK_ERROR(hipCtxSetCurrent(current));
2073  }
2074 
2075  *retcontext = piContextPtr.release();
2076  } catch (pi_result err) {
2077  errcode_ret = err;
2078  } catch (...) {
2079  errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
2080  }
2081  return errcode_ret;
2082 }
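
// For context, a caller-side usage sketch (illustrative only, not part of this
// file) of the zero-terminated id/value property list parsed above.
// kPrimaryContextProp is a hypothetical placeholder for the primary-context
// property id consumed by the switch above, which is elided in this listing.
static pi_result createContextOnPrimary(pi_device dev, pi_context *ctx) {
  const pi_context_properties kPrimaryContextProp = 0; // hypothetical id
  pi_context_properties props[] = {kPrimaryContextProp, PI_TRUE, 0};
  return hip_piContextCreate(props, 1, &dev, nullptr, nullptr, ctx);
}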
2083 
2084  pi_result hip_piContextRelease(pi_context ctxt) {
2085 
2086  assert(ctxt != nullptr);
2087 
2088  if (ctxt->decrement_reference_count() > 0) {
2089  return PI_SUCCESS;
2090  }
2091  ctxt->invoke_extended_deleters();
2092 
2093  std::unique_ptr<_pi_context> context{ctxt};
2094 
2095  if (!ctxt->is_primary()) {
2096  hipCtx_t hipCtxt = ctxt->get();
2097  // hipCtxSynchronize is not supported on the AMD platform, so we can just
2098  // destroy the context; for NVIDIA, make sure it is synchronized first.
2099 #if defined(__HIP_PLATFORM_NVIDIA__)
2100  hipCtx_t current = nullptr;
2101  PI_CHECK_ERROR(hipCtxGetCurrent(&current));
2102  if (hipCtxt != current) {
2103  PI_CHECK_ERROR(hipCtxPushCurrent(hipCtxt));
2104  }
2105  PI_CHECK_ERROR(hipCtxSynchronize());
2106  PI_CHECK_ERROR(hipCtxGetCurrent(&current));
2107  if (hipCtxt == current) {
2108  PI_CHECK_ERROR(hipCtxPopCurrent(&current));
2109  }
2110 #endif
2111  return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt));
2112  } else {
2113  // Primary context is not destroyed, but released
2114  hipDevice_t hipDev = ctxt->get_device()->get();
2115  hipCtx_t current;
2116  PI_CHECK_ERROR(hipCtxPopCurrent(&current));
2117  return PI_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev));
2118  }
2119 
2120  hipCtx_t hipCtxt = ctxt->get();
2121  return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt));
2122 }
2123 
2131  pi_native_handle *nativeHandle) {
2132  *nativeHandle = reinterpret_cast<pi_native_handle>(context->get());
2133  return PI_SUCCESS;
2134 }
2135 
2145  pi_uint32 num_devices,
2146  const pi_device *devices,
2147  bool ownNativeHandle,
2148  pi_context *context) {
2149  (void)nativeHandle;
2150  (void)num_devices;
2151  (void)devices;
2152  (void)ownNativeHandle;
2153  (void)context;
2155  "Creation of PI context from native handle not implemented");
2156  return {};
2157 }
2158 
2164  size_t size, void *host_ptr, pi_mem *ret_mem,
2165  const pi_mem_properties *properties) {
2166  // Need input memory object
2167  assert(ret_mem != nullptr);
2168  assert((properties == nullptr || *properties == 0) &&
2169  "no mem properties goes to HIP RT yet");
2170  // Currently, USE_HOST_PTR is not implemented using host register
2171  // since this triggers a weird segfault after the program ends.
2172  // Setting this constant to true enables testing that behavior.
2173  const bool enableUseHostPtr = false;
2174  const bool performInitialCopy =
2175  (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
2176  ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr);
2177  pi_result retErr = PI_SUCCESS;
2178  pi_mem retMemObj = nullptr;
2179 
2180  try {
2181  ScopedContext active(context);
2182  void *ptr;
2185 
2186  if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
2187  retErr = PI_CHECK_ERROR(
2188  hipHostRegister(host_ptr, size, hipHostRegisterMapped));
2189  retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0));
2191  } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
2192  retErr = PI_CHECK_ERROR(hipHostMalloc(&host_ptr, size));
2193  retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0));
2195  } else {
2196  retErr = PI_CHECK_ERROR(hipMalloc(&ptr, size));
2197  if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
2199  }
2200  }
2201 
2202  if (retErr == PI_SUCCESS) {
2203  pi_mem parentBuffer = nullptr;
2204 
2205  auto devPtr =
2206  reinterpret_cast<_pi_mem::mem_::buffer_mem_::native_type>(ptr);
2207  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
2208  context, parentBuffer, allocMode, devPtr, host_ptr, size});
2209  if (piMemObj != nullptr) {
2210  retMemObj = piMemObj.release();
2211  if (performInitialCopy) {
2212  // Operates on the default stream of the current HIP context.
2213  retErr = PI_CHECK_ERROR(hipMemcpyHtoD(devPtr, host_ptr, size));
2214  // Synchronize with the default stream implicitly used by hipMemcpyHtoD
2215  // to make buffer data available on device before any other PI call
2216  // uses it.
2217  if (retErr == PI_SUCCESS) {
2218  hipStream_t defaultStream = 0;
2219  retErr = PI_CHECK_ERROR(hipStreamSynchronize(defaultStream));
2220  }
2221  }
2222  } else {
2223  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2224  }
2225  }
2226  } catch (pi_result err) {
2227  retErr = err;
2228  } catch (...) {
2229  retErr = PI_ERROR_OUT_OF_RESOURCES;
2230  }
2231 
2232  *ret_mem = retMemObj;
2233 
2234  return retErr;
2235 }
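
// A short usage sketch (illustrative only, not part of this file):
// PI_MEM_FLAGS_HOST_PTR_COPY selects the hipMalloc + hipMemcpyHtoD path above,
// so the buffer is populated from the host pointer before the call returns.
// The leading context/flags parameters are assumed from the surrounding code.
static pi_result makeInitializedBuffer(pi_context ctx, void *hostData,
                                       size_t bytes, pi_mem *out) {
  return hip_piMemBufferCreate(
      ctx, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, bytes, hostData,
      out, /*properties=*/nullptr);
}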
2236 
2242  assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS");
2243 
2244  pi_result ret = PI_SUCCESS;
2245 
2246  try {
2247 
2248  // Do nothing if there are other references
2249  if (memObj->decrement_reference_count() > 0) {
2250  return PI_SUCCESS;
2251  }
2252 
2253  // make sure memObj is released in case PI_CHECK_ERROR throws
2254  std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2255 
2256  if (memObj->is_sub_buffer()) {
2257  return PI_SUCCESS;
2258  }
2259 
2260  ScopedContext active(uniqueMemObj->get_context());
2261 
2262  if (memObj->mem_type_ == _pi_mem::mem_type::buffer) {
2263  switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2266  ret = PI_CHECK_ERROR(
2267  hipFree((void *)uniqueMemObj->mem_.buffer_mem_.ptr_));
2268  break;
2270  ret = PI_CHECK_ERROR(
2271  hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2272  break;
2274  ret = PI_CHECK_ERROR(
2275  hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2276  };
2277  }
2278 
2279  else if (memObj->mem_type_ == _pi_mem::mem_type::surface) {
2280  ret = PI_CHECK_ERROR(hipDestroySurfaceObject(
2281  uniqueMemObj->mem_.surface_mem_.get_surface()));
2282  auto array = uniqueMemObj->mem_.surface_mem_.get_array();
2283  ret = PI_CHECK_ERROR(hipFreeArray(array));
2284  }
2285 
2286  } catch (pi_result err) {
2287  ret = err;
2288  } catch (...) {
2289  ret = PI_ERROR_OUT_OF_RESOURCES;
2290  }
2291 
2292  if (ret != PI_SUCCESS) {
2293  // A reported HIP error is either an implementation or an asynchronous HIP
2294  // error for which it is unclear if the function that reported it succeeded
2295  // or not. Either way, the state of the program is compromised and likely
2296  // unrecoverable.
2298  "Unrecoverable program state reached in hip_piMemRelease");
2299  }
2300 
2301  return PI_SUCCESS;
2302 }
2303 
2309  pi_buffer_create_type buffer_create_type,
2310  void *buffer_create_info, pi_mem *memObj) {
2311  assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT");
2312  assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS");
2313  assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT");
2314 
2315  // Default value for flags means PI_MEM_FLAGS_ACCCESS_RW.
2316  if (flags == 0) {
2317  flags = PI_MEM_FLAGS_ACCESS_RW;
2318  }
2319 
2320  assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE");
2321  assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) &&
2322  "PI_ERROR_INVALID_VALUE");
2323  assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE");
2324  assert(memObj != nullptr);
2325 
2326  const auto bufferRegion =
2327  *reinterpret_cast<pi_buffer_region>(buffer_create_info);
2328  assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE");
2329 
2330  assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2331  "Overflow");
2332  assert(((bufferRegion.origin + bufferRegion.size) <=
2333  parent_buffer->mem_.buffer_mem_.get_size()) &&
2334  "PI_ERROR_INVALID_BUFFER_SIZE");
2335  // Retained indirectly due to retaining parent buffer below.
2336  pi_context context = parent_buffer->context_;
2339 
2340  assert(parent_buffer->mem_.buffer_mem_.ptr_ !=
2343  parent_buffer->mem_.buffer_mem_.get_with_offset(bufferRegion.origin);
2344 
2345  void *hostPtr = nullptr;
2346  if (parent_buffer->mem_.buffer_mem_.hostPtr_) {
2347  hostPtr = static_cast<char *>(parent_buffer->mem_.buffer_mem_.hostPtr_) +
2348  bufferRegion.origin;
2349  }
2350 
2351  ReleaseGuard<pi_mem> releaseGuard(parent_buffer);
2352 
2353  std::unique_ptr<_pi_mem> retMemObj{nullptr};
2354  try {
2355  ScopedContext active(context);
2356 
2357  retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{
2358  context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2359  } catch (pi_result err) {
2360  *memObj = nullptr;
2361  return err;
2362  } catch (...) {
2363  *memObj = nullptr;
2364  return PI_ERROR_OUT_OF_HOST_MEMORY;
2365  }
2366 
2367  releaseGuard.dismiss();
2368  *memObj = retMemObj.release();
2369  return PI_SUCCESS;
2370 }
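
// The asserts above reduce to two arithmetic conditions on the requested
// sub-buffer region; a small, illustrative restatement (not part of this file):
static bool subBufferRegionValid(size_t origin, size_t size, size_t parentSize) {
  if (size == 0)
    return false;                 // PI_ERROR_INVALID_BUFFER_SIZE
  if (origin + size < origin)
    return false;                 // overflow, mirrors the "Overflow" assert
  return origin + size <= parentSize;
}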
2371 
2373  size_t expectedQuerySize, void *queryOutput,
2374  size_t *writtenQuerySize) {
2375  (void)memObj;
2376  (void)queriedInfo;
2377  (void)expectedQuerySize;
2378  (void)queryOutput;
2379  (void)writtenQuerySize;
2380 
2381  sycl::detail::pi::die("hip_piMemGetInfo not implemented");
2382 }
2383 
2391  pi_native_handle *nativeHandle) {
2392 #if defined(__HIP_PLATFORM_NVIDIA__)
2394  sizeof(pi_native_handle)) {
2395  // Check that all the upper bits that cannot be represented by
2396  // pi_native_handle are empty.
2397  // NOTE: The following shift might trigger a warning, but the check in the
2398  // if above makes sure that this does not underflow.
2400  mem->mem_.buffer_mem_.get() >> (sizeof(pi_native_handle) * CHAR_BIT);
2401  if (upperBits) {
2402  // Return an error if any of the remaining bits is non-zero.
2403  return PI_ERROR_INVALID_MEM_OBJECT;
2404  }
2405  }
2406  *nativeHandle = static_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2407 #elif defined(__HIP_PLATFORM_AMD__)
2408  *nativeHandle =
2409  reinterpret_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2410 #else
2411 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
2412 #endif
2413  return PI_SUCCESS;
2414 }
2415 
2428  pi_context context,
2429  bool ownNativeHandle,
2430  pi_mem *mem) {
2431  (void)nativeHandle;
2432  (void)context;
2433  (void)ownNativeHandle;
2434  (void)mem;
2435 
2437  "Creation of PI mem from native handle not implemented");
2438  return {};
2439 }
2440 
2448  pi_queue_properties properties, pi_queue *queue) {
2449  try {
2450  std::unique_ptr<_pi_queue> queueImpl{nullptr};
2451 
2452  if (context->get_device() != device) {
2453  *queue = nullptr;
2454  return PI_ERROR_INVALID_DEVICE;
2455  }
2456 
2457  unsigned int flags = 0;
2458 
2459  const bool is_out_of_order =
2461 
2462  std::vector<hipStream_t> computeHipStreams(
2463  is_out_of_order ? _pi_queue::default_num_compute_streams : 1);
2464  std::vector<hipStream_t> transferHipStreams(
2465  is_out_of_order ? _pi_queue::default_num_transfer_streams : 0);
2466 
2467  queueImpl = std::unique_ptr<_pi_queue>(new _pi_queue{
2468  std::move(computeHipStreams), std::move(transferHipStreams), context,
2469  device, properties, flags});
2470 
2471  *queue = queueImpl.release();
2472 
2473  return PI_SUCCESS;
2474  } catch (pi_result err) {
2475 
2476  return err;
2477 
2478  } catch (...) {
2479 
2480  return PI_ERROR_OUT_OF_RESOURCES;
2481  }
2482 }
2483  pi_result hip_piextQueueCreate(pi_context Context, pi_device Device,
2484  pi_queue_properties *Properties,
2485  pi_queue *Queue) {
2486  assert(Properties);
2487  // Expect flags mask to be passed first.
2488  assert(Properties[0] == PI_QUEUE_FLAGS);
2489  if (Properties[0] != PI_QUEUE_FLAGS)
2490  return PI_ERROR_INVALID_VALUE;
2491  pi_queue_properties Flags = Properties[1];
2492  // Extra data isn't supported yet.
2493  assert(Properties[2] == 0);
2494  if (Properties[2] != 0)
2495  return PI_ERROR_INVALID_VALUE;
2496  return hip_piQueueCreate(Context, Device, Flags, Queue);
2497 }
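
// For reference, an illustrative caller-side sketch (not part of this file) of
// the Properties layout enforced above: the PI_QUEUE_FLAGS key first, its value
// second, and a terminating zero. A flags value of 0 requests default
// (in-order) behaviour.
static pi_result createDefaultQueue(pi_context ctx, pi_device dev, pi_queue *q) {
  pi_queue_properties props[] = {PI_QUEUE_FLAGS, 0, 0};
  return hip_piextQueueCreate(ctx, dev, props, q);
}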
2498 
2500  size_t param_value_size, void *param_value,
2501  size_t *param_value_size_ret) {
2502  assert(command_queue != nullptr);
2503 
2504  switch (param_name) {
2505  case PI_QUEUE_INFO_CONTEXT:
2506  return getInfo(param_value_size, param_value, param_value_size_ret,
2507  command_queue->context_);
2508  case PI_QUEUE_INFO_DEVICE:
2509  return getInfo(param_value_size, param_value, param_value_size_ret,
2510  command_queue->device_);
2511  case PI_QUEUE_INFO_REFERENCE_COUNT:
2512  return getInfo(param_value_size, param_value, param_value_size_ret,
2513  command_queue->get_reference_count());
2514  case PI_QUEUE_INFO_PROPERTIES:
2515  return getInfo(param_value_size, param_value, param_value_size_ret,
2516  command_queue->properties_);
2518  bool IsReady = command_queue->all_of([](hipStream_t s) -> bool {
2519  const hipError_t ret = hipStreamQuery(s);
2520  if (ret == hipSuccess)
2521  return true;
2522 
2523  if (ret == hipErrorNotReady)
2524  return false;
2525 
2526  PI_CHECK_ERROR(ret);
2527  return false;
2528  });
2529  return getInfo(param_value_size, param_value, param_value_size_ret,
2530  IsReady);
2531  }
2532  default:
2534  }
2535  sycl::detail::pi::die("Queue info request not implemented");
2536  return {};
2537 }
2538 
2539  pi_result hip_piQueueRetain(pi_queue command_queue) {
2540  assert(command_queue != nullptr);
2541  assert(command_queue->get_reference_count() > 0);
2542 
2543  command_queue->increment_reference_count();
2544  return PI_SUCCESS;
2545 }
2546 
2547  pi_result hip_piQueueRelease(pi_queue command_queue) {
2548  assert(command_queue != nullptr);
2549 
2550  if (command_queue->decrement_reference_count() > 0) {
2551  return PI_SUCCESS;
2552  }
2553 
2554  try {
2555  std::unique_ptr<_pi_queue> queueImpl(command_queue);
2556 
2557  ScopedContext active(command_queue->get_context());
2558 
2559  command_queue->for_each_stream([](hipStream_t s) {
2560  PI_CHECK_ERROR(hipStreamSynchronize(s));
2561  PI_CHECK_ERROR(hipStreamDestroy(s));
2562  });
2563 
2564  return PI_SUCCESS;
2565  } catch (pi_result err) {
2566  return err;
2567  } catch (...) {
2568  return PI_ERROR_OUT_OF_RESOURCES;
2569  }
2570 }
2571 
2572  pi_result hip_piQueueFinish(pi_queue command_queue) {
2573 
2574  // set default result to a negative result (avoid false-positive tests)
2575  pi_result result = PI_ERROR_OUT_OF_HOST_MEMORY;
2576 
2577  try {
2578 
2579  assert(command_queue !=
2580  nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code
2581  ScopedContext active(command_queue->get_context());
2582 
2583  command_queue->sync_streams<true>([&result](hipStream_t s) {
2584  result = PI_CHECK_ERROR(hipStreamSynchronize(s));
2585  });
2586 
2587  } catch (pi_result err) {
2588 
2589  result = err;
2590 
2591  } catch (...) {
2592 
2593  result = PI_ERROR_OUT_OF_RESOURCES;
2594  }
2595 
2596  return result;
2597 }
2598 
2599 // There is no HIP counterpart for queue flushing and we don't run into the
2600 // same problem of having to flush cross-queue dependencies as some of the
2601 // other plugins, so it can be left as no-op.
2602  pi_result hip_piQueueFlush(pi_queue command_queue) {
2603  (void)command_queue;
2604  return PI_SUCCESS;
2605 }
2606 
2614  pi_native_handle *nativeHandle) {
2615  ScopedContext active(queue->get_context());
2616  *nativeHandle =
2617  reinterpret_cast<pi_native_handle>(queue->get_next_compute_stream());
2618  return PI_SUCCESS;
2619 }
2620 
2634  pi_context context,
2635  pi_device device,
2636  bool ownNativeHandle,
2637  pi_queue *queue) {
2638  (void)nativeHandle;
2639  (void)context;
2640  (void)device;
2641  (void)queue;
2642  (void)ownNativeHandle;
2644  "Creation of PI queue from native handle not implemented");
2645  return {};
2646 }
2647 
2648  pi_result hip_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
2649  pi_bool blocking_write, size_t offset,
2650  size_t size, void *ptr,
2651  pi_uint32 num_events_in_wait_list,
2652  const pi_event *event_wait_list,
2653  pi_event *event) {
2654 
2655  assert(buffer != nullptr);
2656  assert(command_queue != nullptr);
2657  pi_result retErr = PI_SUCCESS;
2658  std::unique_ptr<_pi_event> retImplEv{nullptr};
2659 
2660  try {
2661  ScopedContext active(command_queue->get_context());
2662  hipStream_t hipStream = command_queue->get_next_transfer_stream();
2663  retErr = enqueueEventsWait(command_queue, hipStream,
2664  num_events_in_wait_list, event_wait_list);
2665 
2666  if (event) {
2667  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2668  PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, hipStream));
2669  retImplEv->start();
2670  }
2671 
2672  retErr = PI_CHECK_ERROR(
2673  hipMemcpyHtoDAsync(buffer->mem_.buffer_mem_.get_with_offset(offset),
2674  ptr, size, hipStream));
2675 
2676  if (event) {
2677  retErr = retImplEv->record();
2678  }
2679 
2680  if (blocking_write) {
2681  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
2682  }
2683 
2684  if (event) {
2685  *event = retImplEv.release();
2686  }
2687  } catch (pi_result err) {
2688  retErr = err;
2689  }
2690  return retErr;
2691 }
2692 
2693  pi_result hip_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2694  pi_bool blocking_read, size_t offset,
2695  size_t size, void *ptr,
2696  pi_uint32 num_events_in_wait_list,
2697  const pi_event *event_wait_list,
2698  pi_event *event) {
2699 
2700  assert(buffer != nullptr);
2701  assert(command_queue != nullptr);
2702  pi_result retErr = PI_SUCCESS;
2703  std::unique_ptr<_pi_event> retImplEv{nullptr};
2704 
2705  try {
2706  ScopedContext active(command_queue->get_context());
2707  hipStream_t hipStream = command_queue->get_next_transfer_stream();
2708  retErr = enqueueEventsWait(command_queue, hipStream,
2709  num_events_in_wait_list, event_wait_list);
2710 
2711  if (event) {
2712  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2713  PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, hipStream));
2714  retImplEv->start();
2715  }
2716 
2717  retErr = PI_CHECK_ERROR(hipMemcpyDtoHAsync(
2718  ptr, buffer->mem_.buffer_mem_.get_with_offset(offset), size,
2719  hipStream));
2720 
2721  if (event) {
2722  retErr = retImplEv->record();
2723  }
2724 
2725  if (blocking_read) {
2726  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
2727  }
2728 
2729  if (event) {
2730  *event = retImplEv.release();
2731  }
2732 
2733  } catch (pi_result err) {
2734  retErr = err;
2735  }
2736  return retErr;
2737 }
2738 
2739 pi_result hip_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
2740 
2741  try {
2742  assert(num_events != 0);
2743  assert(event_list);
2744  if (num_events == 0) {
2745  return PI_ERROR_INVALID_VALUE;
2746  }
2747 
2748  if (!event_list) {
2749  return PI_ERROR_INVALID_EVENT;
2750  }
2751 
2752  auto context = event_list[0]->get_context();
2753  ScopedContext active(context);
2754 
2755  auto waitFunc = [context](pi_event event) -> pi_result {
2756  if (!event) {
2757  return PI_ERROR_INVALID_EVENT;
2758  }
2759 
2760  if (event->get_context() != context) {
2761  return PI_ERROR_INVALID_CONTEXT;
2762  }
2763 
2764  return event->wait();
2765  };
2766  return forLatestEvents(event_list, num_events, waitFunc);
2767  } catch (pi_result err) {
2768  return err;
2769  } catch (...) {
2770  return PI_ERROR_OUT_OF_RESOURCES;
2771  }
2772 }
2773 
2774 pi_result hip_piKernelCreate(pi_program program, const char *kernel_name,
2775  pi_kernel *kernel) {
2776  assert(kernel != nullptr);
2777  assert(program != nullptr);
2778 
2779  pi_result retErr = PI_SUCCESS;
2780  std::unique_ptr<_pi_kernel> retKernel{nullptr};
2781 
2782  try {
2783  ScopedContext active(program->get_context());
2784 
2785  hipFunction_t hipFunc;
2786  retErr = PI_CHECK_ERROR(
2787  hipModuleGetFunction(&hipFunc, program->get(), kernel_name));
2788 
2789  std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset";
2790  hipFunction_t hipFuncWithOffsetParam;
2791  hipError_t offsetRes = hipModuleGetFunction(
2792  &hipFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str());
2793 
2794  // If there is no kernel with global offset parameter we mark it as missing
2795  if (offsetRes == hipErrorNotFound) {
2796  hipFuncWithOffsetParam = nullptr;
2797  } else {
2798  retErr = PI_CHECK_ERROR(offsetRes);
2799  }
2800 
2801  retKernel = std::unique_ptr<_pi_kernel>(
2802  new _pi_kernel{hipFunc, hipFuncWithOffsetParam, kernel_name, program,
2803  program->get_context()});
2804  } catch (pi_result err) {
2805  retErr = err;
2806  } catch (...) {
2807  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2808  }
2809 
2810  *kernel = retKernel.release();
2811  return retErr;
2812 }
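
// The lookup above probes for a second compiled entry point named
// "<kernel_name>_with_offset"; a condensed, illustrative restatement of that
// fallback (not part of this file):
static hipFunction_t tryGetOffsetVariant(hipModule_t module, const char *name) {
  std::string withOffset = std::string(name) + "_with_offset";
  hipFunction_t func = nullptr;
  // hipErrorNotFound is the expected miss; any failure means no offset variant.
  if (hipModuleGetFunction(&func, module, withOffset.c_str()) != hipSuccess)
    return nullptr; // the plain kernel will be used instead
  return func;
}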
2813 
2814  pi_result hip_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index,
2815  size_t arg_size, const void *arg_value) {
2816 
2817  assert(kernel != nullptr);
2818  pi_result retErr = PI_SUCCESS;
2819  try {
2820  if (arg_value) {
2821  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2822  } else {
2823  kernel->set_kernel_local_arg(arg_index, arg_size);
2824  }
2825  } catch (pi_result err) {
2826  retErr = err;
2827  }
2828  return retErr;
2829 }
2830 
2831  pi_result hip_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,
2832  const pi_mem *arg_value) {
2833 
2834  assert(kernel != nullptr);
2835  assert(arg_value != nullptr);
2836 
2837  pi_result retErr = PI_SUCCESS;
2838  try {
2839  pi_mem arg_mem = *arg_value;
2840 
2841  if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) {
2842  auto array = arg_mem->mem_.surface_mem_.get_array();
2843  hipArray_Format Format;
2844  size_t NumChannels;
2845  getArrayDesc(array, Format, NumChannels);
2846  if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 &&
2847  Format != HIP_AD_FORMAT_SIGNED_INT32 &&
2848  Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) {
2850  "PI HIP kernels only support images with channel types int32, "
2851  "uint32, float, and half.");
2852  }
2853  hipSurfaceObject_t hipSurf = arg_mem->mem_.surface_mem_.get_surface();
2854  kernel->set_kernel_arg(arg_index, sizeof(hipSurf), (void *)&hipSurf);
2855  } else
2856 
2857  {
2858  void *hipPtr = arg_mem->mem_.buffer_mem_.get_void();
2859  kernel->set_kernel_arg(arg_index, sizeof(void *), (void *)&hipPtr);
2860  }
2861  } catch (pi_result err) {
2862  retErr = err;
2863  }
2864  return retErr;
2865 }
2866 
2867  pi_result hip_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index,
2868  const pi_sampler *arg_value) {
2869 
2870  assert(kernel != nullptr);
2871  assert(arg_value != nullptr);
2872 
2873  pi_result retErr = PI_SUCCESS;
2874  try {
2875  pi_uint32 samplerProps = (*arg_value)->props_;
2876  kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps);
2877  } catch (pi_result err) {
2878  retErr = err;
2879  }
2880  return retErr;
2881 }
2882 
2883  pi_result hip_piEnqueueKernelLaunch(
2884  pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim,
2885  const size_t *global_work_offset, const size_t *global_work_size,
2886  const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
2887  const pi_event *event_wait_list, pi_event *event) {
2888 
2889  // Preconditions
2890  assert(command_queue != nullptr);
2891  assert(command_queue->get_context() == kernel->get_context());
2892  assert(kernel != nullptr);
2893  assert(global_work_offset != nullptr);
2894  assert(work_dim > 0);
2895  assert(work_dim < 4);
2896 
2897  if (*global_work_size == 0) {
2898  return hip_piEnqueueEventsWaitWithBarrier(
2899  command_queue, num_events_in_wait_list, event_wait_list, event);
2900  }
2901 
2902  // Set the number of threads per block to the number of threads per warp
2903  // by default unless user has provided a better number
2904  size_t threadsPerBlock[3] = {32u, 1u, 1u};
2905  size_t maxWorkGroupSize = 0u;
2906  size_t maxThreadsPerBlock[3] = {};
2907  bool providedLocalWorkGroupSize = (local_work_size != nullptr);
2908 
2909  {
2910  pi_result retError = hip_piDeviceGetInfo(
2911  command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
2912  sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
2913  assert(retError == PI_SUCCESS);
2914  (void)retError;
2915 
2916  retError = hip_piDeviceGetInfo(
2917  command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
2918  sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
2919  assert(retError == PI_SUCCESS);
2920  // The maxWorkGroupSize = 1024 for AMD GPU
2921  // The maxThreadsPerBlock = {1024, 1024, 1024}
2922 
2923  if (providedLocalWorkGroupSize) {
2924  auto isValid = [&](int dim) {
2925  if (local_work_size[dim] > maxThreadsPerBlock[dim])
2926  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2927  // Checks that the local work sizes evenly divide the global work sizes,
2928  // which also implies that the local work sizes are neither larger than
2929  // the global work sizes nor zero.
2930  if (0u == local_work_size[dim])
2931  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2932  if (0u != (global_work_size[dim] % local_work_size[dim]))
2933  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2934  threadsPerBlock[dim] = local_work_size[dim];
2935  return PI_SUCCESS;
2936  };
2937 
2938  for (size_t dim = 0; dim < work_dim; dim++) {
2939  auto err = isValid(dim);
2940  if (err != PI_SUCCESS)
2941  return err;
2942  }
2943  } else {
2944  simpleGuessLocalWorkSize(threadsPerBlock, global_work_size,
2945  maxThreadsPerBlock, kernel);
2946  }
2947  }
2948 
2949  if (maxWorkGroupSize <
2950  size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
2951  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2952  }
2953 
2954  size_t blocksPerGrid[3] = {1u, 1u, 1u};
2955 
2956  for (size_t i = 0; i < work_dim; i++) {
2957  blocksPerGrid[i] =
2958  (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
2959  }
2960 
2961  pi_result retError = PI_SUCCESS;
2962  std::unique_ptr<_pi_event> retImplEv{nullptr};
2963 
2964  try {
2965  ScopedContext active(command_queue->get_context());
2966 
2967  pi_uint32 stream_token;
2968  _pi_stream_guard guard;
2969  hipStream_t hipStream = command_queue->get_next_compute_stream(
2970  num_events_in_wait_list, event_wait_list, guard, &stream_token);
2971  hipFunction_t hipFunc = kernel->get();
2972 
2973  retError = enqueueEventsWait(command_queue, hipStream,
2974  num_events_in_wait_list, event_wait_list);
2975 
2976  // Set the implicit global offset parameter if kernel has offset variant
2977  if (kernel->get_with_offset_parameter()) {
2978  std::uint32_t hip_implicit_offset[3] = {0, 0, 0};
2979  if (global_work_offset) {
2980  for (size_t i = 0; i < work_dim; i++) {
2981  hip_implicit_offset[i] =
2982  static_cast<std::uint32_t>(global_work_offset[i]);
2983  if (global_work_offset[i] != 0) {
2984  hipFunc = kernel->get_with_offset_parameter();
2985  }
2986  }
2987  }
2988  kernel->set_implicit_offset_arg(sizeof(hip_implicit_offset),
2989  hip_implicit_offset);
2990  }
2991 
2992  auto argIndices = kernel->get_arg_indices();
2993 
2994  if (event) {
2995  retImplEv = std::unique_ptr<_pi_event>(
2996  _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue,
2997  hipStream, stream_token));
2998  retImplEv->start();
2999  }
3000 
3001  // Set local mem max size if env var is present
3002  static const char *local_mem_sz_ptr =
3003  std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
3004 
3005  if (local_mem_sz_ptr) {
3006  int device_max_local_mem = 0;
3007  retError = PI_CHECK_ERROR(hipDeviceGetAttribute(
3008  &device_max_local_mem, hipDeviceAttributeMaxSharedMemoryPerBlock,
3009  command_queue->get_device()->get()));
3010 
3011  static const int env_val = std::atoi(local_mem_sz_ptr);
3012  if (env_val <= 0 || env_val > device_max_local_mem) {
3013  setErrorMessage("Invalid value specified for "
3014  "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
3015  PI_ERROR_PLUGIN_SPECIFIC_ERROR);
3016  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3017  }
3018  retError = PI_CHECK_ERROR(hipFuncSetAttribute(
3019  hipFunc, hipFuncAttributeMaxDynamicSharedMemorySize, env_val));
3020  }
3021 
3022  retError = PI_CHECK_ERROR(hipModuleLaunchKernel(
3023  hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
3024  threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2],
3025  kernel->get_local_size(), hipStream, argIndices.data(), nullptr));
3026 
3027  kernel->clear_local_size();
3028 
3029  if (event) {
3030  retError = retImplEv->record();
3031  *event = retImplEv.release();
3032  }
3033  } catch (pi_result err) {
3034  retError = err;
3035  }
3036  return retError;
3037 }
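
// The grid dimensions above come from a ceiling division of each global size by
// the chosen block size; an illustrative helper (not part of this file):
static size_t roundUpBlocks(size_t globalSize, size_t threadsPerBlock) {
  // Example: 1000 work-items with 32 threads per block -> 32 blocks.
  return (globalSize + threadsPerBlock - 1) / threadsPerBlock;
}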
3038 
3040 pi_result
3041 hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args,
3042  size_t cb_args, pi_uint32 num_mem_objects,
3043  const pi_mem *mem_list, const void **args_mem_loc,
3044  pi_uint32 num_events_in_wait_list,
3045  const pi_event *event_wait_list, pi_event *event) {
3046  (void)queue;
3047  (void)user_func;
3048  (void)args;
3049  (void)cb_args;
3050  (void)num_mem_objects;
3051  (void)mem_list;
3052  (void)args_mem_loc;
3053  (void)num_events_in_wait_list;
3054  (void)event_wait_list;
3055  (void)event;
3056 
3057  sycl::detail::pi::die("Not implemented in HIP backend");
3058  return {};
3059 }
3060 
3062 
3064  const pi_image_format *image_format,
3065  const pi_image_desc *image_desc, void *host_ptr,
3066  pi_mem *ret_mem) {
3067 
3068  // Need input memory object
3069  assert(ret_mem != nullptr);
3070  const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
3071  ((flags & PI_MEM_FLAGS_HOST_PTR_USE));
3072  pi_result retErr = PI_SUCCESS;
3073 
3074  // We only support RGBA channel order
3075  // TODO: check SYCL CTS and spec. May also have to support BGRA
3076  if (image_format->image_channel_order !=
3079  "hip_piMemImageCreate only supports RGBA channel order");
3080  }
3081 
3082  // We have to use cuArray3DCreate, which has some caveats. The height and
3083  // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc
3084  // gives a minimum value of 1, so we need to convert the answer.
3085  HIP_ARRAY3D_DESCRIPTOR array_desc;
3086  array_desc.NumChannels = 4; // Only support 4 channel image
3087  array_desc.Flags = 0; // No flags required
3088  array_desc.Width = image_desc->image_width;
3089  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3090  array_desc.Height = 0;
3091  array_desc.Depth = 0;
3092  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3093  array_desc.Height = image_desc->image_height;
3094  array_desc.Depth = 0;
3095  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3096  array_desc.Height = image_desc->image_height;
3097  array_desc.Depth = image_desc->image_depth;
3098  }
3099 
3100  // We need to get this now in bytes for calculating the total image size later
3101  size_t pixel_type_size_bytes;
3102 
3103  switch (image_format->image_channel_data_type) {
3106  array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8;
3107  pixel_type_size_bytes = 1;
3108  break;
3110  array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8;
3111  pixel_type_size_bytes = 1;
3112  break;
3115  array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16;
3116  pixel_type_size_bytes = 2;
3117  break;
3119  array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16;
3120  pixel_type_size_bytes = 2;
3121  break;
3123  array_desc.Format = HIP_AD_FORMAT_HALF;
3124  pixel_type_size_bytes = 2;
3125  break;
3127  array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32;
3128  pixel_type_size_bytes = 4;
3129  break;
3131  array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32;
3132  pixel_type_size_bytes = 4;
3133  break;
3135  array_desc.Format = HIP_AD_FORMAT_FLOAT;
3136  pixel_type_size_bytes = 4;
3137  break;
3138  default:
3140  "hip_piMemImageCreate given unsupported image_channel_data_type");
3141  }
3142 
3143  // When a dimension isn't used image_desc has the size set to 1
3144  size_t pixel_size_bytes =
3145  pixel_type_size_bytes * 4; // 4 is the only number of channels we support
3146  size_t image_size_bytes = pixel_size_bytes * image_desc->image_width *
3147  image_desc->image_height * image_desc->image_depth;
3148 
3149  ScopedContext active(context);
3150  hipArray *image_array;
3151  retErr = PI_CHECK_ERROR(hipArray3DCreate(
3152  reinterpret_cast<hipCUarray *>(&image_array), &array_desc));
3153 
3154  try {
3155  if (performInitialCopy) {
3156  // We have to use a different copy function for each image dimensionality
3157  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3158  retErr = PI_CHECK_ERROR(
3159  hipMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes));
3160  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3161  hip_Memcpy2D cpy_desc;
3162  memset(&cpy_desc, 0, sizeof(cpy_desc));
3163  cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost;
3164  cpy_desc.srcHost = host_ptr;
3165  cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray;
3166  cpy_desc.dstArray = reinterpret_cast<hipCUarray>(image_array);
3167  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3168  cpy_desc.Height = image_desc->image_height;
3169  retErr = PI_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc));
3170  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3171  HIP_MEMCPY3D cpy_desc;
3172  memset(&cpy_desc, 0, sizeof(cpy_desc));
3173  cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost;
3174  cpy_desc.srcHost = host_ptr;
3175  cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray;
3176  cpy_desc.dstArray = reinterpret_cast<hipCUarray>(image_array);
3177  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3178  cpy_desc.Height = image_desc->image_height;
3179  cpy_desc.Depth = image_desc->image_depth;
3180  retErr = PI_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc));
3181  }
3182  }
3183 
3184  // HIP_RESOURCE_DESC is a union of different structs, shown here
3185  // We need to fill it as described here to use it for a surface or texture
3186  // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and
3187  // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array
3188  // handle.
3189  // HIP_RESOURCE_DESC::flags must be set to zero
3190 
3191  hipResourceDesc image_res_desc;
3192  image_res_desc.res.array.array = image_array;
3193  image_res_desc.resType = hipResourceTypeArray;
3194 
3195  hipSurfaceObject_t surface;
3196  retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc));
3197 
3198  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
3199  context, image_array, surface, image_desc->image_type, host_ptr});
3200 
3201  if (piMemObj == nullptr) {
3202  return PI_ERROR_OUT_OF_HOST_MEMORY;
3203  }
3204 
3205  *ret_mem = piMemObj.release();
3206  } catch (pi_result err) {
3207  PI_CHECK_ERROR(hipFreeArray(image_array));
3208  return err;
3209  } catch (...) {
3210  PI_CHECK_ERROR(hipFreeArray(image_array));
3211  return PI_ERROR_UNKNOWN;
3212  }
3213  return retErr;
3214 }
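
// The total number of bytes copied during the optional initial population
// follows directly from the descriptor set up above; an illustrative helper
// (not part of this file), relying on image_desc reporting unused dimensions
// as 1 and on the 4-channel (RGBA) restriction enforced above.
static size_t imageSizeBytes(size_t width, size_t height, size_t depth,
                             size_t channelTypeBytes) {
  const size_t pixelBytes = channelTypeBytes * 4; // 4 channels only
  return pixelBytes * width * height * depth;     // unused dims are 1
}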
3215 
3218  size_t param_value_size, void *param_value,
3219  size_t *param_value_size_ret) {
3220  (void)image;
3221  (void)param_name;
3222  (void)param_value_size;
3223  (void)param_value;
3224  (void)param_value_size_ret;
3225 
3226  sycl::detail::pi::die("hip_piMemImageGetInfo not implemented");
3227  return {};
3228 }
3229 
3230  pi_result hip_piMemRetain(pi_mem mem) {
3231  assert(mem != nullptr);
3232  assert(mem->get_reference_count() > 0);
3233  mem->increment_reference_count();
3234  return PI_SUCCESS;
3235 }
3236 
3241  const char **strings,
3242  const size_t *lengths,
3243  pi_program *program) {
3244  (void)context;
3245  (void)count;
3246  (void)strings;
3247  (void)lengths;
3248  (void)program;
3249 
3250  sycl::detail::pi::hipPrint("hip_piclProgramCreateWithSource not implemented");
3251  return PI_ERROR_INVALID_OPERATION;
3252 }
3253 
3259  const pi_device *device_list, const char *options,
3260  void (*pfn_notify)(pi_program program,
3261  void *user_data),
3262  void *user_data) {
3263 
3264  assert(program != nullptr);
3265  assert(num_devices == 1 || num_devices == 0);
3266  assert(device_list != nullptr || num_devices == 0);
3267  assert(pfn_notify == nullptr);
3268  assert(user_data == nullptr);
3269  pi_result retError = PI_SUCCESS;
3270 
3271  try {
3272  ScopedContext active(program->get_context());
3273 
3274  program->build_program(options);
3275 
3276  } catch (pi_result err) {
3277  retError = err;
3278  }
3279  return retError;
3280 }
3281 
3283 pi_result hip_piProgramCreate(pi_context context, const void *il, size_t length,
3284  pi_program *res_program) {
3285  (void)context;
3286  (void)il;
3287  (void)length;
3288  (void)res_program;
3289 
3290  sycl::detail::pi::die("hip_piProgramCreate not implemented");
3291  return {};
3292 }
3293 
3301  pi_context context, pi_uint32 num_devices, const pi_device *device_list,
3302  const size_t *lengths, const unsigned char **binaries,
3303  size_t num_metadata_entries, const pi_device_binary_property *metadata,
3304  pi_int32 *binary_status, pi_program *program) {
3305  (void)num_metadata_entries;
3306  (void)metadata;
3307  (void)binary_status;
3308 
3309  assert(context != nullptr);
3310  assert(binaries != nullptr);
3311  assert(program != nullptr);
3312  assert(device_list != nullptr);
3313  assert(num_devices == 1 && "HIP contexts are for a single device");
3314  assert((context->get_device()->get() == device_list[0]->get()) &&
3315  "Mismatch between devices context and passed context when creating "
3316  "program from binary");
3317 
3318  pi_result retError = PI_SUCCESS;
3319 
3320  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3321 
3322  // TODO: Set metadata here and use reqd_work_group_size information.
3323  // See cuda_piProgramCreateWithBinary
3324 
3325  const bool has_length = (lengths != nullptr);
3326  size_t length = has_length
3327  ? lengths[0]
3328  : strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
3329 
3330  assert(length != 0);
3331 
3332  retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
3333 
3334  *program = retProgram.release();
3335 
3336  return retError;
3337 }
3338 
3340  size_t param_value_size, void *param_value,
3341  size_t *param_value_size_ret) {
3342  assert(program != nullptr);
3343 
3344  switch (param_name) {
3346  return getInfo(param_value_size, param_value, param_value_size_ret,
3347  program->get_reference_count());
3349  return getInfo(param_value_size, param_value, param_value_size_ret,
3350  program->context_);
3352  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3354  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3355  &program->context_->deviceId_);
3357  return getInfo(param_value_size, param_value, param_value_size_ret,
3358  program->binary_);
3360  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3361  &program->binarySizeInBytes_);
3363  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3364  &program->binary_);
3366  return getInfo(param_value_size, param_value, param_value_size_ret,
3367  getKernelNames(program).c_str());
3368  }
3369  default:
3371  }
3372  sycl::detail::pi::die("Program info request not implemented");
3373  return {};
3374 }
3375 
3377  const pi_device *device_list, const char *options,
3378  pi_uint32 num_input_programs,
3379  const pi_program *input_programs,
3380  void (*pfn_notify)(pi_program program,
3381  void *user_data),
3382  void *user_data, pi_program *ret_program) {
3383  (void)context;
3384  (void)num_devices;
3385  (void)device_list;
3386  (void)options;
3387  (void)num_input_programs;
3388  (void)input_programs;
3389  (void)pfn_notify;
3390  (void)user_data;
3391  (void)ret_program;
3393  "hip_piProgramLink: linking not supported with hip backend");
3394  return {};
3395 }
3396 
3402  pi_program program, pi_uint32 num_devices, const pi_device *device_list,
3403  const char *options, pi_uint32 num_input_headers,
3404  const pi_program *input_headers, const char **header_include_names,
3405  void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
3406  (void)input_headers;
3407  (void)header_include_names;
3408 
3409  assert(program != nullptr);
3410  assert(num_devices == 1 || num_devices == 0);
3411  assert(device_list != nullptr || num_devices == 0);
3412  assert(pfn_notify == nullptr);
3413  assert(user_data == nullptr);
3414  assert(num_input_headers == 0);
3415  pi_result retError = PI_SUCCESS;
3416 
3417  try {
3418  ScopedContext active(program->get_context());
3419 
3420  program->build_program(options);
3421 
3422  } catch (pi_result err) {
3423  retError = err;
3424  }
3425  return retError;
3426 }
3427 
3429  pi_program_build_info param_name,
3430  size_t param_value_size, void *param_value,
3431  size_t *param_value_size_ret) {
3432  (void)device;
3433 
3434  assert(program != nullptr);
3435 
3436  switch (param_name) {
3438  return getInfo(param_value_size, param_value, param_value_size_ret,
3439  program->buildStatus_);
3440  }
3442  return getInfo(param_value_size, param_value, param_value_size_ret,
3443  program->buildOptions_.c_str());
3445  return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value,
3446  param_value_size_ret, program->infoLog_);
3447  default:
3449  }
3450  sycl::detail::pi::die("Program Build info request not implemented");
3451  return {};
3452 }
3453 
3454  pi_result hip_piProgramRetain(pi_program program) {
3455  assert(program != nullptr);
3456  assert(program->get_reference_count() > 0);
3457  program->increment_reference_count();
3458  return PI_SUCCESS;
3459 }
3460 
3465  assert(program != nullptr);
3466 
3467  // double delete or someone is messing with the ref count.
3468  // either way, cannot safely proceed.
3469  assert(program->get_reference_count() != 0 &&
3470  "Reference count overflow detected in hip_piProgramRelease.");
3471 
3472  // decrement ref count. If it is 0, delete the program.
3473  if (program->decrement_reference_count() == 0) {
3474 
3475  std::unique_ptr<_pi_program> program_ptr{program};
3476 
3477  pi_result result = PI_ERROR_INVALID_PROGRAM;
3478 
3479  try {
3480  ScopedContext active(program->get_context());
3481  auto hipModule = program->get();
3482  result = PI_CHECK_ERROR(hipModuleUnload(hipModule));
3483  } catch (...) {
3484  result = PI_ERROR_OUT_OF_RESOURCES;
3485  }
3486 
3487  return result;
3488  }
3489 
3490  return PI_SUCCESS;
3491 }
3492 
3500  pi_native_handle *nativeHandle) {
3501  *nativeHandle = reinterpret_cast<pi_native_handle>(program->get());
3502  return PI_SUCCESS;
3503 }
3504 
3517  pi_context context,
3518  bool ownNativeHandle,
3519  pi_program *program) {
3520  (void)nativeHandle;
3521  (void)context;
3522  (void)ownNativeHandle;
3523  (void)program;
3524 
3526  "Creation of PI program from native handle not implemented");
3527  return {};
3528 }
3529 
3531  size_t param_value_size, void *param_value,
3532  size_t *param_value_size_ret) {
3533 
3534  if (kernel != nullptr) {
3535 
3536  switch (param_name) {
3538  return getInfo(param_value_size, param_value, param_value_size_ret,
3539  kernel->get_name());
3541  return getInfo(param_value_size, param_value, param_value_size_ret,
3542  kernel->get_num_args());
3544  return getInfo(param_value_size, param_value, param_value_size_ret,
3545  kernel->get_reference_count());
3546  case PI_KERNEL_INFO_CONTEXT: {
3547  return getInfo(param_value_size, param_value, param_value_size_ret,
3548  kernel->get_context());
3549  }
3550  case PI_KERNEL_INFO_PROGRAM: {
3551  return getInfo(param_value_size, param_value, param_value_size_ret,
3552  kernel->get_program());
3553  }
3555  return getInfo(param_value_size, param_value, param_value_size_ret, "");
3556  }
3557  default: {
3559  }
3560  }
3561  }
3562 
3563  return PI_ERROR_INVALID_KERNEL;
3564 }
3565 
3567  pi_kernel_group_info param_name,
3568  size_t param_value_size, void *param_value,
3569  size_t *param_value_size_ret) {
3570 
3571  // here we want to query about a kernel's hip blocks!
3572 
3573  if (kernel != nullptr) {
3574 
3575  switch (param_name) {
3576  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
3577  int max_threads = 0;
3578  sycl::detail::pi::assertion(
3579  hipFuncGetAttribute(&max_threads,
3580  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3581  kernel->get()) == hipSuccess);
3582  return getInfo(param_value_size, param_value, param_value_size_ret,
3583  size_t(max_threads));
3584  }
3585  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
3586  // Returns the work-group size specified in the kernel source or IL.
3587  // If the work-group size is not specified in the kernel source or IL,
3588  // (0, 0, 0) is returned.
3589  // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
3590 
3591  // TODO: can we extract the work group size from the PTX?
3592  size_t group_size[3] = {0, 0, 0};
3593  return getInfoArray(3, param_value_size, param_value,
3594  param_value_size_ret, group_size);
3595  }
3596  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
3597  // OpenCL LOCAL == HIP SHARED
3598  int bytes = 0;
3599  sycl::detail::pi::assertion(
3600  hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
3601  kernel->get()) == hipSuccess);
3602  return getInfo(param_value_size, param_value, param_value_size_ret,
3603  pi_uint64(bytes));
3604  }
3605  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
3606  // Work groups should be multiples of the warp size
3607  int warpSize = 0;
3608  sycl::detail::pi::assertion(
3609  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
3610  device->get()) == hipSuccess);
3611  return getInfo(param_value_size, param_value, param_value_size_ret,
3612  static_cast<size_t>(warpSize));
3613  }
3614  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
3615  // OpenCL PRIVATE == HIP LOCAL
3616  int bytes = 0;
3617  sycl::detail::pi::assertion(
3618  hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
3619  kernel->get()) == hipSuccess);
3620  return getInfo(param_value_size, param_value, param_value_size_ret,
3621  pi_uint64(bytes));
3622  }
3623  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
3624  sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in "
3625  "piKernelGetGroupInfo not implemented\n");
3626  return {};
3627  }
3628 
3629  default:
3631  }
3632  }
3633 
3634  return PI_ERROR_INVALID_KERNEL;
3635 }
3636 
3638  pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
3639  size_t input_value_size, const void *input_value, size_t param_value_size,
3640  void *param_value, size_t *param_value_size_ret) {
3641  (void)input_value_size;
3642  (void)input_value;
3643 
3644  if (kernel != nullptr) {
3645  switch (param_name) {
3646  case PI_KERNEL_MAX_SUB_GROUP_SIZE: {
3647  // Sub-group size is equivalent to warp size
3648  int warpSize = 0;
3649  sycl::detail::pi::assertion(
3650  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
3651  device->get()) == hipSuccess);
3652  return getInfo(param_value_size, param_value, param_value_size_ret,
3653  static_cast<uint32_t>(warpSize));
3654  }
3655  case PI_KERNEL_MAX_NUM_SUB_GROUPS: {
3656  // Number of sub-groups = max block size / warp size + possible remainder
3657  int max_threads = 0;
3658  sycl::detail::pi::assertion(
3659  hipFuncGetAttribute(&max_threads,
3660  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3661  kernel->get()) == hipSuccess);
3662  int warpSize = 0;
3663  hip_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
3664  0, nullptr, sizeof(uint32_t), &warpSize,
3665  nullptr);
3666  int maxWarps = (max_threads + warpSize - 1) / warpSize;
3667  return getInfo(param_value_size, param_value, param_value_size_ret,
3668  static_cast<uint32_t>(maxWarps));
3669  }
3671  // Return value of 0 => not specified
3672  // TODO: Revisit if PTX is generated for compile-time work-group sizes
3673  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3674  }
3676  // Return value of 0 => unspecified or "auto" sub-group size
3677  // Correct for now, since warp size may be read from special register
3678  // TODO: Return warp size once default is primary sub-group size
3679  // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
3680  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3681  }
3682  default:
3684  }
3685  }
3686  return PI_ERROR_INVALID_KERNEL;
3687 }
3688 
3689  pi_result hip_piKernelRetain(pi_kernel kernel) {
3690  assert(kernel != nullptr);
3691  assert(kernel->get_reference_count() > 0u);
3692 
3693  kernel->increment_reference_count();
3694  return PI_SUCCESS;
3695 }
3696 
3697  pi_result hip_piKernelRelease(pi_kernel kernel) {
3698  assert(kernel != nullptr);
3699 
3700  // double delete or someone is messing with the ref count.
3701  // either way, cannot safely proceed.
3702  assert(kernel->get_reference_count() != 0 &&
3703  "Reference count overflow detected in hip_piKernelRelease.");
3704 
3705  // decrement ref count. If it is 0, delete the kernel.
3706  if (kernel->decrement_reference_count() == 0) {
3707  // no internal hip resources to clean up. Just delete it.
3708  delete kernel;
3709  return PI_SUCCESS;
3710  }
3711 
3712  return PI_SUCCESS;
3713 }
3714 
3715 // A NOP for the HIP backend
3717  pi_kernel_exec_info param_name,
3718  size_t param_value_size,
3719  const void *param_value) {
3720  (void)kernel;
3721  (void)param_name;
3722  (void)param_value_size;
3723  (void)param_value;
3724 
3725  return PI_SUCCESS;
3726 }
3727 
3729  size_t, const void *) {
3730  // This entry point is only used for native specialization constants (SPIR-V),
3731  // and the HIP plugin is AOT only so this entry point is not supported.
3732  sycl::detail::pi::die("Native specialization constants are not supported");
3733  return {};
3734 }
3735 
3737  size_t arg_size, const void *arg_value) {
3738  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3739  return PI_SUCCESS;
3740 }
3741 
3742 //
3743 // Events
3744 //
3745  pi_result hip_piEventCreate(pi_context context, pi_event *event) {
3746  (void)context;
3747  (void)event;
3748 
3749  sycl::detail::pi::die("PI Event Create not implemented in HIP backend");
3750 }
3751 
3752  pi_result hip_piEventGetInfo(pi_event event, pi_event_info param_name,
3753  size_t param_value_size, void *param_value,
3754  size_t *param_value_size_ret) {
3755  assert(event != nullptr);
3756 
3757  switch (param_name) {
3758  case PI_EVENT_INFO_COMMAND_QUEUE:
3759  return getInfo(param_value_size, param_value, param_value_size_ret,
3760  event->get_queue());
3761  case PI_EVENT_INFO_COMMAND_TYPE:
3762  return getInfo(param_value_size, param_value, param_value_size_ret,
3763  event->get_command_type());
3764  case PI_EVENT_INFO_REFERENCE_COUNT:
3765  return getInfo(param_value_size, param_value, param_value_size_ret,
3766  event->get_reference_count());
3767  case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
3768  return getInfo(param_value_size, param_value, param_value_size_ret,
3769  static_cast<pi_event_status>(event->get_execution_status()));
3770  }
3771  case PI_EVENT_INFO_CONTEXT:
3772  return getInfo(param_value_size, param_value, param_value_size_ret,
3773  event->get_context());
3774  default:
3776  }
3777 
3778  return PI_ERROR_INVALID_EVENT;
3779 }
3780 
3784  pi_profiling_info param_name,
3785  size_t param_value_size,
3786  void *param_value,
3787  size_t *param_value_size_ret) {
3788 
3789  assert(event != nullptr);
3790 
3791  pi_queue queue = event->get_queue();
3792  if (queue == nullptr ||
3794  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3795  }
3796 
3797  switch (param_name) {
3798  case PI_PROFILING_INFO_COMMAND_QUEUED:
3799  case PI_PROFILING_INFO_COMMAND_SUBMIT:
3800  // Note: No user for this case
3801  return getInfo<pi_uint64>(param_value_size, param_value,
3802  param_value_size_ret, event->get_queued_time());
3803  case PI_PROFILING_INFO_COMMAND_START:
3804  return getInfo<pi_uint64>(param_value_size, param_value,
3805  param_value_size_ret, event->get_start_time());
3806  case PI_PROFILING_INFO_COMMAND_END:
3807  return getInfo<pi_uint64>(param_value_size, param_value,
3808  param_value_size_ret, event->get_end_time());
3809  default:
3811  }
3812  sycl::detail::pi::die("Event Profiling info request not implemented");
3813  return {};
3814 }
3815 
3816  pi_result hip_piEventSetCallback(pi_event event,
3817  pi_int32 command_exec_callback_type,
3818  pfn_notify notify, void *user_data) {
3819  (void)event;
3820  (void)command_exec_callback_type;
3821  (void)notify;
3822  (void)user_data;
3823 
3824  sycl::detail::pi::die("Event Callback not implemented in HIP backend");
3825  return PI_SUCCESS;
3826 }
3827 
3828  pi_result hip_piEventSetStatus(pi_event event, pi_int32 execution_status) {
3829  (void)event;
3830  (void)execution_status;
3831 
3832  sycl::detail::pi::die("Event Set Status not implemented in HIP backend");
3833  return PI_ERROR_INVALID_VALUE;
3834 }
3835 
3836  pi_result hip_piEventRetain(pi_event event) {
3837  assert(event != nullptr);
3838 
3839  const auto refCount = event->increment_reference_count();
3840 
3841  sycl::detail::pi::assertion(
3842  refCount != 0, "Reference count overflow detected in hip_piEventRetain.");
3843 
3844  return PI_SUCCESS;
3845 }
3846 
3847  pi_result hip_piEventRelease(pi_event event) {
3848  assert(event != nullptr);
3849 
3850  // double delete or someone is messing with the ref count.
3851  // either way, cannot safely proceed.
3852  sycl::detail::pi::assertion(
3853  event->get_reference_count() != 0,
3854  "Reference count overflow detected in hip_piEventRelease.");
3855 
3856  // decrement ref count. If it is 0, delete the event.
3857  if (event->decrement_reference_count() == 0) {
3858  std::unique_ptr<_pi_event> event_ptr{event};
3859  pi_result result = PI_ERROR_INVALID_EVENT;
3860  try {
3861  ScopedContext active(event->get_context());
3862  result = event->release();
3863  } catch (...) {
3864  result = PI_ERROR_OUT_OF_RESOURCES;
3865  }
3866  return result;
3867  }
3868 
3869  return PI_SUCCESS;
3870 }
3871 
3879  pi_result hip_piEnqueueEventsWait(pi_queue command_queue,
3880  pi_uint32 num_events_in_wait_list,
3881  const pi_event *event_wait_list,
3882  pi_event *event) {
3883  return hip_piEnqueueEventsWaitWithBarrier(
3884  command_queue, num_events_in_wait_list, event_wait_list, event);
3885 }
3886 
3892  pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
3893  pi_uint32 num_events_in_wait_list,
3894  const pi_event *event_wait_list,
3895  pi_event *event) {
3896  if (!command_queue) {
3897  return PI_ERROR_INVALID_QUEUE;
3898  }
3899 
3900  pi_result result;
3901 
3902  try {
3903  ScopedContext active(command_queue->get_context());
3904  pi_uint32 stream_token;
3905  _pi_stream_guard guard;
3906  hipStream_t hipStream = command_queue->get_next_compute_stream(
3907  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3908  {
3909  std::lock_guard<std::mutex> guard(command_queue->barrier_mutex_);
3910  if (command_queue->barrier_event_ == nullptr) {
3911  PI_CHECK_ERROR(hipEventCreate(&command_queue->barrier_event_));
3912  }
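      // With an empty wait list the barrier must order against all outstanding
      // work on the queue: a temporary event is recorded on every other stream
      // and the chosen stream waits on it. Otherwise only the supplied events
      // need to be waited on (streams already synchronized are skipped below).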
3913  if (num_events_in_wait_list == 0) { // wait on all work
3914  if (command_queue->barrier_tmp_event_ == nullptr) {
3915  PI_CHECK_ERROR(hipEventCreate(&command_queue->barrier_tmp_event_));
3916  }
3917  command_queue->sync_streams(
3918  [hipStream,
3919  tmp_event = command_queue->barrier_tmp_event_](hipStream_t s) {
3920  if (hipStream != s) {
3921  PI_CHECK_ERROR(hipEventRecord(tmp_event, s));
3922  PI_CHECK_ERROR(hipStreamWaitEvent(hipStream, tmp_event, 0));
3923  }
3924  });
3925  } else { // wait just on given events
3926  forLatestEvents(event_wait_list, num_events_in_wait_list,
3927  [hipStream](pi_event event) -> pi_result {
3928  if (event->get_queue()->has_been_synchronized(
3929  event->get_compute_stream_token())) {
3930  return PI_SUCCESS;
3931  } else {
3932  return PI_CHECK_ERROR(
3933  hipStreamWaitEvent(hipStream, event->get(), 0));
3934  }
3935  });
3936  }
3937 
3938  result = PI_CHECK_ERROR(
3939  hipEventRecord(command_queue->barrier_event_, hipStream));
3940  for (unsigned int i = 0;
3941  i < command_queue->compute_applied_barrier_.size(); i++) {
3942  command_queue->compute_applied_barrier_[i] = false;
3943  }
3944  for (unsigned int i = 0;
3945  i < command_queue->transfer_applied_barrier_.size(); i++) {
3946  command_queue->transfer_applied_barrier_[i] = false;
3947  }
3948  }
3949  if (result != PI_SUCCESS) {
3950  return result;
3951  }
3952 
3953  if (event) {
3954  *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue,
3955  hipStream, stream_token);
3956  (*event)->start();
3957  (*event)->record();
3958  }
3959 
3960  return PI_SUCCESS;
3961  } catch (pi_result err) {
3962  return err;
3963  } catch (...) {
3964  return PI_ERROR_UNKNOWN;
3965  }
3966 }
3967 
3974 pi_result hip_piextEventGetNativeHandle(pi_event event,
3975  pi_native_handle *nativeHandle) {
3976  *nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
3977  return PI_SUCCESS;
3978 }
3979 
3989  pi_context context,
3990  bool ownNativeHandle,
3991  pi_event *event) {
3992  (void)nativeHandle;
3993  (void)context;
3994  (void)ownNativeHandle;
3995  (void)event;
3996 
3998  "Creation of PI event from native handle not implemented");
3999  return {};
4000 }
4001 
4012  const pi_sampler_properties *sampler_properties,
4013  pi_sampler *result_sampler) {
4014  std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)};
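  // props_ packs the sampler description into a bit field (decoded again in
  // hip_piSamplerGetInfo): bit 0 holds the normalized-coords flag, bit 1 the
  // filter mode relative to PI_SAMPLER_FILTER_MODE_NEAREST, and bits 2+ the
  // addressing mode relative to PI_SAMPLER_ADDRESSING_MODE_NONE.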
4015 
4016  bool propSeen[3] = {false, false, false};
4017  for (size_t i = 0; sampler_properties[i] != 0; i += 2) {
4018  switch (sampler_properties[i]) {
4020  if (propSeen[0]) {
4021  return PI_ERROR_INVALID_VALUE;
4022  }
4023  propSeen[0] = true;
4024  retImplSampl->props_ |= sampler_properties[i + 1];
4025  break;
4027  if (propSeen[1]) {
4028  return PI_ERROR_INVALID_VALUE;
4029  }
4030  propSeen[1] = true;
4031  retImplSampl->props_ |=
4032  (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
4033  break;
4035  if (propSeen[2]) {
4036  return PI_ERROR_INVALID_VALUE;
4037  }
4038  propSeen[2] = true;
4039  retImplSampl->props_ |=
4040  (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
4041  break;
4042  default:
4043  return PI_ERROR_INVALID_VALUE;
4044  }
4045  }
4046 
4047  if (!propSeen[0]) {
4048  retImplSampl->props_ |= PI_TRUE;
4049  }
4050  // Default the addressing mode if it was not explicitly specified
4051  if (!propSeen[2]) {
4052  retImplSampl->props_ |=
4054  << 2;
4055  }
4056 
4057  *result_sampler = retImplSampl.release();
4058  return PI_SUCCESS;
4059 }
4060 
4071  size_t param_value_size, void *param_value,
4072  size_t *param_value_size_ret) {
4073  assert(sampler != nullptr);
4074 
4075  switch (param_name) {
4077  return getInfo(param_value_size, param_value, param_value_size_ret,
4078  sampler->get_reference_count());
4080  return getInfo(param_value_size, param_value, param_value_size_ret,
4081  sampler->context_);
4083  pi_bool norm_coords_prop = static_cast<pi_bool>(sampler->props_ & 0x1);
4084  return getInfo(param_value_size, param_value, param_value_size_ret,
4085  norm_coords_prop);
4086  }
4088  pi_sampler_filter_mode filter_prop = static_cast<pi_sampler_filter_mode>(
4089  ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST);
4090  return getInfo(param_value_size, param_value, param_value_size_ret,
4091  filter_prop);
4092  }
4094  pi_sampler_addressing_mode addressing_prop =
4095  static_cast<pi_sampler_addressing_mode>(
4096  (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE);
4097  return getInfo(param_value_size, param_value, param_value_size_ret,
4098  addressing_prop);
4099  }
4100  default:
4102  }
4103  return {};
4104 }
4105 
4112  assert(sampler != nullptr);
4113  sampler->increment_reference_count();
4114  return PI_SUCCESS;
4115 }
4116 
4124  assert(sampler != nullptr);
4125 
4126  // double delete or someone is messing with the ref count.
4127  // either way, cannot safely proceed.
4129  sampler->get_reference_count() != 0,
4130  "Reference count overflow detected in hip_piSamplerRelease.");
4131 
4132  // decrement ref count. If it is 0, delete the sampler.
4133  if (sampler->decrement_reference_count() == 0) {
4134  delete sampler;
4135  }
4136 
4137  return PI_SUCCESS;
4138 }
4139 
4146  hipStream_t hip_stream, pi_buff_rect_region region, const void *src_ptr,
4147  const hipMemoryType src_type, pi_buff_rect_offset src_offset,
4148  size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
4149  const hipMemoryType dst_type, pi_buff_rect_offset dst_offset,
4150  size_t dst_row_pitch, size_t dst_slice_pitch) {
4151 
4152  assert(region != nullptr);
4153  assert(src_offset != nullptr);
4154  assert(dst_offset != nullptr);
4155 
4156  assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost);
4157  assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost);
4158 
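  // A pitch of zero means the rows/slices are tightly packed; default the
  // pitches to the corresponding region dimensions.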
4159  src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch;
4160  src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch)
4161  : src_slice_pitch;
4162  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch;
4163  dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch)
4164  : dst_slice_pitch;
4165 
4166  HIP_MEMCPY3D params;
4167 
4168  params.WidthInBytes = region->width_bytes;
4169  params.Height = region->height_scalar;
4170  params.Depth = region->depth_scalar;
4171 
4172  params.srcMemoryType = src_type;
4173  params.srcDevice = src_type == hipMemoryTypeDevice
4174  ? *static_cast<const hipDeviceptr_t *>(src_ptr)
4175  : 0;
4176  params.srcHost = src_type == hipMemoryTypeHost ? src_ptr : nullptr;
4177  params.srcXInBytes = src_offset->x_bytes;
4178  params.srcY = src_offset->y_scalar;
4179  params.srcZ = src_offset->z_scalar;
4180  params.srcPitch = src_row_pitch;
4181  params.srcHeight = src_slice_pitch / src_row_pitch;
4182 
4183  params.dstMemoryType = dst_type;
4184  params.dstDevice = dst_type == hipMemoryTypeDevice
4185  ? *reinterpret_cast<hipDeviceptr_t *>(dst_ptr)
4186  : 0;
4187  params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr;
4188  params.dstXInBytes = dst_offset->x_bytes;
4189  params.dstY = dst_offset->y_scalar;
4190  params.dstZ = dst_offset->z_scalar;
4191  params.dstPitch = dst_row_pitch;
4192  params.dstHeight = dst_slice_pitch / dst_row_pitch;
4193 
4194  return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&params, hip_stream));
4197 }
4198 
4200  pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
4201  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4202  pi_buff_rect_region region, size_t buffer_row_pitch,
4203  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4204  void *ptr, pi_uint32 num_events_in_wait_list,
4205  const pi_event *event_wait_list, pi_event *event) {
4206 
4207  assert(buffer != nullptr);
4208  assert(command_queue != nullptr);
4209 
4210  pi_result retErr = PI_SUCCESS;
4211  void *devPtr = buffer->mem_.buffer_mem_.get_void();
4212  std::unique_ptr<_pi_event> retImplEv{nullptr};
4213 
4214  try {
4215  ScopedContext active(command_queue->get_context());
4216  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4217 
4218  retErr = enqueueEventsWait(command_queue, hipStream,
4219  num_events_in_wait_list, event_wait_list);
4220 
4221  if (event) {
4222  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4223  PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, hipStream));
4224  retImplEv->start();
4225  }
4226 
4228  hipStream, region, &devPtr, hipMemoryTypeDevice, buffer_offset,
4229  buffer_row_pitch, buffer_slice_pitch, ptr, hipMemoryTypeHost,
4230  host_offset, host_row_pitch, host_slice_pitch);
4231 
4232  if (event) {
4233  retErr = retImplEv->record();
4234  }
4235 
4236  if (blocking_read) {
4237  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4238  }
4239 
4240  if (event) {
4241  *event = retImplEv.release();
4242  }
4243 
4244  } catch (pi_result err) {
4245  retErr = err;
4246  }
4247  return retErr;
4248 }
4249 
4251  pi_queue command_queue, pi_mem buffer, pi_bool blocking_write,
4252  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4253  pi_buff_rect_region region, size_t buffer_row_pitch,
4254  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4255  const void *ptr, pi_uint32 num_events_in_wait_list,
4256  const pi_event *event_wait_list, pi_event *event) {
4257 
4258  assert(buffer != nullptr);
4259  assert(command_queue != nullptr);
4260 
4261  pi_result retErr = PI_SUCCESS;
4262  void *devPtr = buffer->mem_.buffer_mem_.get_void();
4263  std::unique_ptr<_pi_event> retImplEv{nullptr};
4264 
4265  try {
4266  ScopedContext active(command_queue->get_context());
4267  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4268  retErr = enqueueEventsWait(command_queue, hipStream,
4269  num_events_in_wait_list, event_wait_list);
4270 
4271  if (event) {
4272  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4273  PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, hipStream));
4274  retImplEv->start();
4275  }
4276 
4278  hipStream, region, ptr, hipMemoryTypeHost, host_offset, host_row_pitch,
4279  host_slice_pitch, &devPtr, hipMemoryTypeDevice, buffer_offset,
4280  buffer_row_pitch, buffer_slice_pitch);
4281 
4282  if (event) {
4283  retErr = retImplEv->record();
4284  }
4285 
4286  if (blocking_write) {
4287  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4288  }
4289 
4290  if (event) {
4291  *event = retImplEv.release();
4292  }
4293 
4294  } catch (pi_result err) {
4295  retErr = err;
4296  }
4297  return retErr;
4298 }
4299 
4301  pi_mem dst_buffer, size_t src_offset,
4302  size_t dst_offset, size_t size,
4303  pi_uint32 num_events_in_wait_list,
4304  const pi_event *event_wait_list,
4305  pi_event *event) {
4306  if (!command_queue) {
4307  return PI_ERROR_INVALID_QUEUE;
4308  }
4309 
4310  std::unique_ptr<_pi_event> retImplEv{nullptr};
4311 
4312  try {
4313  ScopedContext active(command_queue->get_context());
4314  pi_result result;
4315  auto stream = command_queue->get_next_transfer_stream();
4316 
4317  if (event_wait_list) {
4318  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4319  event_wait_list);
4320  }
4321 
4322  if (event) {
4323  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4324  PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream));
4325  result = retImplEv->start();
4326  }
4327 
4328  auto src = src_buffer->mem_.buffer_mem_.get_with_offset(src_offset);
4329  auto dst = dst_buffer->mem_.buffer_mem_.get_with_offset(dst_offset);
4330 
4331  result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream));
4332 
4333  if (event) {
4334  result = retImplEv->record();
4335  *event = retImplEv.release();
4336  }
4337 
4338  return result;
4339  } catch (pi_result err) {
4340  return err;
4341  } catch (...) {
4342  return PI_ERROR_UNKNOWN;
4343  }
4344 }
4345 
4347  pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer,
4348  pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
4349  pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
4350  size_t dst_row_pitch, size_t dst_slice_pitch,
4351  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
4352  pi_event *event) {
4353 
4354  assert(src_buffer != nullptr);
4355  assert(dst_buffer != nullptr);
4356  assert(command_queue != nullptr);
4357 
4358  pi_result retErr = PI_SUCCESS;
4359  void *srcPtr = src_buffer->mem_.buffer_mem_.get_void();
4360  void *dstPtr = dst_buffer->mem_.buffer_mem_.get_void();
4361  std::unique_ptr<_pi_event> retImplEv{nullptr};
4362 
4363  try {
4364  ScopedContext active(command_queue->get_context());
4365  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4366  retErr = enqueueEventsWait(command_queue, hipStream,
4367  num_events_in_wait_list, event_wait_list);
4368 
4369  if (event) {
4370  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4371  PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, hipStream));
4372  retImplEv->start();
4373  }
4374 
4376  hipStream, region, &srcPtr, hipMemoryTypeDevice, src_origin,
4377  src_row_pitch, src_slice_pitch, &dstPtr, hipMemoryTypeDevice,
4378  dst_origin, dst_row_pitch, dst_slice_pitch);
4379 
4380  if (event) {
4381  retImplEv->record();
4382  *event = retImplEv.release();
4383  }
4384 
4385  } catch (pi_result err) {
4386  retErr = err;
4387  }
4388  return retErr;
4389 }
4390 
4392  const void *pattern, size_t pattern_size,
4393  size_t offset, size_t size,
4394  pi_uint32 num_events_in_wait_list,
4395  const pi_event *event_wait_list,
4396  pi_event *event) {
4397  assert(command_queue != nullptr);
4398 
4399  auto args_are_multiples_of_pattern_size =
4400  (offset % pattern_size == 0) || (size % pattern_size == 0);
4401 
4402  auto pattern_is_valid = (pattern != nullptr);
4403 
4404  auto pattern_size_is_valid =
4405  ((pattern_size & (pattern_size - 1)) == 0) && // is power of two
4406  (pattern_size > 0) && (pattern_size <= 128); // falls within valid range
4407 
4408  assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4409  pattern_size_is_valid);
4410  (void)args_are_multiples_of_pattern_size;
4411  (void)pattern_is_valid;
4412  (void)pattern_size_is_valid;
4413 
4414  std::unique_ptr<_pi_event> retImplEv{nullptr};
4415 
4416  try {
4417  ScopedContext active(command_queue->get_context());
4418 
4419  auto stream = command_queue->get_next_transfer_stream();
4420  pi_result result;
4421  if (event_wait_list) {
4422  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4423  event_wait_list);
4424  }
4425 
4426  if (event) {
4427  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4428  PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream));
4429  result = retImplEv->start();
4430  }
4431 
4432  auto dstDevice = buffer->mem_.buffer_mem_.get_with_offset(offset);
4433  auto N = size / pattern_size;
4434 
4435  // pattern size in bytes
4436  switch (pattern_size) {
4437  case 1: {
4438  auto value = *static_cast<const uint8_t *>(pattern);
4439  result = PI_CHECK_ERROR(hipMemsetD8Async(dstDevice, value, N, stream));
4440  break;
4441  }
4442  case 2: {
4443  auto value = *static_cast<const uint16_t *>(pattern);
4444  result = PI_CHECK_ERROR(hipMemsetD16Async(dstDevice, value, N, stream));
4445  break;
4446  }
4447  case 4: {
4448  auto value = *static_cast<const uint32_t *>(pattern);
4449  result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream));
4450  break;
4451  }
4452 
4453  default: {
4454  // HIP has no memset functions that allow setting values more than 4
4455  // bytes. PI API lets you pass an arbitrary "pattern" to the buffer
4456  // fill, which can be more than 4 bytes. We must break up the pattern
4457  // into 1 byte values, and set the buffer using multiple strided calls.
4458  // The first 4 patterns are set using hipMemsetD32Async then all
4459  // subsequent 1 byte patterns are set using hipMemset2DAsync which is
4460  // called for each pattern.
4461 
4462  // Calculate the number of patterns, stride, number of times the pattern
4463  // needs to be applied, and the number of times the first 32 bit pattern
4464  // needs to be applied.
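      // For example (illustrative values only): with pattern_size = 8 and
      // size = 64, hipMemsetD32Async first writes 16 words of the leading
      // 4 pattern bytes across the whole buffer, then 4 strided
      // hipMemset2DAsync calls (steps 4..7) overwrite byte 'step' of each
      // of the 8 pattern repetitions.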
4465  auto number_of_steps = pattern_size / sizeof(uint8_t);
4466  auto pitch = number_of_steps * sizeof(uint8_t);
4467  auto height = size / number_of_steps;
4468  auto count_32 = size / sizeof(uint32_t);
4469 
4470  // Get 4-byte chunk of the pattern and call hipMemsetD32Async
4471  auto value = *(static_cast<const uint32_t *>(pattern));
4472  result =
4473  PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream));
4474  for (auto step = 4u; step < number_of_steps; ++step) {
4475  // take 1 byte of the pattern
4476  value = *(static_cast<const uint8_t *>(pattern) + step);
4477 
4478  // offset the pointer to the part of the buffer we want to write to
4479  auto offset_ptr = reinterpret_cast<void *>(
4480  reinterpret_cast<uint8_t *>(dstDevice) + (step * sizeof(uint8_t)));
4481 
4482  // set all of the pattern chunks
4483  result = PI_CHECK_ERROR(hipMemset2DAsync(
4484  offset_ptr, pitch, value, sizeof(uint8_t), height, stream));
4485  }
4486  break;
4487  }
4488  }
4489 
4490  if (event) {
4491  result = retImplEv->record();
4492  *event = retImplEv.release();
4493  }
4494 
4495  return result;
4496  } catch (pi_result err) {
4497  return err;
4498  } catch (...) {
4499  return PI_ERROR_UNKNOWN;
4500  }
4501 }
4502 
4503 static size_t imageElementByteSize(hipArray_Format array_format) {
4504  switch (array_format) {
4505  case HIP_AD_FORMAT_UNSIGNED_INT8:
4506  case HIP_AD_FORMAT_SIGNED_INT8:
4507  return 1;
4508  case HIP_AD_FORMAT_UNSIGNED_INT16:
4509  case HIP_AD_FORMAT_SIGNED_INT16:
4510  case HIP_AD_FORMAT_HALF:
4511  return 2;
4512  case HIP_AD_FORMAT_UNSIGNED_INT32:
4513  case HIP_AD_FORMAT_SIGNED_INT32:
4514  case HIP_AD_FORMAT_FLOAT:
4515  return 4;
4516  default:
4517  return 0;
4518  }
4519  sycl::detail::pi::die("Invalid image format.");
4520  return 0;
4521 }
4522 
4528 
4529 static pi_result commonEnqueueMemImageNDCopy(
4530  hipStream_t hip_stream, pi_mem_type img_type, const size_t *region,
4531  const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset,
4532  void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset) {
4533  assert(region != nullptr);
4534 
4535  assert(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost);
4536  assert(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost);
4537 
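  // Dispatch to the 2D or 3D driver copy depending on the image type; the
  // callers pass region[0] and the X offsets already scaled to bytes.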
4538  if (img_type == PI_MEM_TYPE_IMAGE2D) {
4539  hip_Memcpy2D cpyDesc;
4540  memset(&cpyDesc, 0, sizeof(cpyDesc));
4541  cpyDesc.srcMemoryType = src_type;
4542  if (src_type == hipMemoryTypeArray) {
4543  cpyDesc.srcArray =
4544  reinterpret_cast<hipCUarray>(const_cast<void *>(src_ptr));
4545  cpyDesc.srcXInBytes = src_offset[0];
4546  cpyDesc.srcY = src_offset[1];
4547  } else {
4548  cpyDesc.srcHost = src_ptr;
4549  }
4550  cpyDesc.dstMemoryType = dst_type;
4551  if (dst_type == hipMemoryTypeArray) {
4552  cpyDesc.dstArray =
4553  reinterpret_cast<hipCUarray>(const_cast<void *>(dst_ptr));
4554  cpyDesc.dstXInBytes = dst_offset[0];
4555  cpyDesc.dstY = dst_offset[1];
4556  } else {
4557  cpyDesc.dstHost = dst_ptr;
4558  }
4559  cpyDesc.WidthInBytes = region[0];
4560  cpyDesc.Height = region[1];
4561  return PI_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream));
4562  }
4563 
4564  if (img_type == PI_MEM_TYPE_IMAGE3D) {
4565 
4566  HIP_MEMCPY3D cpyDesc;
4567  memset(&cpyDesc, 0, sizeof(cpyDesc));
4568  cpyDesc.srcMemoryType = src_type;
4569  if (src_type == hipMemoryTypeArray) {
4570  cpyDesc.srcArray =
4571  reinterpret_cast<hipCUarray>(const_cast<void *>(src_ptr));
4572  cpyDesc.srcXInBytes = src_offset[0];
4573  cpyDesc.srcY = src_offset[1];
4574  cpyDesc.srcZ = src_offset[2];
4575  } else {
4576  cpyDesc.srcHost = src_ptr;
4577  }
4578  cpyDesc.dstMemoryType = dst_type;
4579  if (dst_type == hipMemoryTypeArray) {
4580  cpyDesc.dstArray = reinterpret_cast<hipCUarray>(dst_ptr);
4581  cpyDesc.dstXInBytes = dst_offset[0];
4582  cpyDesc.dstY = dst_offset[1];
4583  cpyDesc.dstZ = dst_offset[2];
4584  } else {
4585  cpyDesc.dstHost = dst_ptr;
4586  }
4587  cpyDesc.WidthInBytes = region[0];
4588  cpyDesc.Height = region[1];
4589  cpyDesc.Depth = region[2];
4590  return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream));
4592  }
4593 
4594  return PI_ERROR_INVALID_VALUE;
4595 }
4596 
4598  pi_bool blocking_read, const size_t *origin,
4599  const size_t *region, size_t row_pitch,
4600  size_t slice_pitch, void *ptr,
4601  pi_uint32 num_events_in_wait_list,
4602  const pi_event *event_wait_list,
4603  pi_event *event) {
4604  (void)row_pitch;
4605  (void)slice_pitch;
4606 
4607  assert(command_queue != nullptr);
4608  assert(image != nullptr);
4609  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4610 
4611  pi_result retErr = PI_SUCCESS;
4612 
4613  try {
4614  ScopedContext active(command_queue->get_context());
4615  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4616 
4617  if (event_wait_list) {
4618  retErr = enqueueEventsWait(command_queue, hipStream,
4619  num_events_in_wait_list, event_wait_list);
4620  }
4621 
4622  hipArray *array = image->mem_.surface_mem_.get_array();
4623 
4624  hipArray_Format Format;
4625  size_t NumChannels;
4626  getArrayDesc(array, Format, NumChannels);
4627 
4628  int elementByteSize = imageElementByteSize(Format);
4629 
4630  size_t byteOffsetX = origin[0] * elementByteSize * NumChannels;
4631  size_t bytesToCopy = elementByteSize * NumChannels * region[0];
4632 
4633  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4634 
4635  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4636  size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4637 
4638  retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion,
4639  array, hipMemoryTypeArray, srcOffset,
4640  ptr, hipMemoryTypeHost, nullptr);
4641 
4642  if (retErr != PI_SUCCESS) {
4643  return retErr;
4644  }
4645 
4646  if (event) {
4648  command_queue, hipStream);
4649  new_event->record();
4650  *event = new_event;
4651  }
4652 
4653  if (blocking_read) {
4654  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4655  }
4656  } catch (pi_result err) {
4657  return err;
4658  } catch (...) {
4659  return PI_ERROR_UNKNOWN;
4660  }
4661  return PI_SUCCESS;
4663 }
4664 
4666  pi_bool blocking_write,
4667  const size_t *origin, const size_t *region,
4668  size_t input_row_pitch,
4669  size_t input_slice_pitch, const void *ptr,
4670  pi_uint32 num_events_in_wait_list,
4671  const pi_event *event_wait_list,
4672  pi_event *event) {
4673  (void)blocking_write;
4674  (void)input_row_pitch;
4675  (void)input_slice_pitch;
4676  assert(command_queue != nullptr);
4677  assert(image != nullptr);
4678  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4679 
4680  pi_result retErr = PI_SUCCESS;
4681 
4682  try {
4683  ScopedContext active(command_queue->get_context());
4684  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4685 
4686  if (event_wait_list) {
4687  retErr = enqueueEventsWait(command_queue, hipStream,
4688  num_events_in_wait_list, event_wait_list);
4689  }
4690 
4691  hipArray *array = image->mem_.surface_mem_.get_array();
4692 
4693  hipArray_Format Format;
4694  size_t NumChannels;
4695  getArrayDesc(array, Format, NumChannels);
4696 
4697  int elementByteSize = imageElementByteSize(Format);
4698 
4699  size_t byteOffsetX = origin[0] * elementByteSize * NumChannels;
4700  size_t bytesToCopy = elementByteSize * NumChannels * region[0];
4701 
4702  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4703 
4704  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4705  size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4706 
4707  retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion,
4708  ptr, hipMemoryTypeHost, nullptr, array,
4709  hipMemoryTypeArray, dstOffset);
4710 
4711  if (retErr != PI_SUCCESS) {
4712  return retErr;
4713  }
4714 
4715  if (event) {
4717  command_queue, hipStream);
4718  new_event->record();
4719  *event = new_event;
4720  }
4721  } catch (pi_result err) {
4722  return err;
4723  } catch (...) {
4724  return PI_ERROR_UNKNOWN;
4725  }
4726 
4727  return PI_SUCCESS;
4730 }
4731 
4732 pi_result hip_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
4733  pi_mem dst_image, const size_t *src_origin,
4734  const size_t *dst_origin,
4735  const size_t *region,
4736  pi_uint32 num_events_in_wait_list,
4737  const pi_event *event_wait_list,
4738  pi_event *event) {
4739 
4740  assert(src_image->mem_type_ == _pi_mem::mem_type::surface);
4741  assert(dst_image->mem_type_ == _pi_mem::mem_type::surface);
4742  assert(src_image->mem_.surface_mem_.get_image_type() ==
4743  dst_image->mem_.surface_mem_.get_image_type());
4744 
4745  pi_result retErr = PI_SUCCESS;
4746 
4747  try {
4748  ScopedContext active(command_queue->get_context());
4749  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4750  if (event_wait_list) {
4751  retErr = enqueueEventsWait(command_queue, hipStream,
4752  num_events_in_wait_list, event_wait_list);
4753  }
4754 
4755  hipArray *srcArray = src_image->mem_.surface_mem_.get_array();
4756  hipArray_Format srcFormat;
4757  size_t srcNumChannels;
4758  getArrayDesc(srcArray, srcFormat, srcNumChannels);
4759 
4760  hipArray *dstArray = dst_image->mem_.surface_mem_.get_array();
4761  hipArray_Format dstFormat;
4762  size_t dstNumChannels;
4763  getArrayDesc(dstArray, dstFormat, dstNumChannels);
4764 
4765  assert(srcFormat == dstFormat);
4766  assert(srcNumChannels == dstNumChannels);
4767 
4768  int elementByteSize = imageElementByteSize(srcFormat);
4769 
4770  size_t dstByteOffsetX = dst_origin[0] * elementByteSize * srcNumChannels;
4771  size_t srcByteOffsetX = src_origin[0] * elementByteSize * dstNumChannels;
4772  size_t bytesToCopy = elementByteSize * srcNumChannels * region[0];
4773 
4774  pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
4775 
4776  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4777  size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4778  size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4779 
4780  retErr = commonEnqueueMemImageNDCopy(
4781  hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray,
4782  srcOffset, dstArray, hipMemoryTypeArray, dstOffset);
4783 
4784  if (retErr != PI_SUCCESS) {
4785  return retErr;
4786  }
4787 
4788  if (event) {
4790  command_queue, hipStream);
4791  new_event->record();
4792  *event = new_event;
4793  }
4794  } catch (pi_result err) {
4795  return err;
4796  } catch (...) {
4797  return PI_ERROR_UNKNOWN;
4798  }
4799 
4800  return PI_SUCCESS;
4802 }
4803 
4805 pi_result hip_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image,
4806  const void *fill_color,
4807  const size_t *origin, const size_t *region,
4808  pi_uint32 num_events_in_wait_list,
4809  const pi_event *event_wait_list,
4810  pi_event *event) {
4811  (void)command_queue;
4812  (void)image;
4813  (void)fill_color;
4814  (void)origin;
4815  (void)region;
4816  (void)num_events_in_wait_list;
4817  (void)event_wait_list;
4818  (void)event;
4819 
4820  sycl::detail::pi::die("hip_piEnqueueMemImageFill not implemented");
4821  return {};
4822 }
4823 
4829 pi_result hip_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
4830  pi_bool blocking_map,
4831  pi_map_flags map_flags, size_t offset,
4832  size_t size,
4833  pi_uint32 num_events_in_wait_list,
4834  const pi_event *event_wait_list,
4835  pi_event *event, void **ret_map) {
4836  assert(ret_map != nullptr);
4837  assert(command_queue != nullptr);
4838  assert(buffer != nullptr);
4839  assert(buffer->mem_type_ == _pi_mem::mem_type::buffer);
4840 
4841  pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4842  const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ ==
4844 
4845  // Currently no support for overlapping regions
4846  if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) {
4847  return ret_err;
4848  }
4849 
4850  // Allocate a host pointer to hold the mapped data
4851  auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags);
4852  *ret_map = buffer->mem_.buffer_mem_.get_map_ptr();
4853  if (hostPtr) {
4854  ret_err = PI_SUCCESS;
4855  }
4856 
4857  if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) {
4858  // Pinned host memory is already on host so it doesn't need to be read.
4859  ret_err = hip_piEnqueueMemBufferRead(
4860  command_queue, buffer, blocking_map, offset, size, hostPtr,
4861  num_events_in_wait_list, event_wait_list, event);
4862  } else {
4863  ScopedContext active(command_queue->get_context());
4864 
4865  if (is_pinned) {
4866  ret_err = hip_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4867  event_wait_list, nullptr);
4868  }
4869 
4870  if (event) {
4871  try {
4872  *event = _pi_event::make_native(
4873  PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue,
4874  command_queue->get_next_transfer_stream());
4875  (*event)->start();
4876  (*event)->record();
4877  } catch (pi_result error) {
4878  ret_err = error;
4879  }
4880  }
4881  }
4882 
4883  return ret_err;
4884 }
4885 
4891  void *mapped_ptr,
4892  pi_uint32 num_events_in_wait_list,
4893  const pi_event *event_wait_list,
4894  pi_event *event) {
4895  pi_result ret_err = PI_SUCCESS;
4896 
4897  assert(command_queue != nullptr);
4898  assert(mapped_ptr != nullptr);
4899  assert(memobj != nullptr);
4900  assert(memobj->mem_type_ == _pi_mem::mem_type::buffer);
4901  assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr);
4902  assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr);
4903 
4904  const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ ==
4906 
4907  if (!is_pinned &&
4908  ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) ||
4909  (memobj->mem_.buffer_mem_.get_map_flags() &
4911  // Pinned host memory is only on host so it doesn't need to be written to.
4912  ret_err = hip_piEnqueueMemBufferWrite(
4913  command_queue, memobj, true,
4914  memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
4915  memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
4916  num_events_in_wait_list, event_wait_list, event);
4917  } else {
4918  ScopedContext active(command_queue->get_context());
4919 
4920  if (is_pinned) {
4921  ret_err = hip_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4922  event_wait_list, nullptr);
4923  }
4924 
4925  if (event) {
4926  try {
4927  *event = _pi_event::make_native(
4928  PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue,
4929  command_queue->get_next_transfer_stream());
4930  (*event)->start();
4931  (*event)->record();
4932  } catch (pi_result error) {
4933  ret_err = error;
4934  }
4935  }
4936  }
4937 
4938  memobj->mem_.buffer_mem_.unmap(mapped_ptr);
4939  return ret_err;
4940 }
4941 
4944 pi_result hip_piextUSMHostAlloc(void **result_ptr, pi_context context,
4945  pi_usm_mem_properties *properties, size_t size,
4946  pi_uint32 alignment) {
4947  assert(result_ptr != nullptr);
4948  assert(context != nullptr);
4949  assert(properties == nullptr || *properties == 0);
4950  pi_result result = PI_SUCCESS;
4951  try {
4952  ScopedContext active(context);
4953  result = PI_CHECK_ERROR(hipHostMalloc(result_ptr, size));
4954  } catch (pi_result error) {
4955  result = error;
4956  }
4957 
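  // Note: the requested alignment is not forwarded to hipHostMalloc; the
  // assert below only checks that the returned allocation happens to satisfy it.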
4958  assert(alignment == 0 ||
4959  (result == PI_SUCCESS &&
4960  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4961  return result;
4962 }
4963 
4966 pi_result hip_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
4967  pi_device device,
4968  pi_usm_mem_properties *properties,
4969  size_t size, pi_uint32 alignment) {
4970  assert(result_ptr != nullptr);
4971  assert(context != nullptr);
4972  assert(device != nullptr);
4973  assert(properties == nullptr || *properties == 0);
4974  pi_result result = PI_SUCCESS;
4975  try {
4976  ScopedContext active(context);
4977  result = PI_CHECK_ERROR(hipMalloc(result_ptr, size));
4978  } catch (pi_result error) {
4979  result = error;
4980  }
4981 
4982  assert(alignment == 0 ||
4983  (result == PI_SUCCESS &&
4984  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4985  return result;
4986 }
4987 
4990 pi_result hip_piextUSMSharedAlloc(void **result_ptr, pi_context context,
4991  pi_device device,
4992  pi_usm_mem_properties *properties,
4993  size_t size, pi_uint32 alignment) {
4994  assert(result_ptr != nullptr);
4995  assert(context != nullptr);
4996  assert(device != nullptr);
4997  assert(properties == nullptr || *properties == 0);
4998  pi_result result = PI_SUCCESS;
4999  try {
5000  ScopedContext active(context);
5001  result =
5002  PI_CHECK_ERROR(hipMallocManaged(result_ptr, size, hipMemAttachGlobal));
5003  } catch (pi_result error) {
5004  result = error;
5005  }
5006 
5007  assert(alignment == 0 ||
5008  (result == PI_SUCCESS &&
5009  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
5010  return result;
5011 }
5012 
5016 
5017  assert(context != nullptr);
5018  pi_result result = PI_SUCCESS;
5019  try {
5020  ScopedContext active(context);
5021  unsigned int type;
5022  hipPointerAttribute_t hipPointerAttributeType;
5023  result =
5024  PI_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5025  type = hipPointerAttributeType.memoryType;
5026  assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost);
5027  if (type == hipMemoryTypeDevice) {
5028  result = PI_CHECK_ERROR(hipFree(ptr));
5029  }
5030  if (type == hipMemoryTypeHost) {
5031  result = PI_CHECK_ERROR(hipFreeHost(ptr));
5032  }
5033  } catch (pi_result error) {
5034  result = error;
5035  }
5036  return result;
5037 }
5038 
5039 pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
5040  size_t count,
5041  pi_uint32 num_events_in_waitlist,
5042  const pi_event *events_waitlist,
5043  pi_event *event) {
5044 
5045  assert(queue != nullptr);
5046  assert(ptr != nullptr);
5047  pi_result result = PI_SUCCESS;
5048  std::unique_ptr<_pi_event> event_ptr{nullptr};
5049 
5050  try {
5051  ScopedContext active(queue->get_context());
5052  pi_uint32 stream_token;
5053  _pi_stream_guard guard;
5054  hipStream_t hipStream = queue->get_next_compute_stream(
5055  num_events_in_waitlist, events_waitlist, guard, &stream_token);
5056  result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5057  events_waitlist);
5058  if (event) {
5059  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5060  PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, hipStream, stream_token));
5061  event_ptr->start();
5062  }
5063  result = PI_CHECK_ERROR(
5064  hipMemsetD8Async(reinterpret_cast<hipDeviceptr_t>(ptr),
5065  (unsigned char)value & 0xFF, count, hipStream));
5066  if (event) {
5067  result = event_ptr->record();
5068  *event = event_ptr.release();
5069  }
5070  } catch (pi_result err) {
5071  result = err;
5072  }
5073 
5074  return result;
5075 }
5076 
5078  void *dst_ptr, const void *src_ptr,
5079  size_t size,
5080  pi_uint32 num_events_in_waitlist,
5081  const pi_event *events_waitlist,
5082  pi_event *event) {
5083  assert(queue != nullptr);
5084  assert(dst_ptr != nullptr);
5085  assert(src_ptr != nullptr);
5086  pi_result result = PI_SUCCESS;
5087 
5088  std::unique_ptr<_pi_event> event_ptr{nullptr};
5089 
5090  try {
5091  ScopedContext active(queue->get_context());
5092  hipStream_t hipStream = queue->get_next_transfer_stream();
5093  result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5094  events_waitlist);
5095  if (event) {
5096  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5097  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, hipStream));
5098  event_ptr->start();
5099  }
5100  result = PI_CHECK_ERROR(
5101  hipMemcpyAsync(dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream));
5102  if (event) {
5103  result = event_ptr->record();
5104  }
5105  if (blocking) {
5106  result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
5107  }
5108  if (event) {
5109  *event = event_ptr.release();
5110  }
5111  } catch (pi_result err) {
5112  result = err;
5113  }
5114  return result;
5115 }
5116 
5118  size_t size, pi_usm_migration_flags flags,
5119  pi_uint32 num_events_in_waitlist,
5120  const pi_event *events_waitlist,
5121  pi_event *event) {
5122 
5123  // flags is currently unused so fail if set
5124  if (flags != 0)
5125  return PI_ERROR_INVALID_VALUE;
5126  assert(queue != nullptr);
5127  assert(ptr != nullptr);
5128  pi_result result = PI_SUCCESS;
5129  std::unique_ptr<_pi_event> event_ptr{nullptr};
5130 
5131  try {
5132  ScopedContext active(queue->get_context());
5133  hipStream_t hipStream = queue->get_next_transfer_stream();
5134  result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5135  events_waitlist);
5136  if (event) {
5137  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5138  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, hipStream));
5139  event_ptr->start();
5140  }
5141  result = PI_CHECK_ERROR(hipMemPrefetchAsync(
5142  ptr, size, queue->get_context()->get_device()->get(), hipStream));
5143  if (event) {
5144  result = event_ptr->record();
5145  *event = event_ptr.release();
5146  }
5147  } catch (pi_result err) {
5148  result = err;
5149  }
5150 
5151  return result;
5152 }
5153 
5155 pi_result hip_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
5156  size_t length, pi_mem_advice advice,
5157  pi_event *event) {
5158  (void)length;
5159  (void)advice;
5160 
5161  assert(queue != nullptr);
5162  assert(ptr != nullptr);
5163  // TODO implement a mapping to hipMemAdvise once the expected behaviour
5164  // of piextUSMEnqueueMemAdvise is detailed in the USM extension
5165  return hip_piEnqueueEventsWait(queue, 0, nullptr, event);
5168 }
5169 
5170 // TODO: Implement this. Remember to return true for
5171 // PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented.
5172 pi_result hip_piextUSMEnqueueFill2D(pi_queue, void *, size_t, size_t,
5173  const void *, size_t, size_t, pi_uint32,
5174  const pi_event *, pi_event *) {
5175  sycl::detail::pi::die("piextUSMEnqueueFill2D: not implemented");
5176  return {};
5177 }
5178 
5179 // TODO: Implement this. Remember to return true for
5180 // PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT when it is implemented.
5181 pi_result hip_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t,
5182  size_t, pi_uint32, const pi_event *,
5183  pi_event *) {
5184  sycl::detail::pi::die("hip_piextUSMEnqueueMemset2D: not implemented");
5185  return {};
5186 }
5187 
5202 pi_result hip_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking,
5203  void *dst_ptr, size_t dst_pitch,
5204  const void *src_ptr, size_t src_pitch,
5205  size_t width, size_t height,
5206  pi_uint32 num_events_in_wait_list,
5207  const pi_event *event_wait_list,
5208  pi_event *event) {
5209  assert(queue != nullptr);
5210 
5211  pi_result result = PI_SUCCESS;
5212 
5213  try {
5214  ScopedContext active(queue->get_context());
5215  hipStream_t hipStream = queue->get_next_transfer_stream();
5216  result = enqueueEventsWait(queue, hipStream, num_events_in_wait_list,
5217  event_wait_list);
5218  if (event) {
5220  queue, hipStream);
5221  (*event)->start();
5222  }
5223 
5224  result = PI_CHECK_ERROR(hipMemcpy2DAsync(dst_ptr, dst_pitch, src_ptr,
5225  src_pitch, width, height,
5226  hipMemcpyDefault, hipStream));
5227 
5228  if (event) {
5229  (*event)->record();
5230  }
5231  if (blocking) {
5232  result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
5233  }
5234  } catch (pi_result err) {
5235  result = err;
5236  }
5237 
5238  return result;
5239 }
5240 
5257 pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
5258  pi_mem_alloc_info param_name,
5259  size_t param_value_size,
5260  void *param_value,
5261  size_t *param_value_size_ret) {
5262 
5263  assert(context != nullptr);
5264  assert(ptr != nullptr);
5265  pi_result result = PI_SUCCESS;
5266  hipPointerAttribute_t hipPointerAttributeType;
5267 
5268  try {
5269  ScopedContext active(context);
5270  switch (param_name) {
5271  case PI_MEM_ALLOC_TYPE: {
5272  unsigned int value;
5273  // do not throw if hipPointerGetAttributes returns hipErrorInvalidValue
5274  hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, ptr);
5275  if (ret == hipErrorInvalidValue) {
5276  // pointer not known to the HIP subsystem
5277  return getInfo(param_value_size, param_value, param_value_size_ret,
5279  }
5280  result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5281  value = hipPointerAttributeType.isManaged;
5282  if (value) {
5283  // pointer to managed memory
5284  return getInfo(param_value_size, param_value, param_value_size_ret,
5286  }
5287  result = PI_CHECK_ERROR(
5288  hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5289  value = hipPointerAttributeType.memoryType;
5290  assert(value == hipMemoryTypeDevice or value == hipMemoryTypeHost);
5291  if (value == hipMemoryTypeDevice) {
5292  // pointer to device memory
5293  return getInfo(param_value_size, param_value, param_value_size_ret,
5295  }
5296  if (value == hipMemoryTypeHost) {
5297  // pointer to host memory
5298  return getInfo(param_value_size, param_value, param_value_size_ret,
5300  }
5301  // should never get here
5302  __builtin_unreachable();
5303  return getInfo(param_value_size, param_value, param_value_size_ret,
5305  }
5306  case PI_MEM_ALLOC_BASE_PTR: {
5307  return PI_ERROR_INVALID_VALUE;
5308  }
5309  case PI_MEM_ALLOC_SIZE: {
5310  return PI_ERROR_INVALID_VALUE;
5311  }
5312 
5313  case PI_MEM_ALLOC_DEVICE: {
5314  // get device index associated with this pointer
5315  result = PI_CHECK_ERROR(
5316  hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5317  int device_idx = hipPointerAttributeType.device;
5318 
5319  // currently each device is in its own platform, so find the platform at
5320  // the same index
5321  std::vector<pi_platform> platforms;
5322  platforms.resize(device_idx + 1);
5323  result = hip_piPlatformsGet(device_idx + 1, platforms.data(), nullptr);
5324 
5325  // get the device from the platform
5326  pi_device device = platforms[device_idx]->devices_[0].get();
5327  return getInfo(param_value_size, param_value, param_value_size_ret,
5328  device);
5329  }
5330  }
5331  } catch (pi_result error) {
5332  result = error;
5333  }
5334 
5335  return result;
5336 }
5337 
5338 pi_result hip_piextEnqueueDeviceGlobalVariableWrite(
5339  pi_queue queue, pi_program program, const char *name,
5340  pi_bool blocking_write, size_t count, size_t offset, const void *src,
5341  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
5342  pi_event *event) {
5343  (void)queue;
5344  (void)program;
5345  (void)name;
5346  (void)blocking_write;
5347  (void)count;
5348  (void)offset;
5349  (void)src;
5350  (void)num_events_in_wait_list;
5351  (void)event_wait_list;
5352  (void)event;
5353 
5355  "hip_piextEnqueueDeviceGlobalVariableWrite not implemented");
5356  return {};
5357 }
5358 
5359 pi_result hip_piextEnqueueDeviceGlobalVariableRead(
5360  pi_queue queue, pi_program program, const char *name, pi_bool blocking_read,
5361  size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list,
5362  const pi_event *event_wait_list, pi_event *event) {
5363  (void)queue;
5364  (void)program;
5365  (void)name;
5366  (void)blocking_read;
5367  (void)count;
5368  (void)offset;
5369  (void)dst;
5370  (void)num_events_in_wait_list;
5371  (void)event_wait_list;
5372  (void)event;
5373 
5375  "hip_piextEnqueueDeviceGlobalVariableRead not implemented");
5376  return {};
5377 }
5378 
5379 // This API is called by Sycl RT to notify the end of the plugin lifetime.
5380 // Windows: dynamically loaded plugins might have been unloaded already
5381 // when this is called. Sycl RT holds onto the PI plugin so it can be
5382 // called safely. But this is not transitive. If the PI plugin in turn
5383 // dynamically loaded a different DLL, that may have been unloaded.
5384 // TODO: add a global variable lifetime management code here (see
5385 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
5386 pi_result hip_piTearDown(void *PluginParameter) {
5387  (void)PluginParameter;
5388  return PI_SUCCESS;
5389 }
5390 
5391 pi_result hip_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
5392  uint64_t *HostTime) {
5393  if (!DeviceTime && !HostTime)
5394  return PI_SUCCESS;
5395 
5396  _pi_event::native_type event;
5397 
5398  ScopedContext active(Device->get_context());
5399 
5400  if (DeviceTime) {
5401  PI_CHECK_ERROR(hipEventCreateWithFlags(&event, hipEventDefault));
5402  PI_CHECK_ERROR(hipEventRecord(event));
5403  }
5404  if (HostTime) {
5405  using namespace std::chrono;
5406  *HostTime =
5407  duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
5408  .count();
5409  }
5410 
5411  if (DeviceTime) {
5412  PI_CHECK_ERROR(hipEventSynchronize(event));
5413 
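  // hipEventElapsedTime reports milliseconds (relative to evBase_); convert
  // to nanoseconds.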
5414  float elapsedTime = 0.0f;
5415  PI_CHECK_ERROR(
5416  hipEventElapsedTime(&elapsedTime, _pi_platform::evBase_, event));
5417  *DeviceTime = (uint64_t)(elapsedTime * (double)1e6);
5418  }
5419  return PI_SUCCESS;
5420 }
5421 
5423 
5425  // Check that the major version matches in PiVersion and SupportedVersion
5427 
5428  // PI interface supports higher version or the same version.
5429  size_t PluginVersionSize = sizeof(PluginInit->PluginVersion);
5430  if (strlen(SupportedVersion) >= PluginVersionSize)
5431  return PI_ERROR_INVALID_VALUE;
5432  strncpy(PluginInit->PluginVersion, SupportedVersion, PluginVersionSize);
5433 
5434  // Set whole function table to zero to make it easier to detect if
5435  // functions are not set up below.
5436  std::memset(&(PluginInit->PiFunctionTable), 0,
5437  sizeof(PluginInit->PiFunctionTable));
5438 
5439 // Forward calls to HIP RT.
5440 #define _PI_CL(pi_api, hip_api) \
5441  (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&hip_api);
5442 
5443  // Platform
5446  // Device
5457  // Context
5466  // Queue
5476  // Memory
5486  // Program
5500  // Kernel
5512  // Event
5523  // Sampler
5528  // Queue commands
5546  // USM
5559  // Device global variable
5564 
5567  _PI_CL(piPluginGetLastError, hip_piPluginGetLastError)
5570 
5571 #undef _PI_CL
5572 
5573  return PI_SUCCESS;
5574 }
5575 
5576 #ifdef _WIN32
5577 #define __SYCL_PLUGIN_DLL_NAME "pi_hip.dll"
5578 #include "../common_win_pi_trace/common_win_pi_trace.hpp"
5579 #undef __SYCL_PLUGIN_DLL_NAME
5580 #endif
5581 
5582 } // extern "C"
5583 
5584 hipEvent_t _pi_platform::evBase_{nullptr};
_pi_sampler::context_
pi_context context_
Definition: pi_cuda.hpp:990
PI_COMMAND_TYPE_USER
@ PI_COMMAND_TYPE_USER
Definition: pi.h:431
setErrorMessage
static void setErrorMessage(const char *message, pi_result error_code)
Definition: pi_esimd_emulator.cpp:157
PI_PROFILING_INFO_COMMAND_START
@ PI_PROFILING_INFO_COMMAND_START
Definition: pi.h:577
piEventGetProfilingInfo
pi_result piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1440
hip_piextUSMEnqueueMemset
pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Definition: pi_hip.cpp:5039
PI_DEVICE_INFO_HOST_UNIFIED_MEMORY
@ PI_DEVICE_INFO_HOST_UNIFIED_MEMORY
Definition: pi.h:259
PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT
#define PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT
Extension to denote native support of assert feature by an arbitrary device piDeviceGetInfo call shou...
Definition: pi.h:801
piKernelCreate
pi_result piKernelCreate(pi_program program, const char *kernel_name, pi_kernel *ret_kernel)
Definition: pi_esimd_emulator.cpp:1366
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR
Definition: pi.h:220
piextEnqueueDeviceGlobalVariableRead
pi_result piextEnqueueDeviceGlobalVariableRead(pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API reading data from a device global variable to host.
Definition: pi_esimd_emulator.cpp:2043
hip_piEnqueueMemImageFill
pi_result hip_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, const size_t *origin, const size_t *region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
\TODO Not implemented in HIP.
Definition: pi_hip.cpp:4805
PI_DEVICE_INFO_OPENCL_C_VERSION
@ PI_DEVICE_INFO_OPENCL_C_VERSION
Definition: pi.h:277
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:224
pi_buff_rect_region_struct::depth_scalar
size_t depth_scalar
Definition: pi.h:911
piEventRelease
pi_result piEventRelease(pi_event event)
Definition: pi_esimd_emulator.cpp:1488
PI_MAP_WRITE
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:597
PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY
@ PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY
Definition: pi.h:308
PI_USM_ACCESS
@ PI_USM_ACCESS
Definition: pi.h:1689
PI_DEVICE_INFO_PROFILE
@ PI_DEVICE_INFO_PROFILE
Definition: pi.h:275
piEnqueueKernelLaunch
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1813
piMemGetInfo
pi_result piMemGetInfo(pi_mem mem, pi_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1096
_pi_mem_type
_pi_mem_type
Definition: pi.h:455
hip_piEnqueueNativeKernel
pi_result hip_piEnqueueNativeKernel(pi_queue queue, void(*user_func)(void *), void *args, size_t cb_args, pi_uint32 num_mem_objects, const pi_mem *mem_list, const void **args_mem_loc, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
\TODO Not implemented
Definition: pi_hip.cpp:3041
PI_KERNEL_INFO_REFERENCE_COUNT
@ PI_KERNEL_INFO_REFERENCE_COUNT
Definition: pi.h:379
PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH
@ PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH
Definition: pi.h:242
PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR
Definition: pi.h:227
_pi_mem::mem_::surface_mem_::get_surface
CUsurfObject get_surface() const noexcept
Definition: pi_cuda.hpp:327
PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES
@ PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES
Definition: pi.h:354
hip_piEnqueueMemBufferMap
pi_result hip_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, pi_bool blocking_map, pi_map_flags map_flags, size_t offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event, void **ret_map)
Implements mapping on the host using a BufferRead operation.
Definition: pi_hip.cpp:4829
imageElementByteSize
static size_t imageElementByteSize(hipArray_Format array_format)
Definition: pi_hip.cpp:4503
ReleaseGuard::ReleaseGuard
ReleaseGuard(T Obj)
Obj can be nullptr.
Definition: pi_hip.cpp:788
PI_DEVICE_INFO_DOUBLE_FP_CONFIG
@ PI_DEVICE_INFO_DOUBLE_FP_CONFIG
Definition: pi.h:218
hip_piProgramCompile
pi_result hip_piProgramCompile(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_headers, const pi_program *input_headers, const char **header_include_names, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
Creates a new program that is the outcome of the compilation of the headers and the program.
Definition: pi_hip.cpp:3401
_pi_context_info
_pi_context_info
Definition: pi.h:347
PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH
Definition: pi.h:314
pi_buff_rect_region_struct::height_scalar
size_t height_scalar
Definition: pi.h:910
PI_MEM_TYPE_IMAGE1D
@ PI_MEM_TYPE_IMAGE1D
Definition: pi.h:460
_pi_context::kind::primary
@ primary
pi_buff_rect_offset_struct
Definition: pi.h:899
_pi_context::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:211
_pi_program::context_
_pi_context * context_
Definition: pi_cuda.hpp:766
piextUSMFree
pi_result piextUSMFree(pi_context context, void *ptr)
Indicates that the allocated USM memory is no longer needed on the runtime side.
Definition: pi_esimd_emulator.cpp:1943
piKernelSetArg
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_esimd_emulator.cpp:1370
_pi_event::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:672
hip_piEventSetStatus
pi_result hip_piEventSetStatus(pi_event event, pi_int32 execution_status)
Definition: pi_hip.cpp:3828
PI_KERNEL_INFO_ATTRIBUTES
@ PI_KERNEL_INFO_ATTRIBUTES
Definition: pi.h:382
hip_piextProgramCreateWithNativeHandle
pi_result hip_piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_program *program)
Created a PI program object from a HIP program handle.
Definition: pi_hip.cpp:3516
_pi_mem::mem_::buffer_mem_::hostPtr_
void * hostPtr_
Pointer associated with this device on the host.
Definition: pi_cuda.hpp:253
_pi_context::deviceId_
_pi_device * deviceId_
Definition: pi_cuda.hpp:182
PI_DEVICE_INFO_DRIVER_VERSION
@ PI_DEVICE_INFO_DRIVER_VERSION
Definition: pi.h:274
PI_DEVICE_INFO_GPU_EU_COUNT
@ PI_DEVICE_INFO_GPU_EU_COUNT
Definition: pi.h:299
pi_bool
pi_uint32 pi_bool
Definition: pi.h:131
_pi_queue::context_
_pi_context * context_
Definition: pi_cuda.hpp:410
PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT
@ PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT
Definition: pi.h:513
PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC
@ PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC
Definition: pi.h:280
_pi_event::get_compute_stream_token
pi_uint32 get_compute_stream_token() const noexcept
Definition: pi_cuda.hpp:648
pi_hip.hpp
_pi_image_format::image_channel_data_type
pi_image_channel_type image_channel_data_type
Definition: pi.h:977
hip_piQueueCreate
pi_result hip_piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue)
Creates a pi_queue object on the HIP backend.
Definition: pi_hip.cpp:2447
hip_piextUSMGetMemAllocInfo
pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, pi_mem_alloc_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query information about USM allocated pointers Valid Queries: PI_MEM_ALLOC_TYPE returns host/d...
Definition: pi_hip.cpp:5257
piPluginGetLastError
pi_result piPluginGetLastError(char **message)
API to get Plugin specific warning and error messages.
Definition: pi_esimd_emulator.cpp:165
ErrorMessage
thread_local char ErrorMessage[MaxMessageSize]
Definition: pi_esimd_emulator.cpp:154
PI_QUEUE_INFO_CONTEXT
@ PI_QUEUE_INFO_CONTEXT
Definition: pi.h:365
hip_piEventGetProfilingInfo
pi_result hip_piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Obtain profiling information from PI HIP events Timings from HIP are only elapsed time.
Definition: pi_hip.cpp:3783
ReleaseGuard::ReleaseGuard
ReleaseGuard(ReleaseGuard &&Other) noexcept
Definition: pi_hip.cpp:789
_pi_queue::barrier_tmp_event_
CUevent barrier_tmp_event_
Definition: pi_cuda.hpp:414
_pi_event::is_started
bool is_started() const noexcept
Definition: pi_cuda.hpp:656
_pi_sampler::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:995
_pi_queue::get
native_type get()
Definition: pi_cuda.hpp:472
hip_piContextRelease
pi_result hip_piContextRelease(pi_context ctxt)
Definition: pi_hip.cpp:2084
hip_piEventGetInfo
pi_result hip_piEventGetInfo(pi_event event, pi_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3752
hip_piextUSMEnqueueFill2D
pi_result hip_piextUSMEnqueueFill2D(pi_queue, void *, size_t, size_t, const void *, size_t, size_t, pi_uint32, const pi_event *, pi_event *)
Definition: pi_hip.cpp:5172
PI_DEVICE_INFO_NAME
@ PI_DEVICE_INFO_NAME
Definition: pi.h:272
hip_piContextRetain
pi_result hip_piContextRetain(pi_context context)
Definition: pi_hip.cpp:1027
hip_piextEventGetNativeHandle
pi_result hip_piextEventGetNativeHandle(pi_event event, pi_native_handle *nativeHandle)
Gets the native HIP handle of a PI event object.
Definition: pi_hip.cpp:3974
_pi_queue::get_next_compute_stream
native_type get_next_compute_stream(pi_uint32 *stream_token=nullptr)
Definition: pi_cuda.cpp:442
hip_piextUSMEnqueueMemAdvise
pi_result hip_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event)
USM: memadvise API to govern behavior of automatic migration mechanisms.
Definition: pi_hip.cpp:5155
piProgramLink
pi_result piProgramLink(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_programs, const pi_program *input_programs, void(*pfn_notify)(pi_program program, void *user_data), void *user_data, pi_program *ret_program)
Definition: pi_opencl.cpp:1230
PI_IMAGE_CHANNEL_TYPE_UNORM_INT8
@ PI_IMAGE_CHANNEL_TYPE_UNORM_INT8
Definition: pi.h:502
PI_PROFILING_INFO_COMMAND_SUBMIT
@ PI_PROFILING_INFO_COMMAND_SUBMIT
Definition: pi.h:576
PI_MEMORY_ORDER_ACQUIRE
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_ACQUIRE
Definition: pi.h:562
_pi_sampler::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:997
PI_PROGRAM_INFO_BINARY_SIZES
@ PI_PROGRAM_INFO_BINARY_SIZES
Definition: pi.h:341
PI_MEM_FLAGS_HOST_PTR_COPY
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_COPY
Definition: pi.h:591
_pi_image_format::image_channel_order
pi_image_channel_order image_channel_order
Definition: pi.h:976
PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT
@ PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT
Definition: pi.h:360
PI_MEM_ALLOC_SIZE
@ PI_MEM_ALLOC_SIZE
Definition: pi.h:1698
_pi_context::get
native_type get() const noexcept
Definition: pi_cuda.hpp:207
PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN
@ PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN
Definition: pi.h:284
piextProgramGetNativeHandle
pi_result piextProgramGetNativeHandle(pi_program program, pi_native_handle *nativeHandle)
Gets the native handle of a PI program object.
Definition: pi_esimd_emulator.cpp:1357
hip_piextKernelSetArgMemObj
pi_result hip_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value)
Definition: pi_hip.cpp:2831
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
_pi_image_desc::image_type
pi_mem_type image_type
Definition: pi.h:981
piProgramRetain
pi_result piProgramRetain(pi_program program)
Definition: pi_esimd_emulator.cpp:1353
piextUSMEnqueueMemcpy2D
pi_result piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, void *dst_ptr, size_t dst_pitch, const void *src_ptr, size_t src_pitch, size_t width, size_t height, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM 2D Memcpy API.
Definition: pi_esimd_emulator.cpp:1997
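An illustrative sketch of the pitched-copy call (queue, pointers, pitches and extents are assumptions; pitches are row strides in bytes):
// Sketch: blocking 2D USM copy of a width x height byte region.
pi_event copy_done = nullptr;
pi_result err = piextUSMEnqueueMemcpy2D(q, PI_TRUE, dst, dst_pitch,
                                        src, src_pitch, width, height,
                                        /*num_events_in_waitlist=*/0,
                                        /*events_waitlist=*/nullptr,
                                        &copy_done);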
_pi_plugin
Definition: pi.h:1992
_pi_program::get_context
pi_context get_context() const
Definition: pi_cuda.hpp:789
sycl::_V1::errc::event
@ event
PI_PROGRAM_INFO_SOURCE
@ PI_PROGRAM_INFO_SOURCE
Definition: pi.h:340
_pi_program::MAX_LOG_SIZE
constexpr static size_t MAX_LOG_SIZE
Definition: pi_cuda.hpp:773
PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE
Definition: pi.h:250
hip_piEnqueueMemImageCopy
pi_result hip_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, pi_mem dst_image, const size_t *src_origin, const size_t *dst_origin, const size_t *region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:4732
piDevicePartition
pi_result piDevicePartition(pi_device device, const pi_device_partition_property *properties, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices)
Definition: pi_esimd_emulator.cpp:822
PI_KERNEL_COMPILE_NUM_SUB_GROUPS
@ PI_KERNEL_COMPILE_NUM_SUB_GROUPS
Definition: pi.h:409
hip_piextGetDeviceFunctionPointer
pi_result hip_piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *func_name, pi_uint64 *func_pointer_ret)
Definition: pi_hip.cpp:1091
_pi_mem_advice
_pi_mem_advice
Definition: pi.h:465
PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
@ PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
Definition: pi.h:214
piEnqueueMemBufferCopy
pi_result piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1621
hip_piextEnqueueDeviceGlobalVariableRead
pi_result hip_piextEnqueueDeviceGlobalVariableRead(pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:5359
PI_DEVICE_INFO_MAX_COMPUTE_UNITS
@ PI_DEVICE_INFO_MAX_COMPUTE_UNITS
Definition: pi.h:212
_pi_stream_guard
std::unique_lock< std::mutex > _pi_stream_guard
Definition: pi_cuda.hpp:67
hip_piextUSMEnqueueMemset2D
pi_result hip_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t, size_t, pi_uint32, const pi_event *, pi_event *)
Definition: pi_hip.cpp:5181
ReleaseGuard
RAII object that calls the reference count release function on the held PI object on destruction.
Definition: pi_hip.cpp:753
PI_EVENT_INFO_CONTEXT
@ PI_EVENT_INFO_CONTEXT
Definition: pi.h:415
_pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr
@ alloc_host_ptr
_pi_plugin::PluginVersion
char PluginVersion[20]
Definition: pi.h:2002
PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU
@ PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU
Definition: pi.h:321
_pi_mem::mem_::buffer_mem_::get_void
void * get_void() const noexcept
Definition: pi_hip.hpp:257
_pi_result
_pi_result
Definition: pi.h:140
hip_piextMemCreateWithNativeHandle
pi_result hip_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_mem *mem)
Creates a PI mem object from a HIP mem handle.
Definition: pi_hip.cpp:2427
PI_PROFILING_INFO_COMMAND_QUEUED
@ PI_PROFILING_INFO_COMMAND_QUEUED
Definition: pi.h:575
sycl::_V1::detail::memcpy
void memcpy(void *Dst, const void *Src, size_t Size)
Definition: memcpy.hpp:16
piextQueueGetNativeHandle
pi_result piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle)
Gets the native handle of a PI queue object.
Definition: pi_esimd_emulator.cpp:1014
piTearDown
pi_result piTearDown(void *PluginParameter)
API to notify that the plugin should clean up its resources.
Definition: pi_esimd_emulator.cpp:2059
_pi_queue::transfer_applied_barrier_
std::vector< bool > transfer_applied_barrier_
Definition: pi_cuda.hpp:409
hip_piPlatformGetInfo
pi_result hip_piPlatformGetInfo(pi_platform platform, pi_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:916
ReleaseGuard::~ReleaseGuard
~ReleaseGuard()
Calls the related PI object release function if the object held is not nullptr or if dismiss has not ...
Definition: pi_hip.cpp:797
ErrorMessageCode
thread_local pi_result ErrorMessageCode
Definition: pi_esimd_emulator.cpp:153
pi_context_properties
intptr_t pi_context_properties
Definition: pi.h:546
piEnqueueMemUnmap
pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, void *mapped_ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1684
_pi_queue::for_each_stream
void for_each_stream(T &&f)
Definition: pi_cuda.hpp:526
PI_SAMPLER_INFO_FILTER_MODE
@ PI_SAMPLER_INFO_FILTER_MODE
Definition: pi.h:527
hip_piextContextGetNativeHandle
pi_result hip_piextContextGetNativeHandle(pi_context context, pi_native_handle *nativeHandle)
Gets the native HIP handle of a PI context object.
Definition: pi_hip.cpp:2130
PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
Definition: pi.h:387
PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8
Definition: pi.h:507
sycl::_V1::ext::oneapi::experimental::alignment
constexpr alignment_key::value_t< K > alignment
Definition: properties.hpp:349
_pi_queue::device_
_pi_device * device_
Definition: pi_cuda.hpp:411
piSamplerGetInfo
pi_result piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1522
_pi_event::get_queued_time
pi_uint64 get_queued_time() const
Definition: pi_cuda.cpp:605
piPlatformGetInfo
pi_result piPlatformGetInfo(pi_platform platform, pi_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:444
_pi_queue::default_num_transfer_streams
static constexpr int default_num_transfer_streams
Definition: pi_cuda.hpp:398
PI_DEVICE_INFO_GPU_SLICES
@ PI_DEVICE_INFO_GPU_SLICES
Definition: pi.h:301
piDeviceRetain
pi_result piDeviceRetain(pi_device device)
Definition: pi_esimd_emulator.cpp:572
hip_piQueueFinish
pi_result hip_piQueueFinish(pi_queue command_queue)
Definition: pi_hip.cpp:2572
PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE
@ PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE
Definition: pi.h:236
piProgramCompile
pi_result piProgramCompile(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_headers, const pi_program *input_headers, const char **header_include_names, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
_pi_device_type
_pi_device_type
Definition: pi.h:187
hip_piKernelGetGroupInfo
pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3566
piContextGetInfo
pi_result piContextGetInfo(pi_context context, pi_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:867
piQueueRelease
pi_result piQueueRelease(pi_queue command_queue)
Definition: pi_esimd_emulator.cpp:986
_pi_mem::mem_::buffer_mem_::get_map_offset
size_t get_map_offset(void *) const noexcept
Definition: pi_cuda.hpp:283
PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE
Definition: pi.h:311
_pi_mem::context_
pi_context context_
Definition: pi_cuda.hpp:230
PI_DEVICE_INFO_REFERENCE_COUNT
@ PI_DEVICE_INFO_REFERENCE_COUNT
Definition: pi.h:270
PI_MEMORY_SCOPE_WORK_ITEM
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_ITEM
Definition: pi.h:568
PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
@ PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
Definition: pi.h:292
_pi_mem::is_buffer
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:376
hip_piSamplerCreate
pi_result hip_piSamplerCreate(pi_context context, const pi_sampler_properties *sampler_properties, pi_sampler *result_sampler)
Creates a PI sampler object.
Definition: pi_hip.cpp:4011
hip_piEnqueueKernelLaunch
pi_result hip_piEnqueueKernelLaunch(pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:2883
PI_DEVICE_INFO_PLATFORM
@ PI_DEVICE_INFO_PLATFORM
Definition: pi.h:269
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
@ PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
Definition: pi.h:303
_pi_queue::get_next_transfer_stream
native_type get_next_transfer_stream()
Definition: pi_cuda.cpp:503
PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
@ PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
Definition: pi.h:205
PI_PROGRAM_BUILD_INFO_OPTIONS
@ PI_PROGRAM_BUILD_INFO_OPTIONS
Definition: pi.h:165
PI_PROGRAM_BUILD_STATUS_SUCCESS
@ PI_PROGRAM_BUILD_STATUS_SUCCESS
Definition: pi.h:173
PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
@ PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
Definition: pi.h:355
commonEnqueueMemBufferCopyRect
static pi_result commonEnqueueMemBufferCopyRect(hipStream_t hip_stream, pi_buff_rect_region region, const void *src_ptr, const hipMemoryType src_type, pi_buff_rect_offset src_offset, size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, const hipMemoryType dst_type, pi_buff_rect_offset dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch)
General 3D memory copy operation.
Definition: pi_hip.cpp:4145
hip_piEnqueueMemBufferFill
pi_result hip_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:4391
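A minimal sketch of the fill pattern (queue `q`, buffer `buf` and its size `buf_size` are assumptions; the size must be a multiple of the pattern size):
// Sketch: fill an entire buffer with a repeating 4-byte zero pattern.
const pi_uint32 pattern = 0u;
pi_event fill_done = nullptr;
pi_result err = hip_piEnqueueMemBufferFill(q, buf, &pattern, sizeof(pattern),
                                           /*offset=*/0, buf_size,
                                           /*num_events_in_wait_list=*/0,
                                           /*event_wait_list=*/nullptr,
                                           &fill_done);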
piextDeviceCreateWithNativeHandle
pi_result piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, pi_platform platform, pi_device *device)
Creates PI device object from a native handle.
Definition: pi_esimd_emulator.cpp:831
_pi_queue::sync_streams
void sync_streams(T &&f)
Definition: pi_cuda.hpp:547
PI_DEVICE_INFO_DEVICE_ID
@ PI_DEVICE_INFO_DEVICE_ID
Definition: pi.h:297
sycl
Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
hip_piextProgramGetNativeHandle
pi_result hip_piextProgramGetNativeHandle(pi_program program, pi_native_handle *nativeHandle)
Gets the native HIP handle of a PI program object.
Definition: pi_hip.cpp:3499
_pi_event::get_stream
CUstream get_stream() const noexcept
Definition: pi_cuda.hpp:646
hip_definitions.hpp
piKernelGetSubGroupInfo
pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query information from the sub-group from a kernel.
Definition: pi_esimd_emulator.cpp:1392
PI_DEVICE_INFO_ADDRESS_BITS
@ PI_DEVICE_INFO_ADDRESS_BITS
Definition: pi.h:235
_pi_mem::mem_::surface_mem_::get_array
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:325
_pi_image_desc::image_height
size_t image_height
Definition: pi.h:983
_pi_platform
A PI platform stores all known PI devices, in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:74
PI_CONTEXT_INFO_NUM_DEVICES
@ PI_CONTEXT_INFO_NUM_DEVICES
Definition: pi.h:350
PI_DEVICE_TYPE_DEFAULT
@ PI_DEVICE_TYPE_DEFAULT
The default device available in the PI plugin.
Definition: pi.h:188
hip_piMemImageGetInfo
pi_result hip_piMemImageGetInfo(pi_mem image, pi_image_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
\TODO Not implemented
Definition: pi_hip.cpp:3217
_pi_queue::can_reuse_stream
bool can_reuse_stream(pi_uint32 stream_token)
Definition: pi_cuda.hpp:482
PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
@ PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
Definition: pi.h:390
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE
Definition: pi.h:225
piextUSMEnqueueMemset2D
pi_result piextUSMEnqueueMemset2D(pi_queue queue, void *ptr, size_t pitch, int value, size_t width, size_t height, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM 2D Memset API.
Definition: pi_esimd_emulator.cpp:1992
PI_DEVICE_INFO_MAX_SAMPLERS
@ PI_DEVICE_INFO_MAX_SAMPLERS
Definition: pi.h:247
_pi_context::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:213
PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
@ PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
Definition: pi.h:389
piEnqueueMemImageFill
pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, const size_t *origin, const size_t *region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1801
PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS
@ PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS
Definition: pi.h:286
PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN
@ PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN
Definition: pi.h:249
piSamplerCreate
pi_result piSamplerCreate(pi_context context, const pi_sampler_properties *sampler_properties, pi_sampler *result_sampler)
Definition: pi_esimd_emulator.cpp:1517
hip_piEnqueueMemBufferWrite
pi_result hip_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:2648
hip_piEventSetCallback
pi_result hip_piEventSetCallback(pi_event event, pi_int32 command_exec_callback_type, pfn_notify notify, void *user_data)
Definition: pi_hip.cpp:3816
piextEventCreateWithNativeHandle
pi_result piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_event *event)
Creates PI event object from a native handle.
Definition: pi_esimd_emulator.cpp:1513
PI_DEVICE_INFO_BUILT_IN_KERNELS
@ PI_DEVICE_INFO_BUILT_IN_KERNELS
Definition: pi.h:268
_pi_sampler_addressing_mode
_pi_sampler_addressing_mode
Definition: pi.h:533
piKernelRelease
pi_result piKernelRelease(pi_kernel kernel)
Definition: pi_esimd_emulator.cpp:1400
PI_DEVICE_INFO_USM_DEVICE_SUPPORT
@ PI_DEVICE_INFO_USM_DEVICE_SUPPORT
Definition: pi.h:290
hip_piProgramCreateWithBinary
pi_result hip_piProgramCreateWithBinary(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, size_t num_metadata_entries, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *program)
Loads images from a list of PTX or HIPBIN binaries.
Definition: pi_hip.cpp:3300
PI_KERNEL_INFO_PROGRAM
@ PI_KERNEL_INFO_PROGRAM
Definition: pi.h:381
hip_piEnqueueMemUnmap
pi_result hip_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, void *mapped_ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Implements the unmap from the host, using a BufferWrite operation.
Definition: pi_hip.cpp:4890
hip_piProgramRetain
pi_result hip_piProgramRetain(pi_program program)
Definition: pi_hip.cpp:3454
PI_PLATFORM_INFO_NAME
@ PI_PLATFORM_INFO_NAME
Definition: pi.h:157
PI_FP_ROUND_TO_NEAREST
static constexpr pi_device_fp_config PI_FP_ROUND_TO_NEAREST
Definition: pi.h:705
pi.hpp
piextUSMSharedAlloc
pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates memory accessible on both host and device.
Definition: pi_esimd_emulator.cpp:1904
PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE
@ PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE
Definition: pi.h:246
_pi_device_info
_pi_device_info
Definition: pi.h:209
PI_MEM_TYPE_DEVICE
@ PI_MEM_TYPE_DEVICE
Definition: pi.h:1705
PI_USM_ATOMIC_ACCESS
@ PI_USM_ATOMIC_ACCESS
Definition: pi.h:1690
PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE
@ PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE
Definition: pi.h:254
_pi_image_info
_pi_image_info
Definition: pi.h:396
hip_piDevicePartition
pi_result hip_piDevicePartition(pi_device device, const pi_device_partition_property *properties, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices)
Not applicable to HIP; devices cannot be partitioned.
Definition: pi_hip.cpp:1043
getKernelNames
std::string getKernelNames(pi_program program)
Finds kernel names by searching for entry points in the PTX source, as the HIP driver API doesn't exp...
Definition: pi_hip.cpp:743
piEventSetStatus
pi_result piEventSetStatus(pi_event event, pi_int32 execution_status)
Definition: pi_esimd_emulator.cpp:1476
_pi_program::build_program
pi_result build_program(const char *build_options)
Definition: pi_cuda.cpp:754
piextUSMDeviceAlloc
pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates device memory.
Definition: pi_esimd_emulator.cpp:1899
_PI_HIP_PLUGIN_VERSION_STRING
#define _PI_HIP_PLUGIN_VERSION_STRING
Definition: pi_hip.hpp:25
sycl::_V1::ext::intel::host_ptr
multi_ptr< ElementType, access::address_space::ext_intel_global_host_space, IsDecorated > host_ptr
Definition: usm_pointers.hpp:32
hip_piextDeviceGetNativeHandle
pi_result hip_piextDeviceGetNativeHandle(pi_device device, pi_native_handle *nativeHandle)
Gets the native HIP handle of a PI device object.
Definition: pi_hip.cpp:1949
PI_SAMPLER_PROPERTIES_ADDRESSING_MODE
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE
Definition: pi.h:557
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT
Definition: pi.h:222
PI_SAMPLER_FILTER_MODE_NEAREST
@ PI_SAMPLER_FILTER_MODE_NEAREST
Definition: pi.h:542
hip_piextUSMFree
pi_result hip_piextUSMFree(pi_context context, void *ptr)
USM: Frees the given USM pointer associated with the context.
Definition: pi_hip.cpp:5015
hip_piextEventCreateWithNativeHandle
pi_result hip_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_event *event)
Creates a PI event object from a HIP event handle.
Definition: pi_hip.cpp:3988
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:816
PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
@ PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
Definition: pi.h:258
_pi_sampler::props_
pi_uint32 props_
Definition: pi_cuda.hpp:989
piextDeviceGetNativeHandle
pi_result piextDeviceGetNativeHandle(pi_device device, pi_native_handle *nativeHandle)
Gets the native handle of a PI device object.
Definition: pi_esimd_emulator.cpp:827
PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT
@ PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT
Definition: pi.h:241
hip_piEnqueueMemBufferWriteRect
pi_result hip_piEnqueueMemBufferWriteRect(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:4250
hip_piSamplerGetInfo
pi_result hip_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Gets information from a PI sampler object.
Definition: pi_hip.cpp:4070
pi_buff_rect_offset_struct::y_scalar
size_t y_scalar
Definition: pi.h:901
PI_EVENT_INFO_COMMAND_TYPE
@ PI_EVENT_INFO_COMMAND_TYPE
Definition: pi.h:416
piextContextCreateWithNativeHandle
pi_result piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, pi_uint32 numDevices, const pi_device *devices, bool pluginOwnsNativeHandle, pi_context *context)
Creates PI context object from a native handle.
Definition: pi_esimd_emulator.cpp:881
piProgramGetInfo
pi_result piProgramGetInfo(pi_program program, pi_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1325
hip_piQueueRetain
pi_result hip_piQueueRetain(pi_queue command_queue)
Definition: pi_hip.cpp:2539
_pi_mem::mem_::buffer_mem_::alloc_mode
alloc_mode
alloc_mode classic: Just a normal buffer allocated on the device via cuda malloc use_host_ptr: Use an...
Definition: pi_cuda.hpp:270
_pi_plugin::PiVersion
char PiVersion[20]
Definition: pi.h:2000
_pi_context::set_extended_deleter
void set_extended_deleter(pi_context_extended_deleter function, void *user_data)
Definition: pi_cuda.hpp:199
PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE
Definition: pi.h:252
PI_DEVICE_EXEC_CAPABILITIES_KERNEL
constexpr pi_device_exec_capabilities PI_DEVICE_EXEC_CAPABILITIES_KERNEL
Definition: pi.h:549
PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT
@ PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT
Definition: pi.h:361
PI_CONTEXT_INFO_DEVICES
@ PI_CONTEXT_INFO_DEVICES
Definition: pi.h:348
_pi_sampler_filter_mode
_pi_sampler_filter_mode
Definition: pi.h:541
PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
@ PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
Definition: pi.h:320
_pi_queue_info
_pi_queue_info
Definition: pi.h:364
pi_buff_rect_region_struct::width_bytes
size_t width_bytes
Definition: pi.h:909
hip_piextMemGetNativeHandle
pi_result hip_piextMemGetNativeHandle(pi_mem mem, pi_native_handle *nativeHandle)
Gets the native HIP handle of a PI mem object.
Definition: pi_hip.cpp:2390
_pi_buffer_create_type
_pi_buffer_create_type
Definition: pi.h:517
_pi_queue::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:617
sycl::_V1::ext::intel::experimental::esimd::line
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value, sycl::ext::intel::esimd::simd< T, SZ > > line(sycl::ext::intel::esimd::simd< T, 4 > src0, sycl::ext::intel::esimd::simd< T, SZ > src1, Sat sat={})
Linear equation.
Definition: math.hpp:933
hip_piextUSMSharedAlloc
pi_result hip_piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
USM: Implements USM Shared allocations using HIP Managed Memory.
Definition: pi_hip.cpp:4990
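A hedged allocate/free sketch (context `ctx` and device `dev` are assumed; the matching hip_piextUSMFree entry point is listed earlier on this page):
// Sketch: allocate 1 MiB of shared USM (HIP managed memory), then free it.
void *shared_ptr = nullptr;
pi_result err = hip_piextUSMSharedAlloc(&shared_ptr, ctx, dev,
                                        /*properties=*/nullptr,
                                        1024 * 1024, /*alignment=*/0);
if (err == PI_SUCCESS) {
  // ... use shared_ptr on host or device ...
  hip_piextUSMFree(ctx, shared_ptr);
}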
_pi_program::buildOptions_
std::string buildOptions_
Definition: pi_cuda.hpp:776
piextUSMEnqueueMemcpy
pi_result piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM Memcpy API.
Definition: pi_esimd_emulator.cpp:1976
piMemBufferPartition
pi_result piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, pi_buffer_create_type buffer_create_type, void *buffer_create_info, pi_mem *ret_mem)
Definition: pi_esimd_emulator.cpp:1807
_pi_mem::mem_type_
enum _pi_mem::mem_type mem_type_
hip_piEnqueueMemImageRead
pi_result hip_piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_bool blocking_read, const size_t *origin, const size_t *region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:4597
_pi_queue::barrier_mutex_
std::mutex barrier_mutex_
Definition: pi_cuda.hpp:430
_pi_queue::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:621
PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF
Definition: pi.h:233
piextUSMEnqueueMemAdvise
pi_result piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event)
USM Memadvise API.
Definition: pi_esimd_emulator.cpp:1981
piQueueCreate
pi_result piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue)
Definition: pi_esimd_emulator.cpp:946
hip_piProgramGetInfo
pi_result hip_piProgramGetInfo(pi_program program, pi_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3339
_pi_queue::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:619
PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE
@ PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE
Definition: pi.h:245
_pi_program::buildStatus_
pi_program_build_status buildStatus_
Definition: pi_cuda.hpp:777
PI_DEVICE_INFO_LOCAL_MEM_SIZE
@ PI_DEVICE_INFO_LOCAL_MEM_SIZE
Definition: pi.h:257
PI_MEMORY_ORDER_RELAXED
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_RELAXED
Definition: pi.h:561
piextProgramCreateWithNativeHandle
pi_result piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool pluginOwnsNativeHandle, pi_program *program)
Creates PI program object from a native handle.
Definition: pi_esimd_emulator.cpp:1361
piEventsWait
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
Definition: pi_esimd_emulator.cpp:1453
_pi_program::module_
native_type module_
Definition: pi_cuda.hpp:762
PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS
@ PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS
Definition: pi.h:324
_pi_platform::evBase_
static hipEvent_t evBase_
Definition: pi_hip.hpp:68
piContextRelease
pi_result piContextRelease(pi_context context)
Definition: pi_esimd_emulator.cpp:897
_pi_queue
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:395
commonEnqueueMemImageNDCopy
static pi_result commonEnqueueMemImageNDCopy(hipStream_t hip_stream, pi_mem_type img_type, const size_t *region, const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset, void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset)
General ND memory copy operation for images (where N > 1).
Definition: pi_hip.cpp:4529
std::get
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:199
piProgramRelease
pi_result piProgramRelease(pi_program program)
Definition: pi_esimd_emulator.cpp:1355
PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16
Definition: pi.h:508
hip_piPlatformsGet
pi_result hip_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms)
Obtains the HIP platform.
Definition: pi_hip.cpp:835
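A sketch of the usual count-then-fetch discovery pattern (assumed to follow the same convention as piPlatformsGet; the HIP plugin exposes a single platform):
// Sketch: query the number of platforms, then retrieve the handle(s).
pi_uint32 num_platforms = 0;
hip_piPlatformsGet(0, nullptr, &num_platforms);
pi_platform platform = nullptr;
if (num_platforms > 0)
  hip_piPlatformsGet(1, &platform, nullptr);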
PI_