DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_hip.cpp
1 //==---------- pi_hip.cpp - HIP Plugin ------------------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
13 
14 #include <pi_hip.hpp>
15 #include <sycl/detail/defines.hpp>
17 #include <sycl/detail/pi.hpp>
18 
19 #include <algorithm>
20 #include <cassert>
21 #include <hip/hip_runtime.h>
22 #include <limits>
23 #include <memory>
24 #include <mutex>
25 #include <regex>
26 #include <string.h>
27 
28 namespace {
29 // Hipify doesn't support cuArrayGetDescriptor. On AMD the hipArray can just be
30 // indexed, but on NVIDIA it is an opaque type and needs to go through
31 // cuArrayGetDescriptor, so implement a utility function to get the array
32 // properties.
33 inline void getArrayDesc(hipArray *array, hipArray_Format &format,
34  size_t &channels) {
35 #if defined(__HIP_PLATFORM_AMD__)
36  format = array->Format;
37  channels = array->NumChannels;
38 #elif defined(__HIP_PLATFORM_NVIDIA__)
39  CUDA_ARRAY_DESCRIPTOR arrayDesc;
40  cuArrayGetDescriptor(&arrayDesc, (CUarray)array);
41 
42  format = arrayDesc.Format;
43  channels = arrayDesc.NumChannels;
44 #else
45 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
46 #endif
47 }
48 
49 // NVidia HIP headers guard hipArray3DCreate behind __CUDACC__. This does not
50 // seem to be required, and we're not using nvcc to build the HIP PI plugin, so
51 // add the translation function here.
52 #if defined(__HIP_PLATFORM_NVIDIA__) && !defined(__CUDACC__)
53 inline static hipError_t
54 hipArray3DCreate(hiparray *pHandle,
55  const HIP_ARRAY3D_DESCRIPTOR *pAllocateArray) {
56  return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
57 }
58 #endif
59 
60 // hipArray gets turned into cudaArray when using the HIP NVIDIA platform, and
61 // some CUDA APIs use cudaArray* while others use CUarray. These two represent the
62 // same type; however, when building, cudaArray appears as an opaque type, so it
63 // needs to be explicitly cast to CUarray. In order for this to work for both
64 // AMD and NVIDIA we introduce a second hipArray type that will be CUarray for
65 // NVIDIA and hipArray* for AMD, so that we can place the explicit casts where
66 // necessary for NVIDIA and they will be no-ops for AMD.
67 #if defined(__HIP_PLATFORM_NVIDIA__)
68 typedef CUarray hipCUarray;
69 #elif defined(__HIP_PLATFORM_AMD__)
70 typedef hipArray *hipCUarray;
71 #else
72 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
73 #endif
74 
75 // Add missing HIP to CUDA defines
76 #if defined(__HIP_PLATFORM_NVIDIA__)
77 #define hipMemoryType CUmemorytype
78 #define hipMemoryTypeHost CU_MEMORYTYPE_HOST
79 #define hipMemoryTypeDevice CU_MEMORYTYPE_DEVICE
80 #define hipMemoryTypeArray CU_MEMORYTYPE_ARRAY
81 #define hipMemoryTypeUnified CU_MEMORYTYPE_UNIFIED
82 #endif
83 
84 std::string getHipVersionString() {
85  int driver_version = 0;
86  if (hipDriverGetVersion(&driver_version) != hipSuccess) {
87  return "";
88  }
89  // The version is returned as (1000 major + 10 minor).
90  std::stringstream stream;
91  stream << "HIP " << driver_version / 1000 << "."
92  << driver_version % 1000 / 10;
93  return stream.str();
94 }
95 
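// Maps a hipError_t onto the closest matching PI error code; anything without a
// direct equivalent falls back to PI_ERROR_UNKNOWN.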
96 pi_result map_error(hipError_t result) {
97  switch (result) {
98  case hipSuccess:
99  return PI_SUCCESS;
100  case hipErrorInvalidContext:
101  return PI_ERROR_INVALID_CONTEXT;
102  case hipErrorInvalidDevice:
103  return PI_ERROR_INVALID_DEVICE;
104  case hipErrorInvalidValue:
105  return PI_ERROR_INVALID_VALUE;
106  case hipErrorOutOfMemory:
107  return PI_ERROR_OUT_OF_HOST_MEMORY;
108  case hipErrorLaunchOutOfResources:
109  return PI_ERROR_OUT_OF_RESOURCES;
110  default:
111  return PI_ERROR_UNKNOWN;
112  }
113 }
114 
115 // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR
116 constexpr size_t MaxMessageSize = 256;
117 thread_local pi_result ErrorMessageCode = PI_SUCCESS;
118 thread_local char ErrorMessage[MaxMessageSize];
119 
120 // Utility function for setting a message and warning
121 [[maybe_unused]] static void setErrorMessage(const char *message,
122  pi_result error_code) {
123  assert(strlen(message) <= MaxMessageSize);
124  strcpy(ErrorMessage, message);
125  ErrorMessageCode = error_code;
126 }
127 
128 // Returns plugin specific error and warning messages
129 pi_result hip_piPluginGetLastError(char **message) {
130  *message = &ErrorMessage[0];
131  return ErrorMessageCode;
132 }
133 
134 // Iterates over the event wait list, returns correct pi_result error codes.
135 // Invokes the callback for the latest event of each stream in the wait list.
136 // The callback must take a single pi_event argument and return a pi_result.
137 template <typename Func>
138 pi_result forLatestEvents(const pi_event *event_wait_list,
139  std::size_t num_events_in_wait_list, Func &&f) {
140 
141  if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
142  return PI_ERROR_INVALID_EVENT_WAIT_LIST;
143  }
144 
145  // Fast path if we only have a single event
146  if (num_events_in_wait_list == 1) {
147  return f(event_wait_list[0]);
148  }
149 
150  std::vector<pi_event> events{event_wait_list,
151  event_wait_list + num_events_in_wait_list};
152  std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
153  // Tiered sort creating sublists of streams (smallest value first) in which
154  // the corresponding events are sorted into a sequence of newest first.
155  return e0->get_stream() < e1->get_stream() ||
156  (e0->get_stream() == e1->get_stream() &&
157  e0->get_event_id() > e1->get_event_id());
158  });
159 
160  bool first = true;
161  hipStream_t lastSeenStream = 0;
162  for (pi_event event : events) {
163  if (!event || (!first && event->get_stream() == lastSeenStream)) {
164  continue;
165  }
166 
167  first = false;
168  lastSeenStream = event->get_stream();
169 
170  auto result = f(event);
171  if (result != PI_SUCCESS) {
172  return result;
173  }
174  }
175 
176  return PI_SUCCESS;
177 }
178 
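// Converts a HIP error into a PI error. On failure it prints a diagnostic to
// stderr (unless SYCL_PI_SUPPRESS_ERROR_MESSAGE is set in the environment),
// optionally aborts when PI_HIP_ABORT is set, and throws the mapped pi_result.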
186 pi_result check_error(hipError_t result, const char *function, int line,
187  const char *file) {
188  if (result == hipSuccess) {
189  return PI_SUCCESS;
190  }
191 
192  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) {
193  const char *errorString = nullptr;
194  const char *errorName = nullptr;
195  errorName = hipGetErrorName(result);
196  errorString = hipGetErrorString(result);
197  std::stringstream ss;
198  ss << "\nPI HIP ERROR:"
199  << "\n\tValue: " << result
200  << "\n\tName: " << errorName
201  << "\n\tDescription: " << errorString
202  << "\n\tFunction: " << function << "\n\tSource Location: " << file
203  << ":" << line << "\n"
204  << std::endl;
205  std::cerr << ss.str();
206  }
207 
208  if (std::getenv("PI_HIP_ABORT") != nullptr) {
209  std::abort();
210  }
211 
212  throw map_error(result);
213 }
214 
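// Convenience macro forwarding the caller's function, line and file to
// check_error so that diagnostics point at the call site.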
216 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
217 
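// RAII guard that makes the HIP context of the given PI context current on the
// calling thread; the previously active context, if any, is restored when the
// guard is destroyed.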
224 class ScopedContext {
225  pi_context placedContext_;
226  hipCtx_t original_;
227  bool needToRecover_;
228 
229 public:
230  ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} {
231 
232  if (!placedContext_) {
233  throw PI_ERROR_INVALID_CONTEXT;
234  }
235 
236  hipCtx_t desired = placedContext_->get();
237  PI_CHECK_ERROR(hipCtxGetCurrent(&original_));
238  if (original_ != desired) {
239  // Sets the desired context as the active one for the thread
240  PI_CHECK_ERROR(hipCtxSetCurrent(desired));
241  if (original_ == nullptr) {
242  // No context is installed on the current thread.
243  // This is the most common case. We can activate the context in the
244  // thread and leave it there until all the PI contexts referring to the
245  // same underlying HIP context are destroyed. This emulates
246  // the behaviour of the HIP runtime API, and avoids costly context
247  // switches. No action is required on this side of the if.
248  } else {
249  needToRecover_ = true;
250  }
251  }
252  }
253 
254  ~ScopedContext() {
255  if (needToRecover_) {
256  PI_CHECK_ERROR(hipCtxSetCurrent(original_));
257  }
258  }
259 };
260 
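// Helpers implementing the usual two-call query pattern used by the *GetInfo
// entry points: copy the value into param_value (after a size check) when it is
// non-null, and report the required size via param_value_size_ret when requested.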
262 template <typename T, typename Assign>
263 pi_result getInfoImpl(size_t param_value_size, void *param_value,
264  size_t *param_value_size_ret, T value, size_t value_size,
265  Assign &&assign_func) {
266 
267  if (param_value != nullptr) {
268 
269  if (param_value_size < value_size) {
270  return PI_ERROR_INVALID_VALUE;
271  }
272 
273  assign_func(param_value, value, value_size);
274  }
275 
276  if (param_value_size_ret != nullptr) {
277  *param_value_size_ret = value_size;
278  }
279 
280  return PI_SUCCESS;
281 }
282 
283 template <typename T>
284 pi_result getInfo(size_t param_value_size, void *param_value,
285  size_t *param_value_size_ret, T value) {
286 
287  auto assignment = [](void *param_value, T value, size_t value_size) {
288  (void)value_size;
289  *static_cast<T *>(param_value) = value;
290  };
291 
292  return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
293  sizeof(T), std::move(assignment));
294 }
295 
296 template <typename T>
297 pi_result getInfoArray(size_t array_length, size_t param_value_size,
298  void *param_value, size_t *param_value_size_ret,
299  T *value) {
300 
301  auto assignment = [](void *param_value, T *value, size_t value_size) {
302  memcpy(param_value, static_cast<const void *>(value), value_size);
303  };
304 
305  return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
306  array_length * sizeof(T), std::move(assignment));
307 }
308 
309 template <>
310 pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
311  size_t *param_value_size_ret,
312  const char *value) {
313  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
314  param_value_size_ret, value);
315 }
316 
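// Reads a single integer device attribute, asserting that the query succeeds.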
317 int getAttribute(pi_device device, hipDeviceAttribute_t attribute) {
318  int value;
319  sycl::detail::pi::assertion(
320  hipDeviceGetAttribute(&value, attribute, device->get()) == hipSuccess);
321  return value;
322 }
324 
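// Guesses a 1D local work size: start from the smaller of the device limit and
// the global size, then shrink it until it evenly divides the global size so the
// resulting work-groups are uniform.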
325 void simpleGuessLocalWorkSize(size_t *threadsPerBlock,
326  const size_t *global_work_size,
327  const size_t maxThreadsPerBlock[3],
328  pi_kernel kernel) {
329  assert(threadsPerBlock != nullptr);
330  assert(global_work_size != nullptr);
331  assert(kernel != nullptr);
332  // int recommendedBlockSize, minGrid;
333 
334  // PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
335  // &minGrid, &recommendedBlockSize, kernel->get(),
336  // 0, 0));
337 
338  //(void)minGrid; // Not used, avoid warnings
339 
340  threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]);
341 
342  // Find a local work group size that is a divisor of the global
343  // work group size to produce uniform work groups.
344  while (0u != (global_work_size[0] % threadsPerBlock[0])) {
345  --threadsPerBlock[0];
346  }
347 }
348 
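// Makes the given stream wait on the latest event of every other stream in the
// wait list; events already recorded on the same stream are ordered implicitly
// and are skipped.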
349 pi_result enqueueEventsWait(pi_queue command_queue, hipStream_t stream,
350  pi_uint32 num_events_in_wait_list,
351  const pi_event *event_wait_list) {
352  if (!event_wait_list) {
353  return PI_SUCCESS;
354  }
355  try {
356  ScopedContext active(command_queue->get_context());
357 
358  auto result = forLatestEvents(
359  event_wait_list, num_events_in_wait_list,
360  [stream](pi_event event) -> pi_result {
361  if (event->get_stream() == stream) {
362  return PI_SUCCESS;
363  } else {
364  return PI_CHECK_ERROR(hipStreamWaitEvent(stream, event->get(), 0));
365  }
366  });
367 
368  if (result != PI_SUCCESS) {
369  return result;
370  }
371  return PI_SUCCESS;
372  } catch (pi_result err) {
373  return err;
374  } catch (...) {
375  return PI_ERROR_UNKNOWN;
376  }
377 }
378 
379 } // anonymous namespace
380 
382 namespace sycl {
383 __SYCL_INLINE_VER_NAMESPACE(_V1) {
384 namespace detail {
385 namespace pi {
386 
387 // Report error and no return (keeps compiler from printing warnings).
388 // TODO: Probably change that to throw a catchable exception,
389 // but for now it is useful to see every failure.
390 //
391 [[noreturn]] void die(const char *Message) {
392  std::cerr << "pi_die: " << Message << std::endl;
393  std::terminate();
394 }
395 
396 // Reports error messages
397 void hipPrint(const char *Message) {
398  std::cerr << "pi_print: " << Message << std::endl;
399 }
400 
401 void assertion(bool Condition, const char *Message) {
402  if (!Condition)
403  die(Message);
404 }
405 
406 } // namespace pi
407 } // namespace detail
408 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
409 } // namespace sycl
410 
411 //--------------
412 // PI object implementation
413 
414 extern "C" {
415 
416 // Required in a number of functions, so forward declare here
417 pi_result hip_piEnqueueEventsWait(pi_queue command_queue,
418  pi_uint32 num_events_in_wait_list,
419  const pi_event *event_wait_list,
420  pi_event *event);
421 pi_result hip_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
422  pi_uint32 num_events_in_wait_list,
423  const pi_event *event_wait_list,
424  pi_event *event);
427 
428 } // extern "C"
429 
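// Lazily applies the queue's pending barrier event to a compute or transfer
// stream the first time that stream is handed out after the barrier was recorded.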
431 
432 void _pi_queue::compute_stream_wait_for_barrier_if_needed(hipStream_t stream,
433  pi_uint32 stream_i) {
434  if (barrier_event_ && !compute_applied_barrier_[stream_i]) {
435  PI_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0));
436  compute_applied_barrier_[stream_i] = true;
437  }
438 }
439 
440 void _pi_queue::transfer_stream_wait_for_barrier_if_needed(hipStream_t stream,
441  pi_uint32 stream_i) {
442  if (barrier_event_ && !transfer_applied_barrier_[stream_i]) {
443  PI_CHECK_ERROR(hipStreamWaitEvent(stream, barrier_event_, 0));
444  transfer_applied_barrier_[stream_i] = true;
445  }
446 }
447 
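// Round-robin selection of the next compute stream, creating streams lazily up to
// the size of the pool; streams flagged in delay_compute_ are skipped once so that
// recently reused streams get more time to drain.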
448 hipStream_t _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) {
449  pi_uint32 stream_i;
450  pi_uint32 token;
451  while (true) {
452  if (num_compute_streams_ < compute_streams_.size()) {
453  // the check above is for performance - so as not to lock mutex every time
454  std::lock_guard<std::mutex> guard(compute_stream_mutex_);
455  // The second check is done after mutex is locked so other threads can not
456  // change num_compute_streams_ after that
457  if (num_compute_streams_ < compute_streams_.size()) {
458  PI_CHECK_ERROR(hipStreamCreateWithFlags(
459  &compute_streams_[num_compute_streams_++], hipStreamNonBlocking));
460  }
461  }
462  token = compute_stream_idx_++;
463  stream_i = token % compute_streams_.size();
464  // if a stream has been reused before it was next selected round-robin
465  // fashion, we want to delay its next use and instead select another one
466  // that is more likely to have completed all the enqueued work.
467  if (delay_compute_[stream_i]) {
468  delay_compute_[stream_i] = false;
469  } else {
470  break;
471  }
472  }
473  if (stream_token) {
474  *stream_token = token;
475  }
476  hipStream_t res = compute_streams_[stream_i];
477  compute_stream_wait_for_barrier_if_needed(res, stream_i);
478  return res;
479 }
480 
481 hipStream_t _pi_queue::get_next_compute_stream(
482  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
483  _pi_stream_guard &guard, pi_uint32 *stream_token) {
484  for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
485  pi_uint32 token = event_wait_list[i]->get_compute_stream_token();
486  if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) {
487  std::unique_lock<std::mutex> compute_sync_guard(
488  compute_stream_sync_mutex_);
489  // redo the check after lock to avoid data races on
490  // last_sync_compute_streams_
491  if (can_reuse_stream(token)) {
492  pi_uint32 stream_i = token % delay_compute_.size();
493  delay_compute_[stream_i] = true;
494  if (stream_token) {
495  *stream_token = token;
496  }
497  guard = _pi_stream_guard{std::move(compute_sync_guard)};
498  hipStream_t res = event_wait_list[i]->get_stream();
499  compute_stream_wait_for_barrier_if_needed(res, stream_i);
500  return res;
501  }
502  }
503  }
504  guard = {};
505  return get_next_compute_stream(stream_token);
506 }
507 
508 hipStream_t _pi_queue::get_next_transfer_stream() {
509  if (transfer_streams_.empty()) { // for example in in-order queue
510  return get_next_compute_stream();
511  }
512  if (num_transfer_streams_ < transfer_streams_.size()) {
513  // the check above is for performance - so as not to lock mutex every time
514  std::lock_guard<std::mutex> guard(transfer_stream_mutex_);
515  // The second check is done after mutex is locked so other threads can not
516  // change num_transfer_streams_ after that
517  if (num_transfer_streams_ < transfer_streams_.size()) {
518  PI_CHECK_ERROR(hipStreamCreateWithFlags(
519  &transfer_streams_[num_transfer_streams_++], hipStreamNonBlocking));
520  }
521  }
522  pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size();
523  hipStream_t res = transfer_streams_[stream_i];
524  transfer_stream_wait_for_barrier_if_needed(res, stream_i);
525  return res;
526 }
527 
528 _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue,
529  hipStream_t stream, pi_uint32 stream_token)
530  : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false},
531  isRecorded_{false}, isStarted_{false},
532  streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr},
533  evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} {
534 
535  assert(type != PI_COMMAND_TYPE_USER);
536 
537  bool profilingEnabled = queue_->properties_ & PI_QUEUE_PROFILING_ENABLE;
538 
539  PI_CHECK_ERROR(hipEventCreateWithFlags(
540  &evEnd_, profilingEnabled ? hipEventDefault : hipEventDisableTiming));
541 
542  if (profilingEnabled) {
543  PI_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault));
544  PI_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault));
545  }
546 
547  if (queue_ != nullptr) {
548  hip_piQueueRetain(queue_);
549  }
550  hip_piContextRetain(context_);
551 }
552 
553 _pi_event::~_pi_event() {
554  if (queue_ != nullptr) {
555  hip_piQueueRelease(queue_);
556  }
557  hip_piContextRelease(context_);
558 }
559 
560 pi_result _pi_event::start() {
561  assert(!is_started());
562  pi_result result = PI_SUCCESS;
563 
564  try {
565  if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
566  // NOTE: This relies on the default stream to be unused.
567  PI_CHECK_ERROR(hipEventRecord(evQueued_, 0));
568  PI_CHECK_ERROR(hipEventRecord(evStart_, queue_->get()));
569  }
570  } catch (pi_result error) {
571  result = error;
572  }
573 
574  isStarted_ = true;
575  return result;
576 }
577 
578 bool _pi_event::is_completed() const noexcept {
579  if (!isRecorded_) {
580  return false;
581  }
582  if (!hasBeenWaitedOn_) {
583  const hipError_t ret = hipEventQuery(evEnd_);
584  if (ret != hipSuccess && ret != hipErrorNotReady) {
585  PI_CHECK_ERROR(ret);
586  return false;
587  }
588  if (ret == hipErrorNotReady) {
589  return false;
590  }
591  }
592  return true;
593 }
594 
596  float miliSeconds = 0.0f;
597  assert(is_started());
598 
599  PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_));
600  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
601 }
602 
603 pi_uint64 _pi_event::get_start_time() const {
604  float miliSeconds = 0.0f;
605  assert(is_started());
606 
607  PI_CHECK_ERROR(
608  hipEventElapsedTime(&miliSeconds, context_->evBase_, evStart_));
609  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
610 }
611 
612 pi_uint64 _pi_event::get_end_time() const {
613  float miliSeconds = 0.0f;
614  assert(is_started() && is_recorded());
615 
616  PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, context_->evBase_, evEnd_));
617  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
618 }
619 
620 pi_result _pi_event::record() {
621 
622  if (is_recorded() || !is_started()) {
623  return PI_ERROR_INVALID_EVENT;
624  }
625 
626  pi_result result = PI_ERROR_INVALID_OPERATION;
627 
628  if (!queue_) {
629  return PI_ERROR_INVALID_QUEUE;
630  }
631 
632  try {
633  eventId_ = queue_->get_next_event_id();
634  if (eventId_ == 0) {
635  sycl::detail::pi::die(
636  "Unrecoverable program state reached in event identifier overflow");
637  }
638  result = PI_CHECK_ERROR(hipEventRecord(evEnd_, stream_));
639  } catch (pi_result error) {
640  result = error;
641  }
642 
643  if (result == PI_SUCCESS) {
644  isRecorded_ = true;
645  }
646 
647  return result;
648 }
649 
650 pi_result _pi_event::wait() {
651  pi_result retErr;
652  try {
653  retErr = PI_CHECK_ERROR(hipEventSynchronize(evEnd_));
654  hasBeenWaitedOn_ = true;
655  } catch (pi_result error) {
656  retErr = error;
657  }
658 
659  return retErr;
660 }
661 
662 pi_result _pi_event::release() {
663  assert(queue_ != nullptr);
664  PI_CHECK_ERROR(hipEventDestroy(evEnd_));
665 
666  if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
667  PI_CHECK_ERROR(hipEventDestroy(evQueued_));
668  PI_CHECK_ERROR(hipEventDestroy(evStart_));
669  }
670 
671  return PI_SUCCESS;
672 }
673 
674 // makes all future work submitted to queue wait for all work captured in event.
675 pi_result enqueueEventWait(pi_queue queue, pi_event event) {
676  // for native events, the hipStreamWaitEvent call is used.
677  // This makes all future work submitted to stream wait for all
678  // work captured in event.
679  queue->for_each_stream([e = event->get()](hipStream_t s) {
680  PI_CHECK_ERROR(hipStreamWaitEvent(s, e, 0));
681  });
682  return PI_SUCCESS;
683 }
684 
685 _pi_program::_pi_program(pi_context ctxt)
686  : module_{nullptr}, binary_{},
687  binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} {
688  hip_piContextRetain(context_);
689 }
690 
691 _pi_program::~_pi_program() { hip_piContextRelease(context_); }
692 
693 pi_result _pi_program::set_binary(const char *source, size_t length) {
694  assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
695  "Re-setting program binary data which has already been set");
696  binary_ = source;
697  binarySizeInBytes_ = length;
698  return PI_SUCCESS;
699 }
700 
701 pi_result _pi_program::build_program(const char *build_options) {
702 
703  this->buildOptions_ = build_options;
704 
705  constexpr const unsigned int numberOfOptions = 4u;
706 
707  hipJitOption options[numberOfOptions];
708  void *optionVals[numberOfOptions];
709 
710  // Pass a buffer for info messages
711  options[0] = hipJitOptionInfoLogBuffer;
712  optionVals[0] = (void *)infoLog_;
713  // Pass the size of the info buffer
714  options[1] = hipJitOptionInfoLogBufferSizeBytes;
715  optionVals[1] = (void *)(long)MAX_LOG_SIZE;
716  // Pass a buffer for error message
717  options[2] = hipJitOptionErrorLogBuffer;
718  optionVals[2] = (void *)errorLog_;
719  // Pass the size of the error buffer
720  options[3] = hipJitOptionErrorLogBufferSizeBytes;
721  optionVals[3] = (void *)(long)MAX_LOG_SIZE;
722 
723  auto result = PI_CHECK_ERROR(
724  hipModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
725  numberOfOptions, options, optionVals));
726 
727  const auto success = (result == PI_SUCCESS);
728 
729  buildStatus_ =
730  success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR;
731 
732  // If no exception, result is correct
733  return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
734 }
735 
741 std::string getKernelNames(pi_program program) {
742  (void)program;
743  sycl::detail::pi::die("getKernelNames not implemented");
744  return {};
745 }
746 
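// RAII wrapper that releases the captured PI object when it goes out of scope
// unless dismiss() is called first; used to avoid leaks on early error returns.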
751 template <typename T> class ReleaseGuard {
752 private:
753  T Captive;
754 
755  static pi_result callRelease(pi_device Captive) {
756  return hip_piDeviceRelease(Captive);
757  }
758 
759  static pi_result callRelease(pi_context Captive) {
760  return hip_piContextRelease(Captive);
761  }
762 
763  static pi_result callRelease(pi_mem Captive) {
764  return hip_piMemRelease(Captive);
765  }
766 
767  static pi_result callRelease(pi_program Captive) {
768  return hip_piProgramRelease(Captive);
769  }
770 
771  static pi_result callRelease(pi_kernel Captive) {
772  return hip_piKernelRelease(Captive);
773  }
774 
775  static pi_result callRelease(pi_queue Captive) {
776  return hip_piQueueRelease(Captive);
777  }
778 
779  static pi_result callRelease(pi_event Captive) {
780  return hip_piEventRelease(Captive);
781  }
782 
783 public:
784  ReleaseGuard() = delete;
786  explicit ReleaseGuard(T Obj) : Captive(Obj) {}
787  ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) {
788  Other.Captive = nullptr;
789  }
790 
791  ReleaseGuard(const ReleaseGuard &) = delete;
792 
795  ~ReleaseGuard() {
796  if (Captive != nullptr) {
797  pi_result ret = callRelease(Captive);
798  if (ret != PI_SUCCESS) {
799  // A reported HIP error is either an implementation or an asynchronous
800  // HIP error for which it is unclear if the function that reported it
801  // succeeded or not. Either way, the state of the program is compromised
802  // and likely unrecoverable.
803  sycl::detail::pi::die(
804  "Unrecoverable program state reached in hip_piMemRelease");
805  }
806  }
807  }
808 
809  ReleaseGuard &operator=(const ReleaseGuard &) = delete;
810 
811  ReleaseGuard &operator=(ReleaseGuard &&Other) {
812  Captive = Other.Captive;
813  Other.Captive = nullptr;
814  return *this;
815  }
816 
819  void dismiss() { Captive = nullptr; }
820 };
821 
822 //-- PI API implementation
823 extern "C" {
824 
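// Enumerates the HIP devices once and exposes each device as its own PI platform;
// later calls reuse the cached list.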
833 pi_result hip_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
834  pi_uint32 *num_platforms) {
835 
836  try {
837  static std::once_flag initFlag;
838  static pi_uint32 numPlatforms = 1;
839  static std::vector<_pi_platform> platformIds;
840 
841  if (num_entries == 0 and platforms != nullptr) {
842  return PI_ERROR_INVALID_VALUE;
843  }
844  if (platforms == nullptr and num_platforms == nullptr) {
845  return PI_ERROR_INVALID_VALUE;
846  }
847 
848  pi_result err = PI_SUCCESS;
849 
850  std::call_once(
851  initFlag,
852  [](pi_result &err) {
853  if (hipInit(0) != hipSuccess) {
854  numPlatforms = 0;
855  return;
856  }
857  int numDevices = 0;
858  hipError_t hipErrorCode = hipGetDeviceCount(&numDevices);
859  if (hipErrorCode == hipErrorNoDevice) {
860  numPlatforms = 0;
861  return;
862  }
863  err = PI_CHECK_ERROR(hipErrorCode);
864  if (numDevices == 0) {
865  numPlatforms = 0;
866  return;
867  }
868  try {
869  numPlatforms = numDevices;
870  platformIds.resize(numDevices);
871 
872  for (int i = 0; i < numDevices; ++i) {
873  hipDevice_t device;
874  err = PI_CHECK_ERROR(hipDeviceGet(&device, i));
875  platformIds[i].devices_.emplace_back(
876  new _pi_device{device, &platformIds[i]});
877  }
878  } catch (const std::bad_alloc &) {
879  // Signal out-of-memory situation
880  for (int i = 0; i < numDevices; ++i) {
881  platformIds[i].devices_.clear();
882  }
883  platformIds.clear();
884  err = PI_ERROR_OUT_OF_HOST_MEMORY;
885  } catch (...) {
886  // Clear and rethrow to allow retry
887  for (int i = 0; i < numDevices; ++i) {
888  platformIds[i].devices_.clear();
889  }
890  platformIds.clear();
891  throw;
892  }
893  },
894  err);
895 
896  if (num_platforms != nullptr) {
897  *num_platforms = numPlatforms;
898  }
899 
900  if (platforms != nullptr) {
901  for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
902  platforms[i] = &platformIds[i];
903  }
904  }
905 
906  return err;
907  } catch (pi_result err) {
908  return err;
909  } catch (...) {
910  return PI_ERROR_OUT_OF_RESOURCES;
911  }
912 }
913 
914 pi_result hip_piPlatformGetInfo(pi_platform platform,
915  pi_platform_info param_name,
916  size_t param_value_size, void *param_value,
917  size_t *param_value_size_ret) {
918  assert(platform != nullptr);
919 
920  switch (param_name) {
921  case PI_PLATFORM_INFO_NAME:
922  return getInfo(param_value_size, param_value, param_value_size_ret,
923  "AMD HIP BACKEND");
924  case PI_PLATFORM_INFO_VENDOR:
925  return getInfo(param_value_size, param_value, param_value_size_ret,
926  "AMD Corporation");
927  case PI_PLATFORM_INFO_PROFILE:
928  return getInfo(param_value_size, param_value, param_value_size_ret,
929  "FULL PROFILE");
930  case PI_PLATFORM_INFO_VERSION: {
931  auto version = getHipVersionString();
932  return getInfo(param_value_size, param_value, param_value_size_ret,
933  version.c_str());
934  }
935  case PI_PLATFORM_INFO_EXTENSIONS: {
936  return getInfo(param_value_size, param_value, param_value_size_ret, "");
937  }
938  default:
939  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
940  }
941  sycl::detail::pi::die("Platform info request not implemented");
942  return {};
943 }
944 
950 pi_result hip_piDevicesGet(pi_platform platform, pi_device_type device_type,
951  pi_uint32 num_entries, pi_device *devices,
952  pi_uint32 *num_devices) {
953 
954  pi_result err = PI_SUCCESS;
955  const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT;
956  const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU;
957  const bool returnDevices = askingForDefault || askingForGPU;
958 
959  size_t numDevices = returnDevices ? platform->devices_.size() : 0;
960 
961  try {
962  if (num_devices) {
963  *num_devices = numDevices;
964  }
965 
966  if (returnDevices && devices) {
967  for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) {
968  devices[i] = platform->devices_[i].get();
969  }
970  }
971 
972  return err;
973  } catch (pi_result err) {
974  return err;
975  } catch (...) {
976  return PI_ERROR_OUT_OF_RESOURCES;
977  }
978 }
979 
982 pi_result hip_piDeviceRetain(pi_device device) {
983  (void)device;
984  return PI_SUCCESS;
985 }
986 
987 pi_result hip_piContextGetInfo(pi_context context, pi_context_info param_name,
988  size_t param_value_size, void *param_value,
989  size_t *param_value_size_ret) {
990 
991  switch (param_name) {
992  case PI_CONTEXT_INFO_NUM_DEVICES:
993  return getInfo(param_value_size, param_value, param_value_size_ret, 1);
994  case PI_CONTEXT_INFO_DEVICES:
995  return getInfo(param_value_size, param_value, param_value_size_ret,
996  context->get_device());
997  case PI_CONTEXT_INFO_REFERENCE_COUNT:
998  return getInfo(param_value_size, param_value, param_value_size_ret,
999  context->get_reference_count());
1001  default:
1002  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
1003  }
1004 
1005  return PI_ERROR_OUT_OF_RESOURCES;
1006 }
1007 
1008 pi_result hip_piContextRetain(pi_context context) {
1009  assert(context != nullptr);
1010  assert(context->get_reference_count() > 0);
1011 
1012  context->increment_reference_count();
1013  return PI_SUCCESS;
1014 }
1015 
1016 pi_result hip_piextContextSetExtendedDeleter(
1017  pi_context context, pi_context_extended_deleter function, void *user_data) {
1018  context->set_extended_deleter(function, user_data);
1019  return PI_SUCCESS;
1020 }
1021 
1024 pi_result hip_piDevicePartition(pi_device device,
1025  const pi_device_partition_property *properties,
1026  pi_uint32 num_devices, pi_device *out_devices,
1027  pi_uint32 *out_num_devices) {
1028  (void)device;
1029  (void)properties;
1030  (void)num_devices;
1031  (void)out_devices;
1032  (void)out_num_devices;
1033 
1034  return PI_ERROR_INVALID_OPERATION;
1035 }
1036 
1039 pi_result hip_piextDeviceSelectBinary(pi_device device,
1040  pi_device_binary *binaries,
1041  pi_uint32 num_binaries,
1042  pi_uint32 *selected_binary) {
1043  (void)device;
1044  if (!binaries) {
1045  sycl::detail::pi::die("No list of device images provided");
1046  }
1047  if (num_binaries < 1) {
1048  sycl::detail::pi::die("No binary images in the list");
1049  }
1050 
1051  // Look for an image for the HIP target, and return the first one that is
1052  // found
1053 #if defined(__HIP_PLATFORM_AMD__)
1054  const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN;
1055 #elif defined(__HIP_PLATFORM_NVIDIA__)
1056  const char *binary_type = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
1057 #else
1058 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
1059 #endif
1060 
1061  for (pi_uint32 i = 0; i < num_binaries; i++) {
1062  if (strcmp(binaries[i]->DeviceTargetSpec, binary_type) == 0) {
1063  *selected_binary = i;
1064  return PI_SUCCESS;
1065  }
1066  }
1067 
1068  // No image can be loaded for the given device
1069  return PI_ERROR_INVALID_BINARY;
1070 }
1071 
1072 pi_result hip_piextGetDeviceFunctionPointer(pi_device device,
1073  pi_program program,
1074  const char *func_name,
1075  pi_uint64 *func_pointer_ret) {
1076  // Check if device passed is the same the device bound to the context
1077  assert(device == program->get_context()->get_device());
1078  assert(func_pointer_ret != nullptr);
1079 
1080  hipFunction_t func;
1081  hipError_t ret = hipModuleGetFunction(&func, program->get(), func_name);
1082  *func_pointer_ret = reinterpret_cast<pi_uint64>(func);
1083  pi_result retError = PI_SUCCESS;
1084 
1085  if (ret != hipSuccess && ret != hipErrorNotFound)
1086  retError = PI_CHECK_ERROR(ret);
1087  if (ret == hipErrorNotFound) {
1088  *func_pointer_ret = 0;
1089  retError = PI_ERROR_INVALID_KERNEL_NAME;
1090  }
1091 
1092  return retError;
1093 }
1094 
1097 pi_result hip_piDeviceRelease(pi_device device) {
1098  (void)device;
1099  return PI_SUCCESS;
1100 }
1101 
1102 pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
1103  size_t param_value_size, void *param_value,
1104  size_t *param_value_size_ret) {
1105 
1106  static constexpr pi_uint32 max_work_item_dimensions = 3u;
1107 
1108  assert(device != nullptr);
1109 
1110  switch (param_name) {
1111  case PI_DEVICE_INFO_TYPE: {
1112  return getInfo(param_value_size, param_value, param_value_size_ret,
1113  PI_DEVICE_TYPE_GPU);
1114  }
1115  case PI_DEVICE_INFO_VENDOR_ID: {
1116 #if defined(__HIP_PLATFORM_AMD__)
1117  pi_uint32 vendor_id = 4098u;
1118 #elif defined(__HIP_PLATFORM_NVIDIA__)
1119  pi_uint32 vendor_id = 4318u;
1120 #else
1121  pi_uint32 vendor_id = 0u;
1122 #endif
1123 
1124  return getInfo(param_value_size, param_value, param_value_size_ret,
1125  vendor_id);
1126  }
1128  int compute_units = 0;
1130  hipDeviceGetAttribute(&compute_units,
1131  hipDeviceAttributeMultiprocessorCount,
1132  device->get()) == hipSuccess);
1133  sycl::detail::pi::assertion(compute_units >= 0);
1134  return getInfo(param_value_size, param_value, param_value_size_ret,
1135  pi_uint32(compute_units));
1136  }
1138  return getInfo(param_value_size, param_value, param_value_size_ret,
1139  max_work_item_dimensions);
1140  }
1142  size_t return_sizes[max_work_item_dimensions];
1143 
1144  int max_x = 0, max_y = 0, max_z = 0;
1146  hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX,
1147  device->get()) == hipSuccess);
1148  sycl::detail::pi::assertion(max_x >= 0);
1149 
1151  hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY,
1152  device->get()) == hipSuccess);
1153  sycl::detail::pi::assertion(max_y >= 0);
1154 
1156  hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ,
1157  device->get()) == hipSuccess);
1158  sycl::detail::pi::assertion(max_z >= 0);
1159 
1160  return_sizes[0] = size_t(max_x);
1161  return_sizes[1] = size_t(max_y);
1162  return_sizes[2] = size_t(max_z);
1163  return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1164  param_value_size_ret, return_sizes);
1165  }
1166 
1168  size_t return_sizes[max_work_item_dimensions];
1169  int max_x = 0, max_y = 0, max_z = 0;
1171  hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxGridDimX,
1172  device->get()) == hipSuccess);
1173  sycl::detail::pi::assertion(max_x >= 0);
1174 
1176  hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxGridDimY,
1177  device->get()) == hipSuccess);
1178  sycl::detail::pi::assertion(max_y >= 0);
1179 
1181  hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxGridDimZ,
1182  device->get()) == hipSuccess);
1183  sycl::detail::pi::assertion(max_z >= 0);
1184 
1185  return_sizes[0] = size_t(max_x);
1186  return_sizes[1] = size_t(max_y);
1187  return_sizes[2] = size_t(max_z);
1188  return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
1189  param_value_size_ret, return_sizes);
1190  }
1191 
1193  int max_work_group_size = 0;
1195  hipDeviceGetAttribute(&max_work_group_size,
1196  hipDeviceAttributeMaxThreadsPerBlock,
1197  device->get()) == hipSuccess);
1198 
1199  sycl::detail::pi::assertion(max_work_group_size >= 0);
1200 
1201  return getInfo(param_value_size, param_value, param_value_size_ret,
1202  size_t(max_work_group_size));
1203  }
1205  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1206  }
1208  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1209  }
1211  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1212  }
1214  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1215  }
1217  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1218  }
1220  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1221  }
1223  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1224  }
1226  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1227  }
1229  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1230  }
1232  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1233  }
1235  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1236  }
1238  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1239  }
1241  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
1242  }
1244  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1245  }
1247  // Number of sub-groups = max block size / warp size + possible remainder
1248  int max_threads = 0;
1250  hipDeviceGetAttribute(&max_threads,
1251  hipDeviceAttributeMaxThreadsPerBlock,
1252  device->get()) == hipSuccess);
1253  int warpSize = 0;
1255  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
1256  device->get()) == hipSuccess);
1257  int maxWarps = (max_threads + warpSize - 1) / warpSize;
1258  return getInfo(param_value_size, param_value, param_value_size_ret,
1259  static_cast<uint32_t>(maxWarps));
1260  }
1262  // Volta provides independent thread scheduling
1263  // TODO: Revisit for previous generation GPUs
1264  int major = 0;
1266  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor,
1267  device->get()) == hipSuccess);
1268  bool ifp = (major >= 7);
1269  return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
1270  }
1272  int warpSize = 0;
1274  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
1275  device->get()) == hipSuccess);
1276  size_t sizes[1] = {static_cast<size_t>(warpSize)};
1277  return getInfoArray<size_t>(1, param_value_size, param_value,
1278  param_value_size_ret, sizes);
1279  }
1281  int clock_freq = 0;
1283  hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate,
1284  device->get()) == hipSuccess);
1285  sycl::detail::pi::assertion(clock_freq >= 0);
1286  return getInfo(param_value_size, param_value, param_value_size_ret,
1287  pi_uint32(clock_freq) / 1000u);
1288  }
1290  auto bits = pi_uint32{std::numeric_limits<uintptr_t>::digits};
1291  return getInfo(param_value_size, param_value, param_value_size_ret, bits);
1292  }
1294  // Max size of memory object allocation in bytes.
1295  // The minimum value is max(min(1024 × 1024 ×
1296  // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE),
1297  // 32 × 1024 × 1024) for devices that are not of type
1298  // CL_DEVICE_TYPE_CUSTOM.
1299 
1300  size_t global = 0;
1301  sycl::detail::pi::assertion(hipDeviceTotalMem(&global, device->get()) ==
1302  hipSuccess);
1303 
1304  auto quarter_global = static_cast<pi_uint32>(global / 4u);
1305 
1306  auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
1307  32u * 1024u * 1024u);
1308 
1309  return getInfo(param_value_size, param_value, param_value_size_ret,
1310  pi_uint64{max_alloc});
1311  }
1313  return getInfo(param_value_size, param_value, param_value_size_ret,
1314  PI_TRUE);
1315  }
1317  // This call doesn't match to HIP as it doesn't have images, but instead
1318  // surfaces and textures. No clear call in the HIP API to determine this,
1319  // but some searching found as of SM 2.x 128 are supported.
1320  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1321  }
1323  // This call doesn't match to HIP as it doesn't have images, but instead
1324  // surfaces and textures. No clear call in the HIP API to determine this,
1325  // but some searching found as of SM 2.x 128 are supported.
1326  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1327  }
1328 
1330  // Take the smaller of maximum surface and maximum texture height.
1331  int tex_height = 0;
1333  hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight,
1334  device->get()) == hipSuccess);
1335  sycl::detail::pi::assertion(tex_height >= 0);
1336  int surf_height = 0;
1338  hipDeviceGetAttribute(&surf_height,
1339  hipDeviceAttributeMaxTexture2DHeight,
1340  device->get()) == hipSuccess);
1341  sycl::detail::pi::assertion(surf_height >= 0);
1342 
1343  int min = std::min(tex_height, surf_height);
1344 
1345  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1346  }
1348  // Take the smaller of maximum surface and maximum texture width.
1349  int tex_width = 0;
1351  hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth,
1352  device->get()) == hipSuccess);
1353  sycl::detail::pi::assertion(tex_width >= 0);
1354  int surf_width = 0;
1356  hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth,
1357  device->get()) == hipSuccess);
1358  sycl::detail::pi::assertion(surf_width >= 0);
1359 
1360  int min = std::min(tex_width, surf_width);
1361 
1362  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1363  }
1365  // Take the smaller of maximum surface and maximum texture height.
1366  int tex_height = 0;
1368  hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight,
1369  device->get()) == hipSuccess);
1370  sycl::detail::pi::assertion(tex_height >= 0);
1371  int surf_height = 0;
1373  hipDeviceGetAttribute(&surf_height,
1374  hipDeviceAttributeMaxTexture3DHeight,
1375  device->get()) == hipSuccess);
1376  sycl::detail::pi::assertion(surf_height >= 0);
1377 
1378  int min = std::min(tex_height, surf_height);
1379 
1380  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1381  }
1383  // Take the smaller of maximum surface and maximum texture width.
1384  int tex_width = 0;
1386  hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth,
1387  device->get()) == hipSuccess);
1388  sycl::detail::pi::assertion(tex_width >= 0);
1389  int surf_width = 0;
1391  hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth,
1392  device->get()) == hipSuccess);
1393  sycl::detail::pi::assertion(surf_width >= 0);
1394 
1395  int min = std::min(tex_width, surf_width);
1396 
1397  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1398  }
1400  // Take the smaller of maximum surface and maximum texture depth.
1401  int tex_depth = 0;
1403  hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth,
1404  device->get()) == hipSuccess);
1405  sycl::detail::pi::assertion(tex_depth >= 0);
1406  int surf_depth = 0;
1408  hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth,
1409  device->get()) == hipSuccess);
1410  sycl::detail::pi::assertion(surf_depth >= 0);
1411 
1412  int min = std::min(tex_depth, surf_depth);
1413 
1414  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1415  }
1417  // Take the smaller of maximum surface and maximum texture width.
1418  int tex_width = 0;
1420  hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth,
1421  device->get()) == hipSuccess);
1422  sycl::detail::pi::assertion(tex_width >= 0);
1423  int surf_width = 0;
1425  hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth,
1426  device->get()) == hipSuccess);
1427  sycl::detail::pi::assertion(surf_width >= 0);
1428 
1429  int min = std::min(tex_width, surf_width);
1430 
1431  return getInfo(param_value_size, param_value, param_value_size_ret, min);
1432  }
1434  return getInfo(param_value_size, param_value, param_value_size_ret,
1435  size_t(0));
1436  }
1438  // This call is kind of meaningless for HIP, as samplers don't exist.
1439  // Closest thing is textures, which is 128.
1440  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1441  }
1443  // __global__ function parameters are passed to the device via constant
1444  // memory and are limited to 4 KB.
1445  return getInfo(param_value_size, param_value, param_value_size_ret,
1446  size_t{4000u});
1447  }
1449  int mem_base_addr_align = 0;
1451  hipDeviceGetAttribute(&mem_base_addr_align,
1452  hipDeviceAttributeTextureAlignment,
1453  device->get()) == hipSuccess);
1454  // Multiply by 8 as clGetDeviceInfo returns this value in bits
1455  mem_base_addr_align *= 8;
1456  return getInfo(param_value_size, param_value, param_value_size_ret,
1457  mem_base_addr_align);
1458  }
1460  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1461  }
1466  return getInfo(param_value_size, param_value, param_value_size_ret, config);
1467  }
1471  return getInfo(param_value_size, param_value, param_value_size_ret, config);
1472  }
1474  return getInfo(param_value_size, param_value, param_value_size_ret,
1476  }
1478  // The value is documented for all existing GPUs in the HIP programming
1479  // guidelines, section "H.3.2. Global Memory".
1480  return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
1481  }
1483  int cache_size = 0;
1485  hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize,
1486  device->get()) == hipSuccess);
1487  sycl::detail::pi::assertion(cache_size >= 0);
1488  // The L2 cache is global to the GPU.
1489  return getInfo(param_value_size, param_value, param_value_size_ret,
1490  pi_uint64(cache_size));
1491  }
1493  size_t bytes = 0;
1494  // Runtime API has easy access to this value, driver API info is scarce.
1495  sycl::detail::pi::assertion(hipDeviceTotalMem(&bytes, device->get()) ==
1496  hipSuccess);
1497  return getInfo(param_value_size, param_value, param_value_size_ret,
1498  pi_uint64{bytes});
1499  }
1501  unsigned int constant_memory = 0;
1502 
1503  // hipDeviceGetAttribute takes a int*, however the size of the constant
1504  // memory on AMD GPU may be larger than what can fit in the positive part
1505  // of a signed integer, so use an unsigned integer and cast the pointer to
1506  // int*.
1508  hipDeviceGetAttribute(reinterpret_cast<int *>(&constant_memory),
1509  hipDeviceAttributeTotalConstantMemory,
1510  device->get()) == hipSuccess);
1511 
1512  return getInfo(param_value_size, param_value, param_value_size_ret,
1513  pi_uint64(constant_memory));
1514  }
1516  // TODO: is there a way to retrieve this from HIP driver API?
1517  // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX
1518  // 1060 3GB
1519  return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
1520  }
1522  return getInfo(param_value_size, param_value, param_value_size_ret,
1524  }
1526  // OpenCL's "local memory" maps most closely to HIP's "shared memory".
1527  // HIP has its own definition of "local memory", which maps to OpenCL's
1528  // "private memory".
1529  int local_mem_size = 0;
1531  hipDeviceGetAttribute(&local_mem_size,
1532  hipDeviceAttributeMaxSharedMemoryPerBlock,
1533  device->get()) == hipSuccess);
1534  sycl::detail::pi::assertion(local_mem_size >= 0);
1535  return getInfo(param_value_size, param_value, param_value_size_ret,
1536  pi_uint64(local_mem_size));
1537  }
1539  int ecc_enabled = 0;
1541  hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled,
1542  device->get()) == hipSuccess);
1543 
1544  sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1));
1545  auto result = static_cast<pi_bool>(ecc_enabled);
1546  return getInfo(param_value_size, param_value, param_value_size_ret, result);
1547  }
1549  int is_integrated = 0;
1551  hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated,
1552  device->get()) == hipSuccess);
1553 
1554  sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1));
1555  auto result = static_cast<pi_bool>(is_integrated);
1556  return getInfo(param_value_size, param_value, param_value_size_ret, result);
1557  }
1559  // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX
1560  // 1060 3GB
1561  return getInfo(param_value_size, param_value, param_value_size_ret,
1562  size_t{1000u});
1563  }
1565  return getInfo(param_value_size, param_value, param_value_size_ret,
1566  PI_TRUE);
1567  }
1568  case PI_DEVICE_INFO_AVAILABLE: {
1569  return getInfo(param_value_size, param_value, param_value_size_ret,
1570  PI_TRUE);
1571  }
1573  return getInfo(param_value_size, param_value, param_value_size_ret,
1574  PI_TRUE);
1575  }
1577  return getInfo(param_value_size, param_value, param_value_size_ret,
1578  PI_TRUE);
1579  }
1581  return getInfo(param_value_size, param_value, param_value_size_ret,
1582  PI_TRUE);
1583  }
1585  auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL;
1586  return getInfo(param_value_size, param_value, param_value_size_ret,
1587  capability);
1588  }
1590  // The mandated minimum capability:
1591  auto capability =
1593  return getInfo(param_value_size, param_value, param_value_size_ret,
1594  capability);
1595  }
1597  // The mandated minimum capability:
1598  auto capability = PI_QUEUE_PROFILING_ENABLE;
1599  return getInfo(param_value_size, param_value, param_value_size_ret,
1600  capability);
1601  }
1603  // An empty string is returned if no built-in kernels are supported by the
1604  // device.
1605  return getInfo(param_value_size, param_value, param_value_size_ret, "");
1606  }
1607  case PI_DEVICE_INFO_PLATFORM: {
1608  return getInfo(param_value_size, param_value, param_value_size_ret,
1609  device->get_platform());
1610  }
1611  case PI_DEVICE_INFO_NAME: {
1612  static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u;
1613  char name[MAX_DEVICE_NAME_LENGTH];
1614  sycl::detail::pi::assertion(hipDeviceGetName(name, MAX_DEVICE_NAME_LENGTH,
1615  device->get()) == hipSuccess);
1616 
1617  // On AMD GPUs hipDeviceGetName returns an empty string, so return the arch
1618  // name instead, this is also what AMD OpenCL devices return.
1619  if (strlen(name) == 0) {
1620  hipDeviceProp_t props;
1622  hipGetDeviceProperties(&props, device->get()) == hipSuccess);
1623 
1624  return getInfoArray(strlen(props.gcnArchName) + 1, param_value_size,
1625  param_value, param_value_size_ret, props.gcnArchName);
1626  }
1627  return getInfoArray(strlen(name) + 1, param_value_size, param_value,
1628  param_value_size_ret, name);
1629  }
1630  case PI_DEVICE_INFO_VENDOR: {
1631  return getInfo(param_value_size, param_value, param_value_size_ret,
1632  "AMD Corporation");
1633  }
1635  auto version = getHipVersionString();
1636  return getInfo(param_value_size, param_value, param_value_size_ret,
1637  version.c_str());
1638  }
1639  case PI_DEVICE_INFO_PROFILE: {
1640  return getInfo(param_value_size, param_value, param_value_size_ret, "HIP");
1641  }
1643  return getInfo(param_value_size, param_value, param_value_size_ret,
1644  device->get_reference_count());
1645  }
1646  case PI_DEVICE_INFO_VERSION: {
1647  return getInfo(param_value_size, param_value, param_value_size_ret,
1648  "PI 0.0");
1649  }
1651  return getInfo(param_value_size, param_value, param_value_size_ret, "");
1652  }
1654  // TODO: Remove comment when HIP supports native asserts.
1655  // DEVICELIB_ASSERT extension is set so fallback assert
1656  // postprocessing is NOP. HIP 4.3 docs indicate support for
1657  // native asserts is in progress.
1658  std::string SupportedExtensions = "";
1659  SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT;
1660  SupportedExtensions += " ";
1661 
1662  hipDeviceProp_t props;
1663  sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) ==
1664  hipSuccess);
1665  if (props.arch.hasDoubles) {
1666  SupportedExtensions += "cl_khr_fp64 ";
1667  }
1668 
1669  return getInfo(param_value_size, param_value, param_value_size_ret,
1670  SupportedExtensions.c_str());
1671  }
1673  // The minimum value for the FULL profile is 1 MB.
1674  return getInfo(param_value_size, param_value, param_value_size_ret,
1675  size_t{1024u});
1676  }
1678  return getInfo(param_value_size, param_value, param_value_size_ret,
1679  PI_TRUE);
1680  }
1682  return getInfo(param_value_size, param_value, param_value_size_ret,
1683  nullptr);
1684  }
1686  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1687  }
1689  return getInfo(param_value_size, param_value, param_value_size_ret,
1690  static_cast<pi_device_partition_property>(0u));
1691  }
1693  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
1694  }
1696  return getInfo(param_value_size, param_value, param_value_size_ret,
1697  static_cast<pi_device_partition_property>(0u));
1698  }
1699 
1700  // Intel USM extensions
1701 
1703  // from cl_intel_unified_shared_memory: "The host memory access capabilities
1704  // apply to any host allocation."
1705  //
1706  // query if/how the device can access page-locked host memory, possibly
1707  // through PCIe, using the same pointer as the host
1708  pi_bitfield value = {};
1709  // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
1710  // the device shares a unified address space with the host
1711  if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1712  // compute capability 6.x introduces operations that are atomic with
1713  // respect to other CPUs and GPUs in the system
1716  } else {
1717  // on GPU architectures with compute capability lower than 6.x, atomic
1718  // operations from the GPU to CPU memory will not be atomic with respect
1719  // to CPU initiated atomic operations
1721  }
1722  //}
1723  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1724  }
1726  // from cl_intel_unified_shared_memory:
1727  // "The device memory access capabilities apply to any device allocation
1728  // associated with this device."
1729  //
1730  // query how the device can access memory allocated on the device itself (?)
1734  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1735  }
1737  // from cl_intel_unified_shared_memory:
1738  // "The single device shared memory access capabilities apply to any shared
1739  // allocation associated with this device."
1740  //
1741  // query if/how the device can access managed memory associated to it
1742  pi_bitfield value = {};
1743  if (getAttribute(device, hipDeviceAttributeManagedMemory)) {
1744  // the device can allocate managed memory on this system
1746  }
1747  if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) {
1748  // the device can coherently access managed memory concurrently with the
1749  // CPU
1750  value |= PI_USM_CONCURRENT_ACCESS;
1751  if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1752  // compute capability 6.x introduces operations that are atomic with
1753  // respect to other CPUs and GPUs in the system
1755  }
1756  }
1757  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1758  }
1760  // from cl_intel_unified_shared_memory:
1761  // "The cross-device shared memory access capabilities apply to any shared
1762  // allocation associated with this device, or to any shared memory
1763  // allocation on another device that also supports the same cross-device
1764  // shared memory access capability."
1765  //
1766  // query if/how the device can access managed memory associated to other
1767  // devices
1768  pi_bitfield value = {};
1769  if (getAttribute(device, hipDeviceAttributeManagedMemory)) {
1770  // the device can allocate managed memory on this system
1771  value |= PI_USM_ACCESS;
1772  }
1773  if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) {
1774  // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
1775  // attribute can coherently access managed memory concurrently with the
1776  // CPU
1777  value |= PI_USM_CONCURRENT_ACCESS;
1778  }
1779  if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) {
1780  // compute capability 6.x introduces operations that are atomic with
1781  // respect to other CPUs and GPUs in the system
1782  if (value & PI_USM_ACCESS)
1783  value |= PI_USM_ATOMIC_ACCESS;
1784  if (value & PI_USM_CONCURRENT_ACCESS)
1786  }
1787  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1788  }
1790  // from cl_intel_unified_shared_memory:
1791  // "The shared system memory access capabilities apply to any allocations
1792  // made by a system allocator, such as malloc or new."
1793  //
1794  // query if/how the device can access pageable host memory allocated by the
1795  // system allocator
1796  pi_bitfield value = {};
1797  if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) {
1798  // the link between the device and the host does not support native
1799  // atomic operations
1801  }
1802  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1803  }
1804 
1805  case PI_DEVICE_INFO_ATOMIC_64: {
1806  // TODO: Reconsider it when AMD supports SYCL_USE_NATIVE_FP_ATOMICS.
1807  hipDeviceProp_t props;
1808  sycl::detail::pi::assertion(hipGetDeviceProperties(&props, device->get()) ==
1809  hipSuccess);
1810  return getInfo(param_value_size, param_value, param_value_size_ret,
1811  props.arch.hasGlobalInt64Atomics &&
1812  props.arch.hasSharedInt64Atomics);
1813  }
1814 
1816  size_t FreeMemory = 0;
1817  size_t TotalMemory = 0;
1818  sycl::detail::pi::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) ==
1819  hipSuccess,
1820  "failed hipMemGetInfo() API.");
1821  return getInfo(param_value_size, param_value, param_value_size_ret,
1822  FreeMemory);
1823  }
1824 
1826  int value = 0;
1828  hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryClockRate,
1829  device->get()) == hipSuccess);
1830  sycl::detail::pi::assertion(value >= 0);
1831  // Convert kilohertz to megahertz when returning.
1832  return getInfo(param_value_size, param_value, param_value_size_ret,
1833  value / 1000);
1834  }
1835 
1837  int value = 0;
1839  hipDeviceGetAttribute(&value, hipDeviceAttributeMemoryBusWidth,
1840  device->get()) == hipSuccess);
1841  sycl::detail::pi::assertion(value >= 0);
1842  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1843  }
1844 
1845  // TODO: Implement.
1847  // TODO: Investigate if this information is available on HIP.
1859  return PI_ERROR_INVALID_VALUE;
1860 
1861  default:
1862  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
1863  }
1864  sycl::detail::pi::die("Device info request not implemented");
1865  return {};
1866 }
1867 
1874 pi_result hip_piextDeviceGetNativeHandle(pi_device device,
1875  pi_native_handle *nativeHandle) {
1876  *nativeHandle = static_cast<pi_native_handle>(device->get());
1877  return PI_SUCCESS;
1878 }
1879 
1889 pi_result hip_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle,
1890  pi_platform platform,
1891  pi_device *device) {
1892  (void)nativeHandle;
1893  (void)platform;
1894  (void)device;
1895  sycl::detail::pi::die(
1896  "Creation of PI device from native handle not implemented");
1897  return {};
1898 }
1899 
1900 /* Context APIs */
1901 
1921  pi_uint32 num_devices, const pi_device *devices,
1922  void (*pfn_notify)(const char *errinfo,
1923  const void *private_info,
1924  size_t cb, void *user_data),
1925  void *user_data, pi_context *retcontext) {
1926 
1927  assert(devices != nullptr);
1928  // TODO: How to implement context callback?
1929  assert(pfn_notify == nullptr);
1930  assert(user_data == nullptr);
1931  assert(num_devices == 1);
1932  // Need input context
1933  assert(retcontext != nullptr);
1934  pi_result errcode_ret = PI_SUCCESS;
1935 
1936  // Parse properties.
1937  bool property_hip_primary = false;
1938  while (properties && (0 != *properties)) {
1939  // Consume property ID.
1940  pi_context_properties id = *properties;
1941  ++properties;
1942  // Consume property value.
1943  pi_context_properties value = *properties;
1944  ++properties;
1945  switch (id) {
1946  case __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
1947  assert(value == PI_FALSE || value == PI_TRUE);
1948  property_hip_primary = static_cast<bool>(value);
1949  break;
1950  default:
1951  // Unknown property.
1952  sycl::detail::pi::die(
1953  "Unknown piContextCreate property in property list");
1954  return PI_ERROR_INVALID_VALUE;
1955  }
1956  }
1957 
1958  std::unique_ptr<_pi_context> piContextPtr{nullptr};
1959  try {
1960  hipCtx_t current = nullptr;
1961 
1962  if (property_hip_primary) {
1963  // Use the HIP primary context and assume that we want to use it
1964  // immediately as we want to avoid context switches.
1965  hipCtx_t Ctxt;
1966  errcode_ret =
1967  PI_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Ctxt, devices[0]->get()));
1968  piContextPtr = std::unique_ptr<_pi_context>(
1969  new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
1970  errcode_ret = PI_CHECK_ERROR(hipCtxPushCurrent(Ctxt));
1971  } else {
1972  // Create a scoped context.
1973  hipCtx_t newContext;
1974  PI_CHECK_ERROR(hipCtxGetCurrent(&current));
1975  errcode_ret = PI_CHECK_ERROR(
1976  hipCtxCreate(&newContext, hipDeviceMapHost, devices[0]->get()));
1977  piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
1978  _pi_context::kind::user_defined, newContext, *devices});
1979  }
1980 
1981  // Use default stream to record base event counter
1982  PI_CHECK_ERROR(
1983  hipEventCreateWithFlags(&piContextPtr->evBase_, hipEventDefault));
1984  PI_CHECK_ERROR(hipEventRecord(piContextPtr->evBase_, 0));
1985 
1986  // For non-primary scoped contexts keep the last active on top of the stack
1987  // as `hipCtxCreate` replaces it implicitly otherwise.
1988  // Primary contexts are kept on top of the stack, so the previous context
1989  // is not queried and therefore not recovered.
1990  if (current != nullptr) {
1991  PI_CHECK_ERROR(hipCtxSetCurrent(current));
1992  }
1993 
1994  *retcontext = piContextPtr.release();
1995  } catch (pi_result err) {
1996  errcode_ret = err;
1997  } catch (...) {
1998  errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
1999  }
2000  return errcode_ret;
2001 }
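// Editorial sketch: the loop in hip_piContextCreate above consumes a
// zero-terminated list of (property id, property value) pairs. This
// hypothetical helper shows the same traversal pattern in isolation.
[[maybe_unused]] static size_t
countContextProperties(const pi_context_properties *properties) {
  size_t numPairs = 0;
  while (properties && (0 != *properties)) {
    properties += 2; // skip the id and its value
    ++numPairs;
  }
  return numPairs;
}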
2002 
2004 
2005  assert(ctxt != nullptr);
2006 
2007  if (ctxt->decrement_reference_count() > 0) {
2008  return PI_SUCCESS;
2009  }
2010  ctxt->invoke_extended_deleters();
2011 
2012  std::unique_ptr<_pi_context> context{ctxt};
2013 
2014  PI_CHECK_ERROR(hipEventDestroy(context->evBase_));
2015 
2016  if (!ctxt->is_primary()) {
2017  hipCtx_t hipCtxt = ctxt->get();
2018  // hipCtxSynchronize is not supported on the AMD platform, so we can just
2019  // destroy the context; for NVIDIA, make sure it is synchronized first.
2020 #if defined(__HIP_PLATFORM_NVIDIA__)
2021  hipCtx_t current = nullptr;
2022  PI_CHECK_ERROR(hipCtxGetCurrent(&current));
2023  if (hipCtxt != current) {
2024  PI_CHECK_ERROR(hipCtxPushCurrent(hipCtxt));
2025  }
2026  PI_CHECK_ERROR(hipCtxSynchronize());
2027  PI_CHECK_ERROR(hipCtxGetCurrent(&current));
2028  if (hipCtxt == current) {
2029  PI_CHECK_ERROR(hipCtxPopCurrent(&current));
2030  }
2031 #endif
2032  return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt));
2033  } else {
2034  // Primary context is not destroyed, but released
2035  hipDevice_t hipDev = ctxt->get_device()->get();
2036  hipCtx_t current;
2037  PI_CHECK_ERROR(hipCtxPopCurrent(&current));
2038  return PI_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev));
2039  }
2040 
2041  hipCtx_t hipCtxt = ctxt->get();
2042  return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt));
2043 }
2044 
2052  pi_native_handle *nativeHandle) {
2053  *nativeHandle = reinterpret_cast<pi_native_handle>(context->get());
2054  return PI_SUCCESS;
2055 }
2056 
2066  pi_uint32 num_devices,
2067  const pi_device *devices,
2068  bool ownNativeHandle,
2069  pi_context *context) {
2070  (void)nativeHandle;
2071  (void)num_devices;
2072  (void)devices;
2073  (void)ownNativeHandle;
2074  (void)context;
2075  sycl::detail::pi::die(
2076  "Creation of PI context from native handle not implemented");
2077  return {};
2078 }
2079 
2085  size_t size, void *host_ptr, pi_mem *ret_mem,
2086  const pi_mem_properties *properties) {
2087  // Need input memory object
2088  assert(ret_mem != nullptr);
2089  assert((properties == nullptr || *properties == 0) &&
2090  "no mem properties goes to HIP RT yet");
2091  // Currently, USE_HOST_PTR is not implemented using host register
2092  // since this triggers a weird segfault after program ends.
2093  // Setting this constant to true enables testing that behavior.
2094  const bool enableUseHostPtr = false;
2095  const bool performInitialCopy =
2096  (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
2097  ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr);
2098  pi_result retErr = PI_SUCCESS;
2099  pi_mem retMemObj = nullptr;
2100 
2101  try {
2102  ScopedContext active(context);
2103  void *ptr;
2106 
2107  if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
2108  retErr = PI_CHECK_ERROR(
2109  hipHostRegister(host_ptr, size, hipHostRegisterMapped));
2110  retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0));
2112  } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
2113  retErr = PI_CHECK_ERROR(hipHostMalloc(&host_ptr, size));
2114  retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0));
2116  } else {
2117  retErr = PI_CHECK_ERROR(hipMalloc(&ptr, size));
2118  if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
2120  }
2121  }
2122 
2123  if (retErr == PI_SUCCESS) {
2124  pi_mem parentBuffer = nullptr;
2125 
2126  auto devPtr =
2127  reinterpret_cast<_pi_mem::mem_::buffer_mem_::native_type>(ptr);
2128  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
2129  context, parentBuffer, allocMode, devPtr, host_ptr, size});
2130  if (piMemObj != nullptr) {
2131  retMemObj = piMemObj.release();
2132  if (performInitialCopy) {
2133  // Operates on the default stream of the current HIP context.
2134  retErr = PI_CHECK_ERROR(hipMemcpyHtoD(devPtr, host_ptr, size));
2135  // Synchronize with the default stream implicitly used by hipMemcpyHtoD
2136  // to make buffer data available on device before any other PI call
2137  // uses it.
2138  if (retErr == PI_SUCCESS) {
2139  hipStream_t defaultStream = 0;
2140  retErr = PI_CHECK_ERROR(hipStreamSynchronize(defaultStream));
2141  }
2142  }
2143  } else {
2144  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2145  }
2146  }
2147  } catch (pi_result err) {
2148  retErr = err;
2149  } catch (...) {
2150  retErr = PI_ERROR_OUT_OF_RESOURCES;
2151  }
2152 
2153  *ret_mem = retMemObj;
2154 
2155  return retErr;
2156 }
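// Editorial sketch of the flag handling in hip_piMemBufferCreate above: host
// data is copied eagerly for PI_MEM_FLAGS_HOST_PTR_COPY and, because host
// registration is disabled (enableUseHostPtr == false), also for
// PI_MEM_FLAGS_HOST_PTR_USE. The helper name is hypothetical.
[[maybe_unused]] static bool bufferNeedsInitialCopy(pi_mem_flags flags,
                                                    bool useHostRegister) {
  return (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
         ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !useHostRegister);
}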
2157 
2163  assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS");
2164 
2165  pi_result ret = PI_SUCCESS;
2166 
2167  try {
2168 
2169  // Do nothing if there are other references
2170  if (memObj->decrement_reference_count() > 0) {
2171  return PI_SUCCESS;
2172  }
2173 
2174  // make sure memObj is released in case PI_CHECK_ERROR throws
2175  std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2176 
2177  if (memObj->is_sub_buffer()) {
2178  return PI_SUCCESS;
2179  }
2180 
2181  ScopedContext active(uniqueMemObj->get_context());
2182 
2183  if (memObj->mem_type_ == _pi_mem::mem_type::buffer) {
2184  switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2187  ret = PI_CHECK_ERROR(
2188  hipFree((void *)uniqueMemObj->mem_.buffer_mem_.ptr_));
2189  break;
2191  ret = PI_CHECK_ERROR(
2192  hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2193  break;
2195  ret = PI_CHECK_ERROR(
2196  hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2197  };
2198  }
2199 
2200  else if (memObj->mem_type_ == _pi_mem::mem_type::surface) {
2201  ret = PI_CHECK_ERROR(hipDestroySurfaceObject(
2202  uniqueMemObj->mem_.surface_mem_.get_surface()));
2203  auto array = uniqueMemObj->mem_.surface_mem_.get_array();
2204  ret = PI_CHECK_ERROR(hipFreeArray(array));
2205  }
2206 
2207  } catch (pi_result err) {
2208  ret = err;
2209  } catch (...) {
2210  ret = PI_ERROR_OUT_OF_RESOURCES;
2211  }
2212 
2213  if (ret != PI_SUCCESS) {
2214  // A reported HIP error is either an implementation or an asynchronous HIP
2215  // error for which it is unclear if the function that reported it succeeded
2216  // or not. Either way, the state of the program is compromised and likely
2217  // unrecoverable.
2218  sycl::detail::pi::die(
2219  "Unrecoverable program state reached in hip_piMemRelease");
2220  }
2221 
2222  return PI_SUCCESS;
2223 }
2224 
2230  pi_buffer_create_type buffer_create_type,
2231  void *buffer_create_info, pi_mem *memObj) {
2232  assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT");
2233  assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS");
2234  assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT");
2235 
2236  // Default value for flags means PI_MEM_FLAGS_ACCESS_RW.
2237  if (flags == 0) {
2238  flags = PI_MEM_FLAGS_ACCESS_RW;
2239  }
2240 
2241  assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE");
2242  assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) &&
2243  "PI_ERROR_INVALID_VALUE");
2244  assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE");
2245  assert(memObj != nullptr);
2246 
2247  const auto bufferRegion =
2248  *reinterpret_cast<pi_buffer_region>(buffer_create_info);
2249  assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE");
2250 
2251  assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2252  "Overflow");
2253  assert(((bufferRegion.origin + bufferRegion.size) <=
2254  parent_buffer->mem_.buffer_mem_.get_size()) &&
2255  "PI_ERROR_INVALID_BUFFER_SIZE");
2256  // Retained indirectly due to retaining parent buffer below.
2257  pi_context context = parent_buffer->context_;
2260 
2261  assert(parent_buffer->mem_.buffer_mem_.ptr_ !=
2262  _pi_mem::mem_::buffer_mem_::native_type{0});
2263  _pi_mem::mem_::buffer_mem_::native_type ptr =
2264  parent_buffer->mem_.buffer_mem_.get_with_offset(bufferRegion.origin);
2265 
2266  void *hostPtr = nullptr;
2267  if (parent_buffer->mem_.buffer_mem_.hostPtr_) {
2268  hostPtr = static_cast<char *>(parent_buffer->mem_.buffer_mem_.hostPtr_) +
2269  bufferRegion.origin;
2270  }
2271 
2272  ReleaseGuard<pi_mem> releaseGuard(parent_buffer);
2273 
2274  std::unique_ptr<_pi_mem> retMemObj{nullptr};
2275  try {
2276  ScopedContext active(context);
2277 
2278  retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{
2279  context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2280  } catch (pi_result err) {
2281  *memObj = nullptr;
2282  return err;
2283  } catch (...) {
2284  *memObj = nullptr;
2285  return PI_ERROR_OUT_OF_HOST_MEMORY;
2286  }
2287 
2288  releaseGuard.dismiss();
2289  *memObj = retMemObj.release();
2290  return PI_SUCCESS;
2291 }
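// Editorial sketch of the sub-buffer validation performed above: the region
// must be non-empty, the origin plus size must not overflow, and the region
// must lie entirely inside the parent buffer. Hypothetical helper mirroring
// the asserts in hip_piMemBufferPartition.
[[maybe_unused]] static bool isValidSubBufferRegion(size_t origin, size_t size,
                                                    size_t parentSize) {
  return size != 0u && origin <= origin + size && origin + size <= parentSize;
}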
2292 
2294  size_t expectedQuerySize, void *queryOutput,
2295  size_t *writtenQuerySize) {
2296  (void)memObj;
2297  (void)queriedInfo;
2298  (void)expectedQuerySize;
2299  (void)queryOutput;
2300  (void)writtenQuerySize;
2301 
2302  sycl::detail::pi::die("hip_piMemGetInfo not implemented");
2303 }
2304 
2312  pi_native_handle *nativeHandle) {
2313 #if defined(__HIP_PLATFORM_NVIDIA__)
2315  sizeof(pi_native_handle)) {
2316  // Check that all the upper bits that cannot be represented by
2317  // pi_native_handle are empty.
2318  // NOTE: The following shift might trigger a warning, but the check in the
2319  // if above makes sure that this does not underflow.
2321  mem->mem_.buffer_mem_.get() >> (sizeof(pi_native_handle) * CHAR_BIT);
2322  if (upperBits) {
2323  // Return an error if any of the remaining bits is non-zero.
2324  return PI_ERROR_INVALID_MEM_OBJECT;
2325  }
2326  }
2327  *nativeHandle = static_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2328 #elif defined(__HIP_PLATFORM_AMD__)
2329  *nativeHandle =
2330  reinterpret_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2331 #else
2332 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
2333 #endif
2334  return PI_SUCCESS;
2335 }
2336 
2349  pi_context context,
2350  bool ownNativeHandle,
2351  pi_mem *mem) {
2352  (void)nativeHandle;
2353  (void)context;
2354  (void)ownNativeHandle;
2355  (void)mem;
2356 
2357  sycl::detail::pi::die(
2358  "Creation of PI mem from native handle not implemented");
2359  return {};
2360 }
2361 
2369  pi_queue_properties properties, pi_queue *queue) {
2370  try {
2371  std::unique_ptr<_pi_queue> queueImpl{nullptr};
2372 
2373  if (context->get_device() != device) {
2374  *queue = nullptr;
2375  return PI_ERROR_INVALID_DEVICE;
2376  }
2377 
2378  unsigned int flags = 0;
2379 
2380  const bool is_out_of_order =
2382 
2383  std::vector<hipStream_t> computeHipStreams(
2384  is_out_of_order ? _pi_queue::default_num_compute_streams : 1);
2385  std::vector<hipStream_t> transferHipStreams(
2386  is_out_of_order ? _pi_queue::default_num_transfer_streams : 0);
2387 
2388  queueImpl = std::unique_ptr<_pi_queue>(new _pi_queue{
2389  std::move(computeHipStreams), std::move(transferHipStreams), context,
2390  device, properties, flags});
2391 
2392  *queue = queueImpl.release();
2393 
2394  return PI_SUCCESS;
2395  } catch (pi_result err) {
2396 
2397  return err;
2398 
2399  } catch (...) {
2400 
2401  return PI_ERROR_OUT_OF_RESOURCES;
2402  }
2403 }
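// Editorial sketch: an out-of-order PI queue is backed by a pool of compute
// and transfer streams, while an in-order queue uses a single compute stream
// and no dedicated transfer streams, matching the vector sizes chosen in
// hip_piQueueCreate above. Hypothetical helpers for illustration only.
[[maybe_unused]] static size_t computeStreamCountFor(bool isOutOfOrder) {
  return isOutOfOrder ? _pi_queue::default_num_compute_streams : 1;
}
[[maybe_unused]] static size_t transferStreamCountFor(bool isOutOfOrder) {
  return isOutOfOrder ? _pi_queue::default_num_transfer_streams : 0;
}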
2404 
2406  size_t param_value_size, void *param_value,
2407  size_t *param_value_size_ret) {
2408  assert(command_queue != nullptr);
2409 
2410  switch (param_name) {
2411  case PI_QUEUE_INFO_CONTEXT:
2412  return getInfo(param_value_size, param_value, param_value_size_ret,
2413  command_queue->context_);
2414  case PI_QUEUE_INFO_DEVICE:
2415  return getInfo(param_value_size, param_value, param_value_size_ret,
2416  command_queue->device_);
2418  return getInfo(param_value_size, param_value, param_value_size_ret,
2419  command_queue->get_reference_count());
2421  return getInfo(param_value_size, param_value, param_value_size_ret,
2422  command_queue->properties_);
2423  default:
2425  }
2426  sycl::detail::pi::die("Queue info request not implemented");
2427  return {};
2428 }
2429 
2431  assert(command_queue != nullptr);
2432  assert(command_queue->get_reference_count() > 0);
2433 
2434  command_queue->increment_reference_count();
2435  return PI_SUCCESS;
2436 }
2437 
2439  assert(command_queue != nullptr);
2440 
2441  if (command_queue->decrement_reference_count() > 0) {
2442  return PI_SUCCESS;
2443  }
2444 
2445  try {
2446  std::unique_ptr<_pi_queue> queueImpl(command_queue);
2447 
2448  ScopedContext active(command_queue->get_context());
2449 
2450  command_queue->for_each_stream([](hipStream_t s) {
2451  PI_CHECK_ERROR(hipStreamSynchronize(s));
2452  PI_CHECK_ERROR(hipStreamDestroy(s));
2453  });
2454 
2455  return PI_SUCCESS;
2456  } catch (pi_result err) {
2457  return err;
2458  } catch (...) {
2459  return PI_ERROR_OUT_OF_RESOURCES;
2460  }
2461 }
2462 
2464 
2465  // set default result to a negative result (avoid false-positive tests)
2466  pi_result result = PI_ERROR_OUT_OF_HOST_MEMORY;
2467 
2468  try {
2469 
2470  assert(command_queue !=
2471  nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code
2472  ScopedContext active(command_queue->get_context());
2473 
2474  command_queue->sync_streams<true>([&result](hipStream_t s) {
2475  result = PI_CHECK_ERROR(hipStreamSynchronize(s));
2476  });
2477 
2478  } catch (pi_result err) {
2479 
2480  result = err;
2481 
2482  } catch (...) {
2483 
2484  result = PI_ERROR_OUT_OF_RESOURCES;
2485  }
2486 
2487  return result;
2488 }
2489 
2490 // There is no HIP counterpart for queue flushing and we don't run into the
2491 // same problem of having to flush cross-queue dependencies as some of the
2492 // other plugins, so it can be left as no-op.
2494  (void)command_queue;
2495  return PI_SUCCESS;
2496 }
2497 
2505  pi_native_handle *nativeHandle) {
2506  ScopedContext active(queue->get_context());
2507  *nativeHandle =
2508  reinterpret_cast<pi_native_handle>(queue->get_next_compute_stream());
2509  return PI_SUCCESS;
2510 }
2511 
2525  pi_context context,
2526  pi_device device,
2527  bool ownNativeHandle,
2528  pi_queue *queue) {
2529  (void)nativeHandle;
2530  (void)context;
2531  (void)device;
2532  (void)queue;
2533  (void)ownNativeHandle;
2534  sycl::detail::pi::die(
2535  "Creation of PI queue from native handle not implemented");
2536  return {};
2537 }
2538 
2540  pi_bool blocking_write, size_t offset,
2541  size_t size, void *ptr,
2542  pi_uint32 num_events_in_wait_list,
2543  const pi_event *event_wait_list,
2544  pi_event *event) {
2545 
2546  assert(buffer != nullptr);
2547  assert(command_queue != nullptr);
2548  pi_result retErr = PI_SUCCESS;
2549  std::unique_ptr<_pi_event> retImplEv{nullptr};
2550 
2551  try {
2552  ScopedContext active(command_queue->get_context());
2553  hipStream_t hipStream = command_queue->get_next_transfer_stream();
2554  retErr = enqueueEventsWait(command_queue, hipStream,
2555  num_events_in_wait_list, event_wait_list);
2556 
2557  if (event) {
2558  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2559  PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, hipStream));
2560  retImplEv->start();
2561  }
2562 
2563  retErr = PI_CHECK_ERROR(
2564  hipMemcpyHtoDAsync(buffer->mem_.buffer_mem_.get_with_offset(offset),
2565  ptr, size, hipStream));
2566 
2567  if (event) {
2568  retErr = retImplEv->record();
2569  }
2570 
2571  if (blocking_write) {
2572  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
2573  }
2574 
2575  if (event) {
2576  *event = retImplEv.release();
2577  }
2578  } catch (pi_result err) {
2579  retErr = err;
2580  }
2581  return retErr;
2582 }
2583 
2585  pi_bool blocking_read, size_t offset,
2586  size_t size, void *ptr,
2587  pi_uint32 num_events_in_wait_list,
2588  const pi_event *event_wait_list,
2589  pi_event *event) {
2590 
2591  assert(buffer != nullptr);
2592  assert(command_queue != nullptr);
2593  pi_result retErr = PI_SUCCESS;
2594  std::unique_ptr<_pi_event> retImplEv{nullptr};
2595 
2596  try {
2597  ScopedContext active(command_queue->get_context());
2598  hipStream_t hipStream = command_queue->get_next_transfer_stream();
2599  retErr = enqueueEventsWait(command_queue, hipStream,
2600  num_events_in_wait_list, event_wait_list);
2601 
2602  if (event) {
2603  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2604  PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, hipStream));
2605  retImplEv->start();
2606  }
2607 
2608  retErr = PI_CHECK_ERROR(hipMemcpyDtoHAsync(
2609  ptr, buffer->mem_.buffer_mem_.get_with_offset(offset), size,
2610  hipStream));
2611 
2612  if (event) {
2613  retErr = retImplEv->record();
2614  }
2615 
2616  if (blocking_read) {
2617  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
2618  }
2619 
2620  if (event) {
2621  *event = retImplEv.release();
2622  }
2623 
2624  } catch (pi_result err) {
2625  retErr = err;
2626  }
2627  return retErr;
2628 }
2629 
2630 pi_result hip_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
2631 
2632  try {
2633  assert(num_events != 0);
2634  assert(event_list);
2635  if (num_events == 0) {
2636  return PI_ERROR_INVALID_VALUE;
2637  }
2638 
2639  if (!event_list) {
2640  return PI_ERROR_INVALID_EVENT;
2641  }
2642 
2643  auto context = event_list[0]->get_context();
2644  ScopedContext active(context);
2645 
2646  auto waitFunc = [context](pi_event event) -> pi_result {
2647  if (!event) {
2648  return PI_ERROR_INVALID_EVENT;
2649  }
2650 
2651  if (event->get_context() != context) {
2652  return PI_ERROR_INVALID_CONTEXT;
2653  }
2654 
2655  return event->wait();
2656  };
2657  return forLatestEvents(event_list, num_events, waitFunc);
2658  } catch (pi_result err) {
2659  return err;
2660  } catch (...) {
2661  return PI_ERROR_OUT_OF_RESOURCES;
2662  }
2663 }
2664 
2665 pi_result hip_piKernelCreate(pi_program program, const char *kernel_name,
2666  pi_kernel *kernel) {
2667  assert(kernel != nullptr);
2668  assert(program != nullptr);
2669 
2670  pi_result retErr = PI_SUCCESS;
2671  std::unique_ptr<_pi_kernel> retKernel{nullptr};
2672 
2673  try {
2674  ScopedContext active(program->get_context());
2675 
2676  hipFunction_t hipFunc;
2677  retErr = PI_CHECK_ERROR(
2678  hipModuleGetFunction(&hipFunc, program->get(), kernel_name));
2679 
2680  std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset";
2681  hipFunction_t hipFuncWithOffsetParam;
2682  hipError_t offsetRes = hipModuleGetFunction(
2683  &hipFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str());
2684 
2685  // If there is no kernel with global offset parameter we mark it as missing
2686  if (offsetRes == hipErrorNotFound) {
2687  hipFuncWithOffsetParam = nullptr;
2688  } else {
2689  retErr = PI_CHECK_ERROR(offsetRes);
2690  }
2691 
2692  retKernel = std::unique_ptr<_pi_kernel>(
2693  new _pi_kernel{hipFunc, hipFuncWithOffsetParam, kernel_name, program,
2694  program->get_context()});
2695  } catch (pi_result err) {
2696  retErr = err;
2697  } catch (...) {
2698  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2699  }
2700 
2701  *kernel = retKernel.release();
2702  return retErr;
2703 }
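// Editorial sketch: hip_piKernelCreate above first resolves the plain kernel
// symbol and then probes for an optional "<name>_with_offset" variant that
// takes an implicit global-offset argument; hipErrorNotFound simply means the
// variant was not emitted. Hypothetical helper showing the naming convention.
[[maybe_unused]] static std::string withOffsetVariantName(const char *name) {
  return std::string(name) + "_with_offset";
}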
2704 
2706  size_t arg_size, const void *arg_value) {
2707 
2708  assert(kernel != nullptr);
2709  pi_result retErr = PI_SUCCESS;
2710  try {
2711  if (arg_value) {
2712  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2713  } else {
2714  kernel->set_kernel_local_arg(arg_index, arg_size);
2715  }
2716  } catch (pi_result err) {
2717  retErr = err;
2718  }
2719  return retErr;
2720 }
2721 
2723  const pi_mem *arg_value) {
2724 
2725  assert(kernel != nullptr);
2726  assert(arg_value != nullptr);
2727 
2728  pi_result retErr = PI_SUCCESS;
2729  try {
2730  pi_mem arg_mem = *arg_value;
2731 
2732  if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) {
2733  auto array = arg_mem->mem_.surface_mem_.get_array();
2734  hipArray_Format Format;
2735  size_t NumChannels;
2736  getArrayDesc(array, Format, NumChannels);
2737  if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 &&
2738  Format != HIP_AD_FORMAT_SIGNED_INT32 &&
2739  Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) {
2740  sycl::detail::pi::die(
2741  "PI HIP kernels only support images with channel types int32, "
2742  "uint32, float, and half.");
2743  }
2744  hipSurfaceObject_t hipSurf = arg_mem->mem_.surface_mem_.get_surface();
2745  kernel->set_kernel_arg(arg_index, sizeof(hipSurf), (void *)&hipSurf);
2746  } else
2747 
2748  {
2749  void *hipPtr = arg_mem->mem_.buffer_mem_.get_void();
2750  kernel->set_kernel_arg(arg_index, sizeof(void *), (void *)&hipPtr);
2751  }
2752  } catch (pi_result err) {
2753  retErr = err;
2754  }
2755  return retErr;
2756 }
2757 
2759  const pi_sampler *arg_value) {
2760 
2761  assert(kernel != nullptr);
2762  assert(arg_value != nullptr);
2763 
2764  pi_result retErr = PI_SUCCESS;
2765  try {
2766  pi_uint32 samplerProps = (*arg_value)->props_;
2767  kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps);
2768  } catch (pi_result err) {
2769  retErr = err;
2770  }
2771  return retErr;
2772 }
2773 
2775  pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim,
2776  const size_t *global_work_offset, const size_t *global_work_size,
2777  const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
2778  const pi_event *event_wait_list, pi_event *event) {
2779 
2780  // Preconditions
2781  assert(command_queue != nullptr);
2782  assert(command_queue->get_context() == kernel->get_context());
2783  assert(kernel != nullptr);
2784  assert(global_work_offset != nullptr);
2785  assert(work_dim > 0);
2786  assert(work_dim < 4);
2787 
2788  if (*global_work_size == 0) {
2789  return hip_piEnqueueEventsWaitWithBarrier(
2790  command_queue, num_events_in_wait_list, event_wait_list, event);
2791  }
2792 
2793  // Set the number of threads per block to the number of threads per warp
2794  // by default unless user has provided a better number
2795  size_t threadsPerBlock[3] = {32u, 1u, 1u};
2796  size_t maxWorkGroupSize = 0u;
2797  size_t maxThreadsPerBlock[3] = {};
2798  bool providedLocalWorkGroupSize = (local_work_size != nullptr);
2799 
2800  {
2801  pi_result retError = hip_piDeviceGetInfo(
2802  command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
2803  sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
2804  assert(retError == PI_SUCCESS);
2805  (void)retError;
2806 
2807  retError = hip_piDeviceGetInfo(
2808  command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
2809  sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
2810  assert(retError == PI_SUCCESS);
2811  // For AMD GPUs, maxWorkGroupSize = 1024 and
2812  // maxThreadsPerBlock = {1024, 1024, 1024}.
2813 
2814  if (providedLocalWorkGroupSize) {
2815  auto isValid = [&](int dim) {
2816  if (local_work_size[dim] > maxThreadsPerBlock[dim])
2817  return PI_ERROR_INVALID_WORK_ITEM_SIZE;
2818  // Check that each local work size divides the corresponding global work
2819  // size evenly, which also rules out local sizes that are zero or larger
2820  // than the global size.
2821  if (0u == local_work_size[dim])
2822  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2823  if (0u != (global_work_size[dim] % local_work_size[dim]))
2824  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2825  threadsPerBlock[dim] = local_work_size[dim];
2826  return PI_SUCCESS;
2827  };
2828 
2829  for (size_t dim = 0; dim < work_dim; dim++) {
2830  auto err = isValid(dim);
2831  if (err != PI_SUCCESS)
2832  return err;
2833  }
2834  } else {
2835  simpleGuessLocalWorkSize(threadsPerBlock, global_work_size,
2836  maxThreadsPerBlock, kernel);
2837  }
2838  }
2839 
2840  if (maxWorkGroupSize <
2841  size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
2842  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2843  }
2844 
2845  size_t blocksPerGrid[3] = {1u, 1u, 1u};
2846 
2847  for (size_t i = 0; i < work_dim; i++) {
2848  blocksPerGrid[i] =
2849  (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
2850  }
2851 
2852  pi_result retError = PI_SUCCESS;
2853  std::unique_ptr<_pi_event> retImplEv{nullptr};
2854 
2855  try {
2856  ScopedContext active(command_queue->get_context());
2857 
2858  pi_uint32 stream_token;
2859  _pi_stream_guard guard;
2860  hipStream_t hipStream = command_queue->get_next_compute_stream(
2861  num_events_in_wait_list, event_wait_list, guard, &stream_token);
2862  hipFunction_t hipFunc = kernel->get();
2863 
2864  retError = enqueueEventsWait(command_queue, hipStream,
2865  num_events_in_wait_list, event_wait_list);
2866 
2867  // Set the implicit global offset parameter if kernel has offset variant
2868  if (kernel->get_with_offset_parameter()) {
2869  std::uint32_t hip_implicit_offset[3] = {0, 0, 0};
2870  if (global_work_offset) {
2871  for (size_t i = 0; i < work_dim; i++) {
2872  hip_implicit_offset[i] =
2873  static_cast<std::uint32_t>(global_work_offset[i]);
2874  if (global_work_offset[i] != 0) {
2875  hipFunc = kernel->get_with_offset_parameter();
2876  }
2877  }
2878  }
2879  kernel->set_implicit_offset_arg(sizeof(hip_implicit_offset),
2880  hip_implicit_offset);
2881  }
2882 
2883  auto argIndices = kernel->get_arg_indices();
2884 
2885  if (event) {
2886  retImplEv = std::unique_ptr<_pi_event>(
2887  _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue,
2888  hipStream, stream_token));
2889  retImplEv->start();
2890  }
2891 
2892  retError = PI_CHECK_ERROR(hipModuleLaunchKernel(
2893  hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
2894  threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2],
2895  kernel->get_local_size(), hipStream, argIndices.data(), nullptr));
2896 
2897  kernel->clear_local_size();
2898 
2899  if (event) {
2900  retError = retImplEv->record();
2901  *event = retImplEv.release();
2902  }
2903  } catch (pi_result err) {
2904  retError = err;
2905  }
2906  return retError;
2907 }
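// Editorial sketch of the grid-size computation used in
// hip_piEnqueueKernelLaunch above: each dimension of the global range is
// divided by the block size, rounding up, so the launched grid covers the
// whole ND-range. Hypothetical helper.
[[maybe_unused]] static size_t blocksForDimension(size_t globalSize,
                                                  size_t blockSize) {
  return (globalSize + blockSize - 1) / blockSize; // ceil(global / block)
}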
2908 
2910 pi_result
2911 hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args,
2912  size_t cb_args, pi_uint32 num_mem_objects,
2913  const pi_mem *mem_list, const void **args_mem_loc,
2914  pi_uint32 num_events_in_wait_list,
2915  const pi_event *event_wait_list, pi_event *event) {
2916  (void)queue;
2917  (void)user_func;
2918  (void)args;
2919  (void)cb_args;
2920  (void)num_mem_objects;
2921  (void)mem_list;
2922  (void)args_mem_loc;
2923  (void)num_events_in_wait_list;
2924  (void)event_wait_list;
2925  (void)event;
2926 
2927  sycl::detail::pi::die("Not implemented in HIP backend");
2928  return {};
2929 }
2930 
2932 
2934  const pi_image_format *image_format,
2935  const pi_image_desc *image_desc, void *host_ptr,
2936  pi_mem *ret_mem) {
2937 
2938  // Need input memory object
2939  assert(ret_mem != nullptr);
2940  const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
2941  ((flags & PI_MEM_FLAGS_HOST_PTR_USE));
2942  pi_result retErr = PI_SUCCESS;
2943 
2944  // We only support RGBA channel order
2945  // TODO: check SYCL CTS and spec. May also have to support BGRA
2946  if (image_format->image_channel_order !=
2947  PI_IMAGE_CHANNEL_ORDER_RGBA) {
2948  sycl::detail::pi::die(
2949  "hip_piMemImageCreate only supports RGBA channel order");
2950  }
2951 
2952  // We have to use hipArray3DCreate, which has some caveats. The height and
2953  // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc
2954  // gives a minimum value of 1, so we need to convert the answer.
2955  HIP_ARRAY3D_DESCRIPTOR array_desc;
2956  array_desc.NumChannels = 4; // Only support 4 channel image
2957  array_desc.Flags = 0; // No flags required
2958  array_desc.Width = image_desc->image_width;
2959  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
2960  array_desc.Height = 0;
2961  array_desc.Depth = 0;
2962  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
2963  array_desc.Height = image_desc->image_height;
2964  array_desc.Depth = 0;
2965  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
2966  array_desc.Height = image_desc->image_height;
2967  array_desc.Depth = image_desc->image_depth;
2968  }
2969 
2970  // We need to get this now in bytes for calculating the total image size later
2971  size_t pixel_type_size_bytes;
2972 
2973  switch (image_format->image_channel_data_type) {
2976  array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8;
2977  pixel_type_size_bytes = 1;
2978  break;
2980  array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8;
2981  pixel_type_size_bytes = 1;
2982  break;
2985  array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16;
2986  pixel_type_size_bytes = 2;
2987  break;
2989  array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16;
2990  pixel_type_size_bytes = 2;
2991  break;
2993  array_desc.Format = HIP_AD_FORMAT_HALF;
2994  pixel_type_size_bytes = 2;
2995  break;
2997  array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32;
2998  pixel_type_size_bytes = 4;
2999  break;
3001  array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32;
3002  pixel_type_size_bytes = 4;
3003  break;
3005  array_desc.Format = HIP_AD_FORMAT_FLOAT;
3006  pixel_type_size_bytes = 4;
3007  break;
3008  default:
3009  sycl::detail::pi::die(
3010  "hip_piMemImageCreate given unsupported image_channel_data_type");
3011  }
3012 
3013  // When a dimension isn't used image_desc has the size set to 1
3014  size_t pixel_size_bytes =
3015  pixel_type_size_bytes * 4; // 4 is the only number of channels we support
3016  size_t image_size_bytes = pixel_size_bytes * image_desc->image_width *
3017  image_desc->image_height * image_desc->image_depth;
3018 
3019  ScopedContext active(context);
3020  hipArray *image_array;
3021  retErr = PI_CHECK_ERROR(hipArray3DCreate(
3022  reinterpret_cast<hipCUarray *>(&image_array), &array_desc));
3023 
3024  try {
3025  if (performInitialCopy) {
3026  // We have to use a different copy function for each image dimensionality
3027  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3028  retErr = PI_CHECK_ERROR(
3029  hipMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes));
3030  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3031  hip_Memcpy2D cpy_desc;
3032  memset(&cpy_desc, 0, sizeof(cpy_desc));
3033  cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost;
3034  cpy_desc.srcHost = host_ptr;
3035  cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray;
3036  cpy_desc.dstArray = reinterpret_cast<hipCUarray>(image_array);
3037  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3038  cpy_desc.Height = image_desc->image_height;
3039  retErr = PI_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc));
3040  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3041  HIP_MEMCPY3D cpy_desc;
3042  memset(&cpy_desc, 0, sizeof(cpy_desc));
3043  cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost;
3044  cpy_desc.srcHost = host_ptr;
3045  cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray;
3046  cpy_desc.dstArray = reinterpret_cast<hipCUarray>(image_array);
3047  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3048  cpy_desc.Height = image_desc->image_height;
3049  cpy_desc.Depth = image_desc->image_depth;
3050  retErr = PI_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc));
3051  }
3052  }
3053 
3054  // HIP_RESOURCE_DESC is a union of different structs, shown here
3055  // We need to fill it as described here to use it for a surface or texture
3056  // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and
3057  // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array
3058  // handle.
3059  // HIP_RESOURCE_DESC::flags must be set to zero
3060 
3061  hipResourceDesc image_res_desc;
3062  image_res_desc.res.array.array = image_array;
3063  image_res_desc.resType = hipResourceTypeArray;
3064 
3065  hipSurfaceObject_t surface;
3066  retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc));
3067 
3068  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
3069  context, image_array, surface, image_desc->image_type, host_ptr});
3070 
3071  if (piMemObj == nullptr) {
3072  return PI_ERROR_OUT_OF_HOST_MEMORY;
3073  }
3074 
3075  *ret_mem = piMemObj.release();
3076  } catch (pi_result err) {
3077  PI_CHECK_ERROR(hipFreeArray(image_array));
3078  return err;
3079  } catch (...) {
3080  PI_CHECK_ERROR(hipFreeArray(image_array));
3081  return PI_ERROR_UNKNOWN;
3082  }
3083  return retErr;
3084 }
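// Editorial sketch of the allocation-size arithmetic in hip_piMemImageCreate
// above: only 4-channel images are supported, so the total size is
// 4 * bytes-per-channel * width * height * depth, with unused dimensions
// reported as 1 by the image descriptor. Hypothetical helper.
[[maybe_unused]] static size_t imageSizeBytes(size_t bytesPerChannel,
                                              size_t width, size_t height,
                                              size_t depth) {
  const size_t pixelSizeBytes = bytesPerChannel * 4;
  return pixelSizeBytes * width * height * depth;
}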
3085 
3088  size_t param_value_size, void *param_value,
3089  size_t *param_value_size_ret) {
3090  (void)image;
3091  (void)param_name;
3092  (void)param_value_size;
3093  (void)param_value;
3094  (void)param_value_size_ret;
3095 
3096  sycl::detail::pi::die("hip_piMemImageGetInfo not implemented");
3097  return {};
3098 }
3099 
3101  assert(mem != nullptr);
3102  assert(mem->get_reference_count() > 0);
3104  return PI_SUCCESS;
3105 }
3106 
3111  const char **strings,
3112  const size_t *lengths,
3113  pi_program *program) {
3114  (void)context;
3115  (void)count;
3116  (void)strings;
3117  (void)lengths;
3118  (void)program;
3119 
3120  sycl::detail::pi::hipPrint("hip_piclProgramCreateWithSource not implemented");
3121  return PI_ERROR_INVALID_OPERATION;
3122 }
3123 
3129  const pi_device *device_list, const char *options,
3130  void (*pfn_notify)(pi_program program,
3131  void *user_data),
3132  void *user_data) {
3133 
3134  assert(program != nullptr);
3135  assert(num_devices == 1 || num_devices == 0);
3136  assert(device_list != nullptr || num_devices == 0);
3137  assert(pfn_notify == nullptr);
3138  assert(user_data == nullptr);
3139  pi_result retError = PI_SUCCESS;
3140 
3141  try {
3142  ScopedContext active(program->get_context());
3143 
3144  program->build_program(options);
3145 
3146  } catch (pi_result err) {
3147  retError = err;
3148  }
3149  return retError;
3150 }
3151 
3153 pi_result hip_piProgramCreate(pi_context context, const void *il, size_t length,
3154  pi_program *res_program) {
3155  (void)context;
3156  (void)il;
3157  (void)length;
3158  (void)res_program;
3159 
3160  sycl::detail::pi::die("hip_piProgramCreate not implemented");
3161  return {};
3162 }
3163 
3171  pi_context context, pi_uint32 num_devices, const pi_device *device_list,
3172  const size_t *lengths, const unsigned char **binaries,
3173  size_t num_metadata_entries, const pi_device_binary_property *metadata,
3174  pi_int32 *binary_status, pi_program *program) {
3175  (void)num_metadata_entries;
3176  (void)metadata;
3177  (void)binary_status;
3178 
3179  assert(context != nullptr);
3180  assert(binaries != nullptr);
3181  assert(program != nullptr);
3182  assert(device_list != nullptr);
3183  assert(num_devices == 1 && "HIP contexts are for a single device");
3184  assert((context->get_device()->get() == device_list[0]->get()) &&
3185  "Mismatch between devices context and passed context when creating "
3186  "program from binary");
3187 
3188  pi_result retError = PI_SUCCESS;
3189 
3190  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3191 
3192  // TODO: Set metadata here and use reqd_work_group_size information.
3193  // See cuda_piProgramCreateWithBinary
3194 
3195  const bool has_length = (lengths != nullptr);
3196  size_t length = has_length
3197  ? lengths[0]
3198  : strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
3199 
3200  assert(length != 0);
3201 
3202  retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
3203 
3204  *program = retProgram.release();
3205 
3206  return retError;
3207 }
3208 
3210  size_t param_value_size, void *param_value,
3211  size_t *param_value_size_ret) {
3212  assert(program != nullptr);
3213 
3214  switch (param_name) {
3216  return getInfo(param_value_size, param_value, param_value_size_ret,
3217  program->get_reference_count());
3219  return getInfo(param_value_size, param_value, param_value_size_ret,
3220  program->context_);
3222  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3224  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3225  &program->context_->deviceId_);
3227  return getInfo(param_value_size, param_value, param_value_size_ret,
3228  program->binary_);
3230  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3231  &program->binarySizeInBytes_);
3233  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3234  &program->binary_);
3236  return getInfo(param_value_size, param_value, param_value_size_ret,
3237  getKernelNames(program).c_str());
3238  }
3239  default:
3241  }
3242  sycl::detail::pi::die("Program info request not implemented");
3243  return {};
3244 }
3245 
3247  const pi_device *device_list, const char *options,
3248  pi_uint32 num_input_programs,
3249  const pi_program *input_programs,
3250  void (*pfn_notify)(pi_program program,
3251  void *user_data),
3252  void *user_data, pi_program *ret_program) {
3253  (void)context;
3254  (void)num_devices;
3255  (void)device_list;
3256  (void)options;
3257  (void)num_input_programs;
3258  (void)input_programs;
3259  (void)pfn_notify;
3260  (void)user_data;
3261  (void)ret_program;
3262  sycl::detail::pi::die(
3263  "hip_piProgramLink: linking not supported with hip backend");
3264  return {};
3265 }
3266 
3272  pi_program program, pi_uint32 num_devices, const pi_device *device_list,
3273  const char *options, pi_uint32 num_input_headers,
3274  const pi_program *input_headers, const char **header_include_names,
3275  void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
3276  (void)input_headers;
3277  (void)header_include_names;
3278 
3279  assert(program != nullptr);
3280  assert(num_devices == 1 || num_devices == 0);
3281  assert(device_list != nullptr || num_devices == 0);
3282  assert(pfn_notify == nullptr);
3283  assert(user_data == nullptr);
3284  assert(num_input_headers == 0);
3285  pi_result retError = PI_SUCCESS;
3286 
3287  try {
3288  ScopedContext active(program->get_context());
3289 
3290  program->build_program(options);
3291 
3292  } catch (pi_result err) {
3293  retError = err;
3294  }
3295  return retError;
3296 }
3297 
3299  pi_program_build_info param_name,
3300  size_t param_value_size, void *param_value,
3301  size_t *param_value_size_ret) {
3302  (void)device;
3303 
3304  assert(program != nullptr);
3305 
3306  switch (param_name) {
3308  return getInfo(param_value_size, param_value, param_value_size_ret,
3309  program->buildStatus_);
3310  }
3312  return getInfo(param_value_size, param_value, param_value_size_ret,
3313  program->buildOptions_.c_str());
3315  return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value,
3316  param_value_size_ret, program->infoLog_);
3317  default:
3319  }
3320  sycl::detail::pi::die("Program Build info request not implemented");
3321  return {};
3322 }
3323 
3325  assert(program != nullptr);
3326  assert(program->get_reference_count() > 0);
3327  program->increment_reference_count();
3328  return PI_SUCCESS;
3329 }
3330 
3335  assert(program != nullptr);
3336 
3337  // double delete or someone is messing with the ref count.
3338  // either way, cannot safely proceed.
3339  assert(program->get_reference_count() != 0 &&
3340  "Reference count overflow detected in hip_piProgramRelease.");
3341 
3342  // decrement ref count. If it is 0, delete the program.
3343  if (program->decrement_reference_count() == 0) {
3344 
3345  std::unique_ptr<_pi_program> program_ptr{program};
3346 
3347  pi_result result = PI_ERROR_INVALID_PROGRAM;
3348 
3349  try {
3350  ScopedContext active(program->get_context());
3351  auto hipModule = program->get();
3352  result = PI_CHECK_ERROR(hipModuleUnload(hipModule));
3353  } catch (...) {
3354  result = PI_ERROR_OUT_OF_RESOURCES;
3355  }
3356 
3357  return result;
3358  }
3359 
3360  return PI_SUCCESS;
3361 }
3362 
3370  pi_native_handle *nativeHandle) {
3371  *nativeHandle = reinterpret_cast<pi_native_handle>(program->get());
3372  return PI_SUCCESS;
3373 }
3374 
3387  pi_context context,
3388  bool ownNativeHandle,
3389  pi_program *program) {
3390  (void)nativeHandle;
3391  (void)context;
3392  (void)ownNativeHandle;
3393  (void)program;
3394 
3395  sycl::detail::pi::die(
3396  "Creation of PI program from native handle not implemented");
3397  return {};
3398 }
3399 
3401  size_t param_value_size, void *param_value,
3402  size_t *param_value_size_ret) {
3403 
3404  if (kernel != nullptr) {
3405 
3406  switch (param_name) {
3408  return getInfo(param_value_size, param_value, param_value_size_ret,
3409  kernel->get_name());
3411  return getInfo(param_value_size, param_value, param_value_size_ret,
3412  kernel->get_num_args());
3414  return getInfo(param_value_size, param_value, param_value_size_ret,
3415  kernel->get_reference_count());
3416  case PI_KERNEL_INFO_CONTEXT: {
3417  return getInfo(param_value_size, param_value, param_value_size_ret,
3418  kernel->get_context());
3419  }
3420  case PI_KERNEL_INFO_PROGRAM: {
3421  return getInfo(param_value_size, param_value, param_value_size_ret,
3422  kernel->get_program());
3423  }
3425  return getInfo(param_value_size, param_value, param_value_size_ret, "");
3426  }
3427  default: {
3429  }
3430  }
3431  }
3432 
3433  return PI_ERROR_INVALID_KERNEL;
3434 }
3435 
3437  pi_kernel_group_info param_name,
3438  size_t param_value_size, void *param_value,
3439  size_t *param_value_size_ret) {
3440 
3441  // Here we query the kernel's HIP thread-block (work-group) properties.
3442 
3443  if (kernel != nullptr) {
3444 
3445  switch (param_name) {
3447  int max_threads = 0;
3448  sycl::detail::pi::assertion(
3449  hipFuncGetAttribute(&max_threads,
3450  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3451  kernel->get()) == hipSuccess);
3452  return getInfo(param_value_size, param_value, param_value_size_ret,
3453  size_t(max_threads));
3454  }
3456  // Returns the work-group size specified in the kernel source or IL.
3457  // If the work-group size is not specified in the kernel source or IL,
3458  // (0, 0, 0) is returned.
3459  // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
3460 
3461  // TODO: can we extract the work group size from the PTX?
3462  size_t group_size[3] = {0, 0, 0};
3463  return getInfoArray(3, param_value_size, param_value,
3464  param_value_size_ret, group_size);
3465  }
3467  // OpenCL LOCAL == HIP SHARED
3468  int bytes = 0;
3469  sycl::detail::pi::assertion(
3470  hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
3471  kernel->get()) == hipSuccess);
3472  return getInfo(param_value_size, param_value, param_value_size_ret,
3473  pi_uint64(bytes));
3474  }
3476  // Work groups should be multiples of the warp size
3477  int warpSize = 0;
3478  sycl::detail::pi::assertion(
3479  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
3480  device->get()) == hipSuccess);
3481  return getInfo(param_value_size, param_value, param_value_size_ret,
3482  static_cast<size_t>(warpSize));
3483  }
3485  // OpenCL PRIVATE == HIP LOCAL
3486  int bytes = 0;
3487  sycl::detail::pi::assertion(
3488  hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
3489  kernel->get()) == hipSuccess);
3490  return getInfo(param_value_size, param_value, param_value_size_ret,
3491  pi_uint64(bytes));
3492  }
3494  sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in "
3495  "piKernelGetGroupInfo not implemented\n");
3496  return {};
3497  }
3498 
3499  default:
3501  }
3502  }
3503 
3504  return PI_ERROR_INVALID_KERNEL;
3505 }
3506 
3508  pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
3509  size_t input_value_size, const void *input_value, size_t param_value_size,
3510  void *param_value, size_t *param_value_size_ret) {
3511  (void)input_value_size;
3512  (void)input_value;
3513 
3514  if (kernel != nullptr) {
3515  switch (param_name) {
3517  // Sub-group size is equivalent to warp size
3518  int warpSize = 0;
3519  sycl::detail::pi::assertion(
3520  hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
3521  device->get()) == hipSuccess);
3522  return getInfo(param_value_size, param_value, param_value_size_ret,
3523  static_cast<uint32_t>(warpSize));
3524  }
3526  // Number of sub-groups = max block size / warp size + possible remainder
3527  int max_threads = 0;
3528  sycl::detail::pi::assertion(
3529  hipFuncGetAttribute(&max_threads,
3530  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3531  kernel->get()) == hipSuccess);
3532  int warpSize = 0;
3534  0, nullptr, sizeof(uint32_t), &warpSize,
3535  nullptr);
3536  int maxWarps = (max_threads + warpSize - 1) / warpSize;
3537  return getInfo(param_value_size, param_value, param_value_size_ret,
3538  static_cast<uint32_t>(maxWarps));
3539  }
3541  // Return value of 0 => not specified
3542  // TODO: Revisit if PTX is generated for compile-time work-group sizes
3543  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3544  }
3546  // Return value of 0 => unspecified or "auto" sub-group size
3547  // Correct for now, since warp size may be read from special register
3548  // TODO: Return warp size once default is primary sub-group size
3549  // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
3550  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3551  }
3552  default:
3554  }
3555  }
3556  return PI_ERROR_INVALID_KERNEL;
3557 }
3558 
3560  assert(kernel != nullptr);
3561  assert(kernel->get_reference_count() > 0u);
3562 
3563  kernel->increment_reference_count();
3564  return PI_SUCCESS;
3565 }
3566 
3568  assert(kernel != nullptr);
3569 
3570  // double delete or someone is messing with the ref count.
3571  // either way, cannot safely proceed.
3572  assert(kernel->get_reference_count() != 0 &&
3573  "Reference count overflow detected in hip_piKernelRelease.");
3574 
3575  // decrement ref count. If it is 0, delete the program.
3576  if (kernel->decrement_reference_count() == 0) {
3577  // no internal hip resources to clean up. Just delete it.
3578  delete kernel;
3579  return PI_SUCCESS;
3580  }
3581 
3582  return PI_SUCCESS;
3583 }
3584 
3585 // A NOP for the HIP backend
3587  pi_kernel_exec_info param_name,
3588  size_t param_value_size,
3589  const void *param_value) {
3590  (void)kernel;
3591  (void)param_name;
3592  (void)param_value_size;
3593  (void)param_value;
3594 
3595  return PI_SUCCESS;
3596 }
3597 
3599  size_t, const void *) {
3600  // This entry point is only used for native specialization constants (SPIR-V),
3601  // and the HIP plugin is AOT only so this entry point is not supported.
3602  sycl::detail::pi::die("Native specialization constants are not supported");
3603  return {};
3604 }
3605 
3607  size_t arg_size, const void *arg_value) {
3608  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3609  return PI_SUCCESS;
3610 }
3611 
3612 //
3613 // Events
3614 //
3616  (void)context;
3617  (void)event;
3618 
3619  sycl::detail::pi::die("PI Event Create not implemented in HIP backend");
3620 }
3621 
3623  size_t param_value_size, void *param_value,
3624  size_t *param_value_size_ret) {
3625  assert(event != nullptr);
3626 
3627  switch (param_name) {
3629  return getInfo(param_value_size, param_value, param_value_size_ret,
3630  event->get_queue());
3632  return getInfo(param_value_size, param_value, param_value_size_ret,
3633  event->get_command_type());
3635  return getInfo(param_value_size, param_value, param_value_size_ret,
3636  event->get_reference_count());
3638  return getInfo(param_value_size, param_value, param_value_size_ret,
3639  static_cast<pi_event_status>(event->get_execution_status()));
3640  }
3641  case PI_EVENT_INFO_CONTEXT:
3642  return getInfo(param_value_size, param_value, param_value_size_ret,
3643  event->get_context());
3644  default:
3646  }
3647 
3648  return PI_ERROR_INVALID_EVENT;
3649 }
3650 
3654  pi_profiling_info param_name,
3655  size_t param_value_size,
3656  void *param_value,
3657  size_t *param_value_size_ret) {
3658 
3659  assert(event != nullptr);
3660 
3661  pi_queue queue = event->get_queue();
3662  if (queue == nullptr || !(queue->properties_ & PI_QUEUE_PROFILING_ENABLE)) {
3663  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3664  }
3665 
3666  switch (param_name) {
3669  return getInfo<pi_uint64>(param_value_size, param_value,
3670  param_value_size_ret, event->get_queued_time());
3672  return getInfo<pi_uint64>(param_value_size, param_value,
3673  param_value_size_ret, event->get_start_time());
3675  return getInfo<pi_uint64>(param_value_size, param_value,
3676  param_value_size_ret, event->get_end_time());
3677  default:
3679  }
3680  sycl::detail::pi::die("Event Profiling info request not implemented");
3681  return {};
3682 }
3683 
3685  pi_int32 command_exec_callback_type,
3686  pfn_notify notify, void *user_data) {
3687  (void)event;
3688  (void)command_exec_callback_type;
3689  (void)notify;
3690  (void)user_data;
3691 
3692  sycl::detail::pi::die("Event Callback not implemented in HIP backend");
3693  return PI_SUCCESS;
3694 }
3695 
3697  (void)event;
3698  (void)execution_status;
3699 
3700  sycl::detail::pi::die("Event Set Status not implemented in HIP backend");
3701  return PI_ERROR_INVALID_VALUE;
3702 }
3703 
3705  assert(event != nullptr);
3706 
3707  const auto refCount = event->increment_reference_count();
3708 
3709  sycl::detail::pi::assertion(
3710  refCount != 0, "Reference count overflow detected in hip_piEventRetain.");
3711 
3712  return PI_SUCCESS;
3713 }
3714 
3716  assert(event != nullptr);
3717 
3718  // double delete or someone is messing with the ref count.
3719  // either way, cannot safely proceed.
3720  sycl::detail::pi::assertion(
3721  event->get_reference_count() != 0,
3722  "Reference count overflow detected in hip_piEventRelease.");
3723 
3724  // decrement ref count. If it is 0, delete the event.
3725  if (event->decrement_reference_count() == 0) {
3726  std::unique_ptr<_pi_event> event_ptr{event};
3727  pi_result result = PI_ERROR_INVALID_EVENT;
3728  try {
3729  ScopedContext active(event->get_context());
3730  result = event->release();
3731  } catch (...) {
3732  result = PI_ERROR_OUT_OF_RESOURCES;
3733  }
3734  return result;
3735  }
3736 
3737  return PI_SUCCESS;
3738 }
3739 
3748  pi_uint32 num_events_in_wait_list,
3749  const pi_event *event_wait_list,
3750  pi_event *event) {
3752  command_queue, num_events_in_wait_list, event_wait_list, event);
3753 }
3754 
3761  pi_uint32 num_events_in_wait_list,
3762  const pi_event *event_wait_list,
3763  pi_event *event) {
3764  if (!command_queue) {
3765  return PI_ERROR_INVALID_QUEUE;
3766  }
3767 
3768  pi_result result;
3769 
3770  try {
3771  ScopedContext active(command_queue->get_context());
3772  pi_uint32 stream_token;
3773  _pi_stream_guard guard;
3774  hipStream_t hipStream = command_queue->get_next_compute_stream(
3775  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3776  {
3777  std::lock_guard<std::mutex> guard(command_queue->barrier_mutex_);
3778  if (command_queue->barrier_event_ == nullptr) {
3779  PI_CHECK_ERROR(hipEventCreate(&command_queue->barrier_event_));
3780  }
3781  if (num_events_in_wait_list == 0) { // wait on all work
3782  if (command_queue->barrier_tmp_event_ == nullptr) {
3783  PI_CHECK_ERROR(hipEventCreate(&command_queue->barrier_tmp_event_));
3784  }
3785  command_queue->sync_streams(
3786  [hipStream,
3787  tmp_event = command_queue->barrier_tmp_event_](hipStream_t s) {
3788  if (hipStream != s) {
3789  PI_CHECK_ERROR(hipEventRecord(tmp_event, s));
3790  PI_CHECK_ERROR(hipStreamWaitEvent(hipStream, tmp_event, 0));
3791  }
3792  });
3793  } else { // wait just on given events
3794  forLatestEvents(event_wait_list, num_events_in_wait_list,
3795  [hipStream](pi_event event) -> pi_result {
3796  if (event->get_queue()->has_been_synchronized(
3797  event->get_compute_stream_token())) {
3798  return PI_SUCCESS;
3799  } else {
3800  return PI_CHECK_ERROR(
3801  hipStreamWaitEvent(hipStream, event->get(), 0));
3802  }
3803  });
3804  }
3805 
3806  result = PI_CHECK_ERROR(
3807  hipEventRecord(command_queue->barrier_event_, hipStream));
3808  for (unsigned int i = 0;
3809  i < command_queue->compute_applied_barrier_.size(); i++) {
3810  command_queue->compute_applied_barrier_[i] = false;
3811  }
3812  for (unsigned int i = 0;
3813  i < command_queue->transfer_applied_barrier_.size(); i++) {
3814  command_queue->transfer_applied_barrier_[i] = false;
3815  }
3816  }
3817  if (result != PI_SUCCESS) {
3818  return result;
3819  }
3820 
3821  if (event) {
3822  *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue,
3823  hipStream, stream_token);
3824  (*event)->start();
3825  (*event)->record();
3826  }
3827 
3828  return PI_SUCCESS;
3829  } catch (pi_result err) {
3830  return err;
3831  } catch (...) {
3832  return PI_ERROR_UNKNOWN;
3833  }
3834 }
3835 
3843  pi_native_handle *nativeHandle) {
3844  *nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
3845  return PI_SUCCESS;
3846 }
3847 
3857  pi_context context,
3858  bool ownNativeHandle,
3859  pi_event *event) {
3860  (void)nativeHandle;
3861  (void)context;
3862  (void)ownNativeHandle;
3863  (void)event;
3864 
3865  sycl::detail::pi::die(
3866  "Creation of PI event from native handle not implemented");
3867  return {};
3868 }
3869 
3880  const pi_sampler_properties *sampler_properties,
3881  pi_sampler *result_sampler) {
3882  std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)};
3883 
3884  bool propSeen[3] = {false, false, false};
3885  for (size_t i = 0; sampler_properties[i] != 0; i += 2) {
3886  switch (sampler_properties[i]) {
3888  if (propSeen[0]) {
3889  return PI_ERROR_INVALID_VALUE;
3890  }
3891  propSeen[0] = true;
3892  retImplSampl->props_ |= sampler_properties[i + 1];
3893  break;
3895  if (propSeen[1]) {
3896  return PI_ERROR_INVALID_VALUE;
3897  }
3898  propSeen[1] = true;
3899  retImplSampl->props_ |=
3900  (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
3901  break;
3903  if (propSeen[2]) {
3904  return PI_ERROR_INVALID_VALUE;
3905  }
3906  propSeen[2] = true;
3907  retImplSampl->props_ |=
3908  (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
3909  break;
3910  default:
3911  return PI_ERROR_INVALID_VALUE;
3912  }
3913  }
3914 
3915  if (!propSeen[0]) {
3916  retImplSampl->props_ |= PI_TRUE;
3917  }
3918  // Default the addressing mode if none was specified
3919  if (!propSeen[2]) {
3920  retImplSampl->props_ |=
3922  << 2;
3923  }
3924 
3925  *result_sampler = retImplSampl.release();
3926  return PI_SUCCESS;
3927 }
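// Editorial sketch of the bit layout used by props_ in hip_piSamplerCreate
// above and decoded in hip_piSamplerGetInfo below: bit 0 holds the
// normalized-coordinates flag, bit 1 the filter mode (relative to
// PI_SAMPLER_FILTER_MODE_NEAREST), and the remaining bits the addressing mode
// (relative to PI_SAMPLER_ADDRESSING_MODE_NONE). Hypothetical helper.
[[maybe_unused]] static pi_uint32
packSamplerProps(bool normalizedCoords, pi_sampler_filter_mode filter,
                 pi_sampler_addressing_mode addressing) {
  pi_uint32 props = normalizedCoords ? 1u : 0u;
  props |= (filter - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
  props |= (addressing - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
  return props;
}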
3928 
3939  size_t param_value_size, void *param_value,
3940  size_t *param_value_size_ret) {
3941  assert(sampler != nullptr);
3942 
3943   switch (param_name) {
3944   case PI_SAMPLER_INFO_REFERENCE_COUNT:
3945     return getInfo(param_value_size, param_value, param_value_size_ret,
3946                    sampler->get_reference_count());
3947   case PI_SAMPLER_INFO_CONTEXT:
3948     return getInfo(param_value_size, param_value, param_value_size_ret,
3949                    sampler->context_);
3950   case PI_SAMPLER_INFO_NORMALIZED_COORDS: {
3951     pi_bool norm_coords_prop = static_cast<pi_bool>(sampler->props_ & 0x1);
3952     return getInfo(param_value_size, param_value, param_value_size_ret,
3953                    norm_coords_prop);
3954   }
3955   case PI_SAMPLER_INFO_FILTER_MODE: {
3956     pi_sampler_filter_mode filter_prop = static_cast<pi_sampler_filter_mode>(
3957         ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST);
3958     return getInfo(param_value_size, param_value, param_value_size_ret,
3959                    filter_prop);
3960   }
3961   case PI_SAMPLER_INFO_ADDRESSING_MODE: {
3962     pi_sampler_addressing_mode addressing_prop =
3963         static_cast<pi_sampler_addressing_mode>(
3964             (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE);
3965     return getInfo(param_value_size, param_value, param_value_size_ret,
3966                    addressing_prop);
3967   }
3968   default:
3969     __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3970   }
3971  return {};
3972 }
3973 
3979 pi_result hip_piSamplerRetain(pi_sampler sampler) {
3980   assert(sampler != nullptr);
3981  sampler->increment_reference_count();
3982  return PI_SUCCESS;
3983 }
3984 
3991 pi_result hip_piSamplerRelease(pi_sampler sampler) {
3992   assert(sampler != nullptr);
3993 
3994  // double delete or someone is messing with the ref count.
3995  // either way, cannot safely proceed.
3996   sycl::detail::pi::assertion(
3997       sampler->get_reference_count() != 0,
3998       "Reference count overflow detected in hip_piSamplerRelease.");
3999 
4000  // decrement ref count. If it is 0, delete the sampler.
4001  if (sampler->decrement_reference_count() == 0) {
4002  delete sampler;
4003  }
4004 
4005  return PI_SUCCESS;
4006 }
4007 
4013 static pi_result commonEnqueueMemBufferCopyRect(
4014     hipStream_t hip_stream, pi_buff_rect_region region, const void *src_ptr,
4015  const hipMemoryType src_type, pi_buff_rect_offset src_offset,
4016  size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
4017  const hipMemoryType dst_type, pi_buff_rect_offset dst_offset,
4018  size_t dst_row_pitch, size_t dst_slice_pitch) {
4019 
4020  assert(region != nullptr);
4021  assert(src_offset != nullptr);
4022  assert(dst_offset != nullptr);
4023 
4024  assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost);
4025  assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost);
4026 
4027  src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch;
4028  src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch)
4029  : src_slice_pitch;
4030  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch;
4031  dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch)
4032  : dst_slice_pitch;
4033 
4034  HIP_MEMCPY3D params;
4035 
4036  params.WidthInBytes = region->width_bytes;
4037  params.Height = region->height_scalar;
4038  params.Depth = region->depth_scalar;
4039 
4040  params.srcMemoryType = src_type;
4041  params.srcDevice = src_type == hipMemoryTypeDevice
4042  ? *static_cast<const hipDeviceptr_t *>(src_ptr)
4043  : 0;
4044  params.srcHost = src_type == hipMemoryTypeHost ? src_ptr : nullptr;
4045  params.srcXInBytes = src_offset->x_bytes;
4046  params.srcY = src_offset->y_scalar;
4047  params.srcZ = src_offset->z_scalar;
4048  params.srcPitch = src_row_pitch;
4049  params.srcHeight = src_slice_pitch / src_row_pitch;
4050 
4051  params.dstMemoryType = dst_type;
4052  params.dstDevice = dst_type == hipMemoryTypeDevice
4053  ? *reinterpret_cast<hipDeviceptr_t *>(dst_ptr)
4054  : 0;
4055  params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr;
4056  params.dstXInBytes = dst_offset->x_bytes;
4057  params.dstY = dst_offset->y_scalar;
4058  params.dstZ = dst_offset->z_scalar;
4059  params.dstPitch = dst_row_pitch;
4060  params.dstHeight = dst_slice_pitch / dst_row_pitch;
4061 
4062  return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&params, hip_stream));
4065 }
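// Worked example (illustrative, not part of the plugin source): the pitch
// defaulting above means a caller that passes 0 for the pitches gets a tightly
// packed layout. For a 64 x 16 x 1 region of 4-byte elements:
//   width_bytes = 64 * 4 = 256
//   row_pitch   = 256              (defaulted to width_bytes)
//   slice_pitch = 16 * 256 = 4096  (defaulted to height * row_pitch)
// so the HIP_MEMCPY3D descriptor ends up with srcPitch/dstPitch = 256 and
// srcHeight/dstHeight = slice_pitch / row_pitch = 16.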
4066 
4067 pi_result hip_piEnqueueMemBufferReadRect(
4068     pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
4069  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4070  pi_buff_rect_region region, size_t buffer_row_pitch,
4071  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4072  void *ptr, pi_uint32 num_events_in_wait_list,
4073  const pi_event *event_wait_list, pi_event *event) {
4074 
4075  assert(buffer != nullptr);
4076  assert(command_queue != nullptr);
4077 
4078  pi_result retErr = PI_SUCCESS;
4079  void *devPtr = buffer->mem_.buffer_mem_.get_void();
4080  std::unique_ptr<_pi_event> retImplEv{nullptr};
4081 
4082  try {
4083  ScopedContext active(command_queue->get_context());
4084  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4085 
4086  retErr = enqueueEventsWait(command_queue, hipStream,
4087  num_events_in_wait_list, event_wait_list);
4088 
4089  if (event) {
4090  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4091  PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, hipStream));
4092  retImplEv->start();
4093  }
4094 
4095     retErr = commonEnqueueMemBufferCopyRect(
4096         hipStream, region, &devPtr, hipMemoryTypeDevice, buffer_offset,
4097  buffer_row_pitch, buffer_slice_pitch, ptr, hipMemoryTypeHost,
4098  host_offset, host_row_pitch, host_slice_pitch);
4099 
4100  if (event) {
4101  retErr = retImplEv->record();
4102  }
4103 
4104  if (blocking_read) {
4105  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4106  }
4107 
4108  if (event) {
4109  *event = retImplEv.release();
4110  }
4111 
4112  } catch (pi_result err) {
4113  retErr = err;
4114  }
4115  return retErr;
4116 }
4117 
4118 pi_result hip_piEnqueueMemBufferWriteRect(
4119     pi_queue command_queue, pi_mem buffer, pi_bool blocking_write,
4120  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4121  pi_buff_rect_region region, size_t buffer_row_pitch,
4122  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4123  const void *ptr, pi_uint32 num_events_in_wait_list,
4124  const pi_event *event_wait_list, pi_event *event) {
4125 
4126  assert(buffer != nullptr);
4127  assert(command_queue != nullptr);
4128 
4129  pi_result retErr = PI_SUCCESS;
4130  void *devPtr = buffer->mem_.buffer_mem_.get_void();
4131  std::unique_ptr<_pi_event> retImplEv{nullptr};
4132 
4133  try {
4134  ScopedContext active(command_queue->get_context());
4135  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4136  retErr = enqueueEventsWait(command_queue, hipStream,
4137  num_events_in_wait_list, event_wait_list);
4138 
4139  if (event) {
4140  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4141  PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, hipStream));
4142  retImplEv->start();
4143  }
4144 
4145     retErr = commonEnqueueMemBufferCopyRect(
4146         hipStream, region, ptr, hipMemoryTypeHost, host_offset, host_row_pitch,
4147  host_slice_pitch, &devPtr, hipMemoryTypeDevice, buffer_offset,
4148  buffer_row_pitch, buffer_slice_pitch);
4149 
4150  if (event) {
4151  retErr = retImplEv->record();
4152  }
4153 
4154  if (blocking_write) {
4155  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4156  }
4157 
4158  if (event) {
4159  *event = retImplEv.release();
4160  }
4161 
4162  } catch (pi_result err) {
4163  retErr = err;
4164  }
4165  return retErr;
4166 }
4167 
4168 pi_result hip_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
4169                                      pi_mem dst_buffer, size_t src_offset,
4170  size_t dst_offset, size_t size,
4171  pi_uint32 num_events_in_wait_list,
4172  const pi_event *event_wait_list,
4173  pi_event *event) {
4174  if (!command_queue) {
4175  return PI_ERROR_INVALID_QUEUE;
4176  }
4177 
4178  std::unique_ptr<_pi_event> retImplEv{nullptr};
4179 
4180  try {
4181  ScopedContext active(command_queue->get_context());
4182  pi_result result;
4183  auto stream = command_queue->get_next_transfer_stream();
4184 
4185  if (event_wait_list) {
4186  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4187  event_wait_list);
4188  }
4189 
4190  if (event) {
4191  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4192  PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream));
4193  result = retImplEv->start();
4194  }
4195 
4196  auto src = src_buffer->mem_.buffer_mem_.get_with_offset(src_offset);
4197  auto dst = dst_buffer->mem_.buffer_mem_.get_with_offset(dst_offset);
4198 
4199  result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream));
4200 
4201  if (event) {
4202  result = retImplEv->record();
4203  *event = retImplEv.release();
4204  }
4205 
4206  return result;
4207  } catch (pi_result err) {
4208  return err;
4209  } catch (...) {
4210  return PI_ERROR_UNKNOWN;
4211  }
4212 }
4213 
4214 pi_result hip_piEnqueueMemBufferCopyRect(
4215     pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer,
4216  pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
4217  pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
4218  size_t dst_row_pitch, size_t dst_slice_pitch,
4219  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
4220  pi_event *event) {
4221 
4222  assert(src_buffer != nullptr);
4223  assert(dst_buffer != nullptr);
4224  assert(command_queue != nullptr);
4225 
4226  pi_result retErr = PI_SUCCESS;
4227  void *srcPtr = src_buffer->mem_.buffer_mem_.get_void();
4228  void *dstPtr = dst_buffer->mem_.buffer_mem_.get_void();
4229  std::unique_ptr<_pi_event> retImplEv{nullptr};
4230 
4231  try {
4232  ScopedContext active(command_queue->get_context());
4233  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4234  retErr = enqueueEventsWait(command_queue, hipStream,
4235  num_events_in_wait_list, event_wait_list);
4236 
4237  if (event) {
4238  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4239  PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, hipStream));
4240  retImplEv->start();
4241  }
4242 
4243     retErr = commonEnqueueMemBufferCopyRect(
4244         hipStream, region, &srcPtr, hipMemoryTypeDevice, src_origin,
4245  src_row_pitch, src_slice_pitch, &dstPtr, hipMemoryTypeDevice,
4246  dst_origin, dst_row_pitch, dst_slice_pitch);
4247 
4248  if (event) {
4249  retImplEv->record();
4250  *event = retImplEv.release();
4251  }
4252 
4253  } catch (pi_result err) {
4254  retErr = err;
4255  }
4256  return retErr;
4257 }
4258 
4259 pi_result hip_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
4260                                      const void *pattern, size_t pattern_size,
4261  size_t offset, size_t size,
4262  pi_uint32 num_events_in_wait_list,
4263  const pi_event *event_wait_list,
4264  pi_event *event) {
4265  assert(command_queue != nullptr);
4266 
4267  auto args_are_multiples_of_pattern_size =
4268  (offset % pattern_size == 0) || (size % pattern_size == 0);
4269 
4270  auto pattern_is_valid = (pattern != nullptr);
4271 
4272  auto pattern_size_is_valid =
4273  ((pattern_size & (pattern_size - 1)) == 0) && // is power of two
4274  (pattern_size > 0) && (pattern_size <= 128); // falls within valid range
4275 
4276  assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4277  pattern_size_is_valid);
4278  (void)args_are_multiples_of_pattern_size;
4279  (void)pattern_is_valid;
4280  (void)pattern_size_is_valid;
4281 
4282  std::unique_ptr<_pi_event> retImplEv{nullptr};
4283 
4284  try {
4285  ScopedContext active(command_queue->get_context());
4286 
4287  auto stream = command_queue->get_next_transfer_stream();
4288  pi_result result;
4289  if (event_wait_list) {
4290  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4291  event_wait_list);
4292  }
4293 
4294  if (event) {
4295  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4296  PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream));
4297  result = retImplEv->start();
4298  }
4299 
4300  auto dstDevice = buffer->mem_.buffer_mem_.get_with_offset(offset);
4301  auto N = size / pattern_size;
4302 
4303  // pattern size in bytes
4304  switch (pattern_size) {
4305  case 1: {
4306  auto value = *static_cast<const uint8_t *>(pattern);
4307  result = PI_CHECK_ERROR(hipMemsetD8Async(dstDevice, value, N, stream));
4308  break;
4309  }
4310  case 2: {
4311  auto value = *static_cast<const uint16_t *>(pattern);
4312  result = PI_CHECK_ERROR(hipMemsetD16Async(dstDevice, value, N, stream));
4313  break;
4314  }
4315  case 4: {
4316  auto value = *static_cast<const uint32_t *>(pattern);
4317  result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream));
4318  break;
4319  }
4320 
4321  default: {
4322  // HIP has no memset functions that allow setting values more than 4
4323  // bytes. PI API lets you pass an arbitrary "pattern" to the buffer
4324  // fill, which can be more than 4 bytes. We must break up the pattern
4325  // into 1 byte values, and set the buffer using multiple strided calls.
4326  // The first 4 patterns are set using hipMemsetD32Async then all
4327  // subsequent 1 byte patterns are set using hipMemset2DAsync which is
4328  // called for each pattern.
4329 
4330  // Calculate the number of patterns, stride, number of times the pattern
4331  // needs to be applied, and the number of times the first 32 bit pattern
4332  // needs to be applied.
4333  auto number_of_steps = pattern_size / sizeof(uint8_t);
4334  auto pitch = number_of_steps * sizeof(uint8_t);
4335  auto height = size / number_of_steps;
4336  auto count_32 = size / sizeof(uint32_t);
4337 
4338  // Get 4-byte chunk of the pattern and call hipMemsetD32Async
4339  auto value = *(static_cast<const uint32_t *>(pattern));
4340  result =
4341  PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, count_32, stream));
4342  for (auto step = 4u; step < number_of_steps; ++step) {
4343  // take 1 byte of the pattern
4344  value = *(static_cast<const uint8_t *>(pattern) + step);
4345 
4346  // offset the pointer to the part of the buffer we want to write to
4347  auto offset_ptr = reinterpret_cast<void *>(
4348  reinterpret_cast<uint8_t *>(dstDevice) + (step * sizeof(uint8_t)));
4349 
4350  // set all of the pattern chunks
4351  result = PI_CHECK_ERROR(hipMemset2DAsync(
4352  offset_ptr, pitch, value, sizeof(uint8_t), height, stream));
4353  }
4354  break;
4355  }
4356  }
4357 
4358  if (event) {
4359  result = retImplEv->record();
4360  *event = retImplEv.release();
4361  }
4362 
4363  return result;
4364  } catch (pi_result err) {
4365  return err;
4366  } catch (...) {
4367  return PI_ERROR_UNKNOWN;
4368  }
4369 }
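// Worked example (illustrative, not part of the plugin source): the default
// branch above handles patterns larger than 4 bytes. With pattern_size = 8 and
// size = 64:
//   number_of_steps = 8   (one strided pass per pattern byte)
//   pitch           = 8   (distance between repeats of the same pattern byte)
//   height          = 8   (64 / 8 repetitions of the pattern in the buffer)
//   count_32        = 16  (64 / 4 words for the initial hipMemsetD32Async)
// The D32 memset writes pattern bytes 0..3 across the whole buffer, then one
// hipMemset2DAsync per step 4..7 overwrites that single byte at offsets
// step, step + 8, step + 16, ... so every 8-byte repeat ends up correct.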
4370 
4371 static size_t imageElementByteSize(hipArray_Format array_format) {
4372  switch (array_format) {
4373  case HIP_AD_FORMAT_UNSIGNED_INT8:
4374  case HIP_AD_FORMAT_SIGNED_INT8:
4375  return 1;
4376  case HIP_AD_FORMAT_UNSIGNED_INT16:
4377  case HIP_AD_FORMAT_SIGNED_INT16:
4378  case HIP_AD_FORMAT_HALF:
4379  return 2;
4380  case HIP_AD_FORMAT_UNSIGNED_INT32:
4381  case HIP_AD_FORMAT_SIGNED_INT32:
4382  case HIP_AD_FORMAT_FLOAT:
4383  return 4;
4384  default:
4385  return 0;
4386  }
4387   sycl::detail::pi::die("Invalid image format.");
4388  return 0;
4389 }
4390 
4396 
4397 static pi_result commonEnqueueMemImageNDCopy(
4398     hipStream_t hip_stream, pi_mem_type img_type, const size_t *region,
4399  const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset,
4400  void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset) {
4401  assert(region != nullptr);
4402 
4403  assert(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost);
4404  assert(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost);
4405 
4406  if (img_type == PI_MEM_TYPE_IMAGE2D) {
4407  hip_Memcpy2D cpyDesc;
4408  memset(&cpyDesc, 0, sizeof(cpyDesc));
4409  cpyDesc.srcMemoryType = src_type;
4410  if (src_type == hipMemoryTypeArray) {
4411  cpyDesc.srcArray =
4412  reinterpret_cast<hipCUarray>(const_cast<void *>(src_ptr));
4413  cpyDesc.srcXInBytes = src_offset[0];
4414  cpyDesc.srcY = src_offset[1];
4415  } else {
4416  cpyDesc.srcHost = src_ptr;
4417  }
4418  cpyDesc.dstMemoryType = dst_type;
4419  if (dst_type == hipMemoryTypeArray) {
4420  cpyDesc.dstArray =
4421  reinterpret_cast<hipCUarray>(const_cast<void *>(dst_ptr));
4422  cpyDesc.dstXInBytes = dst_offset[0];
4423  cpyDesc.dstY = dst_offset[1];
4424  } else {
4425  cpyDesc.dstHost = dst_ptr;
4426  }
4427  cpyDesc.WidthInBytes = region[0];
4428  cpyDesc.Height = region[1];
4429  return PI_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream));
4430  }
4431 
4432  if (img_type == PI_MEM_TYPE_IMAGE3D) {
4433 
4434  HIP_MEMCPY3D cpyDesc;
4435  memset(&cpyDesc, 0, sizeof(cpyDesc));
4436  cpyDesc.srcMemoryType = src_type;
4437  if (src_type == hipMemoryTypeArray) {
4438  cpyDesc.srcArray =
4439  reinterpret_cast<hipCUarray>(const_cast<void *>(src_ptr));
4440  cpyDesc.srcXInBytes = src_offset[0];
4441  cpyDesc.srcY = src_offset[1];
4442  cpyDesc.srcZ = src_offset[2];
4443  } else {
4444  cpyDesc.srcHost = src_ptr;
4445  }
4446  cpyDesc.dstMemoryType = dst_type;
4447  if (dst_type == hipMemoryTypeArray) {
4448  cpyDesc.dstArray = reinterpret_cast<hipCUarray>(dst_ptr);
4449  cpyDesc.dstXInBytes = dst_offset[0];
4450  cpyDesc.dstY = dst_offset[1];
4451  cpyDesc.dstZ = dst_offset[2];
4452  } else {
4453  cpyDesc.dstHost = dst_ptr;
4454  }
4455  cpyDesc.WidthInBytes = region[0];
4456  cpyDesc.Height = region[1];
4457  cpyDesc.Depth = region[2];
4458  return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream));
4460  }
4461 
4462  return PI_ERROR_INVALID_VALUE;
4463 }
4464 
4465 pi_result hip_piEnqueueMemImageRead(pi_queue command_queue, pi_mem image,
4466                                     pi_bool blocking_read, const size_t *origin,
4467  const size_t *region, size_t row_pitch,
4468  size_t slice_pitch, void *ptr,
4469  pi_uint32 num_events_in_wait_list,
4470  const pi_event *event_wait_list,
4471  pi_event *event) {
4472  (void)row_pitch;
4473  (void)slice_pitch;
4474 
4475  assert(command_queue != nullptr);
4476  assert(image != nullptr);
4477  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4478 
4479  pi_result retErr = PI_SUCCESS;
4480 
4481  try {
4482  ScopedContext active(command_queue->get_context());
4483  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4484 
4485  if (event_wait_list) {
4486  retErr = enqueueEventsWait(command_queue, hipStream,
4487  num_events_in_wait_list, event_wait_list);
4488  }
4489 
4490  hipArray *array = image->mem_.surface_mem_.get_array();
4491 
4492  hipArray_Format Format;
4493  size_t NumChannels;
4494  getArrayDesc(array, Format, NumChannels);
4495 
4496  int elementByteSize = imageElementByteSize(Format);
4497 
4498  size_t byteOffsetX = origin[0] * elementByteSize * NumChannels;
4499  size_t bytesToCopy = elementByteSize * NumChannels * region[0];
4500 
4501  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4502 
4503  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4504  size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4505 
4506  retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion,
4507  array, hipMemoryTypeArray, srcOffset,
4508  ptr, hipMemoryTypeHost, nullptr);
4509 
4510  if (retErr != PI_SUCCESS) {
4511  return retErr;
4512  }
4513 
4514  if (event) {
4515       auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ,
4516                                                command_queue, hipStream);
4517  new_event->record();
4518  *event = new_event;
4519  }
4520 
4521  if (blocking_read) {
4522  retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4523  }
4524  } catch (pi_result err) {
4525  return err;
4526  } catch (...) {
4527  return PI_ERROR_UNKNOWN;
4528  }
4529  return PI_SUCCESS;
4531 }
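// Worked example (illustrative, not part of the plugin source): only the X
// components are converted from elements to bytes before calling
// commonEnqueueMemImageNDCopy, because the HIP driver copy descriptors expect
// byte units in X. For an image with 4 channels of 4-byte elements,
// origin = {3, 2, 0} and region = {10, 5, 1}:
//   byteOffsetX = 3 * 4 * 4  = 48 bytes
//   bytesToCopy = 10 * 4 * 4 = 160 bytes
// while the Y and Z components stay in row/slice units.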
4532 
4533 pi_result hip_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image,
4534                                      pi_bool blocking_write,
4535  const size_t *origin, const size_t *region,
4536  size_t input_row_pitch,
4537  size_t input_slice_pitch, const void *ptr,
4538  pi_uint32 num_events_in_wait_list,
4539  const pi_event *event_wait_list,
4540  pi_event *event) {
4541  (void)blocking_write;
4542  (void)input_row_pitch;
4543  (void)input_slice_pitch;
4544  assert(command_queue != nullptr);
4545  assert(image != nullptr);
4546  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4547 
4548  pi_result retErr = PI_SUCCESS;
4549 
4550  try {
4551  ScopedContext active(command_queue->get_context());
4552  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4553 
4554  if (event_wait_list) {
4555  retErr = enqueueEventsWait(command_queue, hipStream,
4556  num_events_in_wait_list, event_wait_list);
4557  }
4558 
4559  hipArray *array = image->mem_.surface_mem_.get_array();
4560 
4561  hipArray_Format Format;
4562  size_t NumChannels;
4563  getArrayDesc(array, Format, NumChannels);
4564 
4565  int elementByteSize = imageElementByteSize(Format);
4566 
4567  size_t byteOffsetX = origin[0] * elementByteSize * NumChannels;
4568  size_t bytesToCopy = elementByteSize * NumChannels * region[0];
4569 
4570  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4571 
4572  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4573  size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4574 
4575  retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion,
4576  ptr, hipMemoryTypeHost, nullptr, array,
4577  hipMemoryTypeArray, dstOffset);
4578 
4579  if (retErr != PI_SUCCESS) {
4580  return retErr;
4581  }
4582 
4583  if (event) {
4584       auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE,
4585                                                command_queue, hipStream);
4586  new_event->record();
4587  *event = new_event;
4588  }
4589  } catch (pi_result err) {
4590  return err;
4591  } catch (...) {
4592  return PI_ERROR_UNKNOWN;
4593  }
4594 
4595  return PI_SUCCESS;
4598 }
4599 
4600 pi_result hip_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
4601                                     pi_mem dst_image, const size_t *src_origin,
4602  const size_t *dst_origin,
4603  const size_t *region,
4604  pi_uint32 num_events_in_wait_list,
4605  const pi_event *event_wait_list,
4606  pi_event *event) {
4607 
4608  assert(src_image->mem_type_ == _pi_mem::mem_type::surface);
4609  assert(dst_image->mem_type_ == _pi_mem::mem_type::surface);
4610  assert(src_image->mem_.surface_mem_.get_image_type() ==
4611  dst_image->mem_.surface_mem_.get_image_type());
4612 
4613  pi_result retErr = PI_SUCCESS;
4614 
4615  try {
4616  ScopedContext active(command_queue->get_context());
4617  hipStream_t hipStream = command_queue->get_next_transfer_stream();
4618  if (event_wait_list) {
4619  retErr = enqueueEventsWait(command_queue, hipStream,
4620  num_events_in_wait_list, event_wait_list);
4621  }
4622 
4623  hipArray *srcArray = src_image->mem_.surface_mem_.get_array();
4624  hipArray_Format srcFormat;
4625  size_t srcNumChannels;
4626  getArrayDesc(srcArray, srcFormat, srcNumChannels);
4627 
4628  hipArray *dstArray = dst_image->mem_.surface_mem_.get_array();
4629  hipArray_Format dstFormat;
4630  size_t dstNumChannels;
4631  getArrayDesc(dstArray, dstFormat, dstNumChannels);
4632 
4633  assert(srcFormat == dstFormat);
4634  assert(srcNumChannels == dstNumChannels);
4635 
4636  int elementByteSize = imageElementByteSize(srcFormat);
4637 
4638  size_t dstByteOffsetX = dst_origin[0] * elementByteSize * srcNumChannels;
4639  size_t srcByteOffsetX = src_origin[0] * elementByteSize * dstNumChannels;
4640  size_t bytesToCopy = elementByteSize * srcNumChannels * region[0];
4641 
4642  pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
4643 
4644  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4645  size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4646  size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4647 
4648  retErr = commonEnqueueMemImageNDCopy(
4649  hipStream, imgType, adjustedRegion, srcArray, hipMemoryTypeArray,
4650  srcOffset, dstArray, hipMemoryTypeArray, dstOffset);
4651 
4652  if (retErr != PI_SUCCESS) {
4653  return retErr;
4654  }
4655 
4656  if (event) {
4657       auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY,
4658                                                command_queue, hipStream);
4659  new_event->record();
4660  *event = new_event;
4661  }
4662  } catch (pi_result err) {
4663  return err;
4664  } catch (...) {
4665  return PI_ERROR_UNKNOWN;
4666  }
4667 
4668  return PI_SUCCESS;
4670 }
4671 
4673 pi_result hip_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image,
4674                                     const void *fill_color,
4675  const size_t *origin, const size_t *region,
4676  pi_uint32 num_events_in_wait_list,
4677  const pi_event *event_wait_list,
4678  pi_event *event) {
4679  (void)command_queue;
4680  (void)image;
4681  (void)fill_color;
4682  (void)origin;
4683  (void)region;
4684  (void)num_events_in_wait_list;
4685  (void)event_wait_list;
4686  (void)event;
4687 
4688  sycl::detail::pi::die("hip_piEnqueueMemImageFill not implemented");
4689  return {};
4690 }
4691 
4697 pi_result hip_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
4698                                     pi_bool blocking_map,
4699  pi_map_flags map_flags, size_t offset,
4700  size_t size,
4701  pi_uint32 num_events_in_wait_list,
4702  const pi_event *event_wait_list,
4703  pi_event *event, void **ret_map) {
4704  assert(ret_map != nullptr);
4705  assert(command_queue != nullptr);
4706  assert(buffer != nullptr);
4707  assert(buffer->mem_type_ == _pi_mem::mem_type::buffer);
4708 
4709  pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4710   const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ ==
4711                          _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4712 
4713  // Currently no support for overlapping regions
4714  if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) {
4715  return ret_err;
4716  }
4717 
4718  // Allocate a pointer in the host to store the mapped information
4719  auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags);
4720  *ret_map = buffer->mem_.buffer_mem_.get_map_ptr();
4721  if (hostPtr) {
4722  ret_err = PI_SUCCESS;
4723  }
4724 
4725  if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) {
4726  // Pinned host memory is already on host so it doesn't need to be read.
4727  ret_err = hip_piEnqueueMemBufferRead(
4728  command_queue, buffer, blocking_map, offset, size, hostPtr,
4729  num_events_in_wait_list, event_wait_list, event);
4730  } else {
4731  ScopedContext active(command_queue->get_context());
4732 
4733  if (is_pinned) {
4734  ret_err = hip_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4735  event_wait_list, nullptr);
4736  }
4737 
4738  if (event) {
4739  try {
4740  *event = _pi_event::make_native(
4741  PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue,
4742  command_queue->get_next_transfer_stream());
4743  (*event)->start();
4744  (*event)->record();
4745  } catch (pi_result error) {
4746  ret_err = error;
4747  }
4748  }
4749  }
4750 
4751  return ret_err;
4752 }
4753 
4758 pi_result hip_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
4759                                 void *mapped_ptr,
4760  pi_uint32 num_events_in_wait_list,
4761  const pi_event *event_wait_list,
4762  pi_event *event) {
4763  pi_result ret_err = PI_SUCCESS;
4764 
4765  assert(command_queue != nullptr);
4766  assert(mapped_ptr != nullptr);
4767  assert(memobj != nullptr);
4768  assert(memobj->mem_type_ == _pi_mem::mem_type::buffer);
4769  assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr);
4770  assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr);
4771 
4772   const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ ==
4773                          _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4774 
4775  if (!is_pinned &&
4776  ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) ||
4777        (memobj->mem_.buffer_mem_.get_map_flags() &
4778         PI_MAP_WRITE_INVALIDATE_REGION))) {
4779  // Pinned host memory is only on host so it doesn't need to be written to.
4780  ret_err = hip_piEnqueueMemBufferWrite(
4781  command_queue, memobj, true,
4782  memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
4783  memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
4784  num_events_in_wait_list, event_wait_list, event);
4785  } else {
4786  ScopedContext active(command_queue->get_context());
4787 
4788  if (is_pinned) {
4789  ret_err = hip_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4790  event_wait_list, nullptr);
4791  }
4792 
4793  if (event) {
4794  try {
4795  *event = _pi_event::make_native(
4796  PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue,
4797  command_queue->get_next_transfer_stream());
4798  (*event)->start();
4799  (*event)->record();
4800  } catch (pi_result error) {
4801  ret_err = error;
4802  }
4803  }
4804  }
4805 
4806  memobj->mem_.buffer_mem_.unmap(mapped_ptr);
4807  return ret_err;
4808 }
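// Note (illustrative, not part of the plugin source): together, the map/unmap
// pair above emulates mapping with a host staging copy. For a non-pinned
// buffer, mapping with PI_MAP_READ enqueues a buffer read into the pointer
// returned by map_to_ptr, and unmapping a PI_MAP_WRITE (or write-invalidate)
// mapping enqueues a blocking buffer write back to the device before the
// mapping is released. Pinned (alloc_host_ptr) buffers skip both copies
// because the host allocation already is the backing storage; only the
// requested events are produced.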
4809 
4812 pi_result hip_piextUSMHostAlloc(void **result_ptr, pi_context context,
4813  pi_usm_mem_properties *properties, size_t size,
4814  pi_uint32 alignment) {
4815  assert(result_ptr != nullptr);
4816  assert(context != nullptr);
4817  assert(properties == nullptr || *properties == 0);
4818  pi_result result = PI_SUCCESS;
4819  try {
4820  ScopedContext active(context);
4821  result = PI_CHECK_ERROR(hipHostMalloc(result_ptr, size));
4822  } catch (pi_result error) {
4823  result = error;
4824  }
4825 
4826  assert(alignment == 0 ||
4827  (result == PI_SUCCESS &&
4828  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4829  return result;
4830 }
4831 
4834 pi_result hip_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
4835  pi_device device,
4836  pi_usm_mem_properties *properties,
4837  size_t size, pi_uint32 alignment) {
4838  assert(result_ptr != nullptr);
4839  assert(context != nullptr);
4840  assert(device != nullptr);
4841  assert(properties == nullptr || *properties == 0);
4842  pi_result result = PI_SUCCESS;
4843  try {
4844  ScopedContext active(context);
4845  result = PI_CHECK_ERROR(hipMalloc(result_ptr, size));
4846  } catch (pi_result error) {
4847  result = error;
4848  }
4849 
4850  assert(alignment == 0 ||
4851  (result == PI_SUCCESS &&
4852  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4853  return result;
4854 }
4855 
4858 pi_result hip_piextUSMSharedAlloc(void **result_ptr, pi_context context,
4859  pi_device device,
4860  pi_usm_mem_properties *properties,
4861  size_t size, pi_uint32 alignment) {
4862  assert(result_ptr != nullptr);
4863  assert(context != nullptr);
4864  assert(device != nullptr);
4865  assert(properties == nullptr || *properties == 0);
4866  pi_result result = PI_SUCCESS;
4867  try {
4868  ScopedContext active(context);
4869  result =
4870  PI_CHECK_ERROR(hipMallocManaged(result_ptr, size, hipMemAttachGlobal));
4871  } catch (pi_result error) {
4872  result = error;
4873  }
4874 
4875  assert(alignment == 0 ||
4876  (result == PI_SUCCESS &&
4877  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4878  return result;
4879 }
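// Note (illustrative, not part of the plugin source): hipHostMalloc, hipMalloc
// and hipMallocManaged take no alignment argument, so the three USM allocators
// above can only assert after the fact that the returned pointer satisfies the
// requested alignment; HIP's default allocation alignment is expected to cover
// the alignments SYCL requests.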
4880 
4883 pi_result hip_piextUSMFree(pi_context context, void *ptr) {
4884 
4885  assert(context != nullptr);
4886  pi_result result = PI_SUCCESS;
4887  try {
4888  ScopedContext active(context);
4889  unsigned int type;
4890  hipPointerAttribute_t hipPointerAttributeType;
4891  result =
4892  PI_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, ptr));
4893  type = hipPointerAttributeType.memoryType;
4894  assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost);
4895  if (type == hipMemoryTypeDevice) {
4896  result = PI_CHECK_ERROR(hipFree(ptr));
4897  }
4898  if (type == hipMemoryTypeHost) {
4899  result = PI_CHECK_ERROR(hipFreeHost(ptr));
4900  }
4901  } catch (pi_result error) {
4902  result = error;
4903  }
4904  return result;
4905 }
4906 
4907 pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
4908                                     size_t count,
4909  pi_uint32 num_events_in_waitlist,
4910  const pi_event *events_waitlist,
4911  pi_event *event) {
4912 
4913  assert(queue != nullptr);
4914  assert(ptr != nullptr);
4915  pi_result result = PI_SUCCESS;
4916  std::unique_ptr<_pi_event> event_ptr{nullptr};
4917 
4918  try {
4919  ScopedContext active(queue->get_context());
4920  pi_uint32 stream_token;
4921  _pi_stream_guard guard;
4922  hipStream_t hipStream = queue->get_next_compute_stream(
4923  num_events_in_waitlist, events_waitlist, guard, &stream_token);
4924  result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
4925  events_waitlist);
4926  if (event) {
4927  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
4928  PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, hipStream, stream_token));
4929  event_ptr->start();
4930  }
4931  result = PI_CHECK_ERROR(
4932  hipMemsetD8Async(reinterpret_cast<hipDeviceptr_t>(ptr),
4933  (unsigned char)value & 0xFF, count, hipStream));
4934  if (event) {
4935  result = event_ptr->record();
4936  *event = event_ptr.release();
4937  }
4938  } catch (pi_result err) {
4939  result = err;
4940  }
4941 
4942  return result;
4943 }
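// Note (illustrative, not part of the plugin source): hipMemsetD8Async fills
// one byte at a time, so only the low 8 bits of the pi_int32 value are used;
// a fill with value = 0x1234AB writes 0xAB into each of the count bytes.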
4944 
4945 pi_result hip_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking,
4946                                     void *dst_ptr, const void *src_ptr,
4947  size_t size,
4948  pi_uint32 num_events_in_waitlist,
4949  const pi_event *events_waitlist,
4950  pi_event *event) {
4951  assert(queue != nullptr);
4952  assert(dst_ptr != nullptr);
4953  assert(src_ptr != nullptr);
4954  pi_result result = PI_SUCCESS;
4955 
4956  std::unique_ptr<_pi_event> event_ptr{nullptr};
4957 
4958  try {
4959  ScopedContext active(queue->get_context());
4960  hipStream_t hipStream = queue->get_next_transfer_stream();
4961  result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
4962  events_waitlist);
4963  if (event) {
4964  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
4965  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, hipStream));
4966  event_ptr->start();
4967  }
4968  result = PI_CHECK_ERROR(
4969  hipMemcpyAsync(dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream));
4970  if (event) {
4971  result = event_ptr->record();
4972  }
4973  if (blocking) {
4974  result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream));
4975  }
4976  if (event) {
4977  *event = event_ptr.release();
4978  }
4979  } catch (pi_result err) {
4980  result = err;
4981  }
4982  return result;
4983 }
4984 
4985 pi_result hip_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
4986                                       size_t size, pi_usm_migration_flags flags,
4987  pi_uint32 num_events_in_waitlist,
4988  const pi_event *events_waitlist,
4989  pi_event *event) {
4990 
4991  // flags is currently unused so fail if set
4992  if (flags != 0)
4993  return PI_ERROR_INVALID_VALUE;
4994  assert(queue != nullptr);
4995  assert(ptr != nullptr);
4996  pi_result result = PI_SUCCESS;
4997  std::unique_ptr<_pi_event> event_ptr{nullptr};
4998 
4999  try {
5000  ScopedContext active(queue->get_context());
5001  hipStream_t hipStream = queue->get_next_transfer_stream();
5002  result = enqueueEventsWait(queue, hipStream, num_events_in_waitlist,
5003  events_waitlist);
5004  if (event) {
5005  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5006  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, hipStream));
5007  event_ptr->start();
5008  }
5009  result = PI_CHECK_ERROR(hipMemPrefetchAsync(
5010  ptr, size, queue->get_context()->get_device()->get(), hipStream));
5011  if (event) {
5012  result = event_ptr->record();
5013  *event = event_ptr.release();
5014  }
5015  } catch (pi_result err) {
5016  result = err;
5017  }
5018 
5019  return result;
5020 }
5021 
5023 pi_result hip_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
5024                                        size_t length, pi_mem_advice advice,
5025  pi_event *event) {
5026  (void)length;
5027  (void)advice;
5028 
5029  assert(queue != nullptr);
5030  assert(ptr != nullptr);
5031  // TODO implement a mapping to hipMemAdvise once the expected behaviour
5032  // of piextUSMEnqueueMemAdvise is detailed in the USM extension
5033  return hip_piEnqueueEventsWait(queue, 0, nullptr, event);
5036 }
5037 
5054 pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
5055                                       pi_mem_alloc_info param_name,
5056  size_t param_value_size,
5057  void *param_value,
5058  size_t *param_value_size_ret) {
5059 
5060  assert(context != nullptr);
5061  assert(ptr != nullptr);
5062  pi_result result = PI_SUCCESS;
5063  hipPointerAttribute_t hipPointerAttributeType;
5064 
5065  try {
5066  ScopedContext active(context);
5067  switch (param_name) {
5068  case PI_MEM_ALLOC_TYPE: {
5069  unsigned int value;
5070  // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue
5071  hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, ptr);
5072  if (ret == hipErrorInvalidValue) {
5073  // pointer not known to the HIP subsystem
5074         return getInfo(param_value_size, param_value, param_value_size_ret,
5075                        PI_MEM_TYPE_UNKNOWN);
5076       }
5077  result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5078  value = hipPointerAttributeType.isManaged;
5079  if (value) {
5080  // pointer to managed memory
5081         return getInfo(param_value_size, param_value, param_value_size_ret,
5082                        PI_MEM_TYPE_SHARED);
5083       }
5084  result = PI_CHECK_ERROR(
5085  hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5086  value = hipPointerAttributeType.memoryType;
5087  assert(value == hipMemoryTypeDevice or value == hipMemoryTypeHost);
5088  if (value == hipMemoryTypeDevice) {
5089  // pointer to device memory
5090         return getInfo(param_value_size, param_value, param_value_size_ret,
5091                        PI_MEM_TYPE_DEVICE);
5092       }
5093  if (value == hipMemoryTypeHost) {
5094  // pointer to host memory
5095         return getInfo(param_value_size, param_value, param_value_size_ret,
5096                        PI_MEM_TYPE_HOST);
5097       }
5098  // should never get here
5099  __builtin_unreachable();
5100       return getInfo(param_value_size, param_value, param_value_size_ret,
5101                      PI_MEM_TYPE_UNKNOWN);
5102     }
5103  case PI_MEM_ALLOC_BASE_PTR: {
5104  return PI_ERROR_INVALID_VALUE;
5105  }
5106  case PI_MEM_ALLOC_SIZE: {
5107  return PI_ERROR_INVALID_VALUE;
5108  }
5109 
5110  case PI_MEM_ALLOC_DEVICE: {
5111  // get device index associated with this pointer
5112  result = PI_CHECK_ERROR(
5113  hipPointerGetAttributes(&hipPointerAttributeType, ptr));
5114  int device_idx = hipPointerAttributeType.device;
5115 
5116  // currently each device is in its own platform, so find the platform at
5117  // the same index
5118  std::vector<pi_platform> platforms;
5119  platforms.resize(device_idx + 1);
5120  result = hip_piPlatformsGet(device_idx + 1, platforms.data(), nullptr);
5121 
5122  // get the device from the platform
5123  pi_device device = platforms[device_idx]->devices_[0].get();
5124  return getInfo(param_value_size, param_value, param_value_size_ret,
5125  device);
5126  }
5127  }
5128  } catch (pi_result error) {
5129  result = error;
5130  }
5131 
5132  return result;
5133 }
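// Usage sketch (illustrative, not part of the plugin source): how a PI client
// could classify a pointer with the query implemented above, assuming the
// pi_usm_type alias from pi.h; piextUSMGetMemAllocInfo is the public entry
// point that the function table maps onto hip_piextUSMGetMemAllocInfo.
static pi_usm_type classifyPointer(pi_context ctx, const void *ptr) {
  pi_usm_type type = PI_MEM_TYPE_UNKNOWN;
  piextUSMGetMemAllocInfo(ctx, ptr, PI_MEM_ALLOC_TYPE, sizeof(type), &type,
                          nullptr);
  // One of PI_MEM_TYPE_HOST, PI_MEM_TYPE_DEVICE, PI_MEM_TYPE_SHARED or
  // PI_MEM_TYPE_UNKNOWN.
  return type;
}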
5134 
5135 // This API is called by Sycl RT to notify the end of the plugin lifetime.
5136 // TODO: add a global variable lifetime management code here (see
5137 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
5138 pi_result hip_piTearDown(void *PluginParameter) {
5139  (void)PluginParameter;
5140  return PI_SUCCESS;
5141 }
5142 
5144 
5145 pi_result piPluginInit(pi_plugin *PluginInit) {
5146   // Check that the major version matches in PiVersion and SupportedVersion
5148 
5149  // PI interface supports higher version or the same version.
5150  size_t PluginVersionSize = sizeof(PluginInit->PluginVersion);
5151  if (strlen(SupportedVersion) >= PluginVersionSize)
5152  return PI_ERROR_INVALID_VALUE;
5153  strncpy(PluginInit->PluginVersion, SupportedVersion, PluginVersionSize);
5154 
5155  // Set whole function table to zero to make it easier to detect if
5156  // functions are not set up below.
5157  std::memset(&(PluginInit->PiFunctionTable), 0,
5158  sizeof(PluginInit->PiFunctionTable));
5159 
5160 // Forward calls to HIP RT.
5161 #define _PI_CL(pi_api, hip_api) \
5162  (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&hip_api);
5163 
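  // Illustrative expansion (not part of the source): each _PI_CL entry below
  // fills one slot of the PI function table with the matching hip_pi*
  // implementation. For example, an entry such as
  //   _PI_CL(piPlatformsGet, hip_piPlatformsGet)
  // expands to
  //   (PluginInit->PiFunctionTable).piPlatformsGet =
  //       (decltype(&::piPlatformsGet))(&hip_piPlatformsGet);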
5164  // Platform
5167  // Device
5178  // Context
5187  // Queue
5196  // Memory
5206  // Program
5220  // Kernel
5232  // Event
5243  // Sampler
5248  // Queue commands
5266  // USM
5276 
5279  _PI_CL(piPluginGetLastError, hip_piPluginGetLastError)
5281 
5282 #undef _PI_CL
5283 
5284  return PI_SUCCESS;
5285 }
5286 
5287 } // extern "C"
RAII object that calls the reference count release function on the held PI object on destruction.
Definition: pi_cuda.cpp:762
ReleaseGuard(ReleaseGuard &&Other) noexcept
Definition: pi_hip.cpp:787
void dismiss()
End the guard and do not release the reference count of the held PI object.
Definition: pi_hip.cpp:819
~ReleaseGuard()
Calls the related PI object release function if the object held is not nullptr or if dismiss has not ...
Definition: pi_hip.cpp:795
ReleaseGuard(const ReleaseGuard &)=delete
ReleaseGuard()=delete
ReleaseGuard(T Obj)
Obj can be nullptr.
Definition: pi_hip.cpp:786
ReleaseGuard & operator=(const ReleaseGuard &)=delete
ReleaseGuard & operator=(ReleaseGuard &&Other)
Definition: pi_hip.cpp:811
#define __SYCL_INLINE_VER_NAMESPACE(X)
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value, sycl::ext::intel::esimd::simd< T, SZ > > line(sycl::ext::intel::esimd::simd< T, 4 > src0, sycl::ext::intel::esimd::simd< T, SZ > src1, Sat sat={})
Linear equation.
Definition: math.hpp:921
#define __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:199
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
void die(const char *Message)
Definition: pi.cpp:536
void assertion(bool Condition, const char *Message=nullptr)
Definition: pi.cpp:541
void hipPrint(const char *Message)
Definition: pi_hip.cpp:397
void memcpy(void *Dst, const void *Src, std::size_t Size)
detail::enable_if_t< detail::is_genfloat< T >::value, T > step(T edge, T x) __NOEXC
Definition: builtins.hpp:589
float length(T p) __NOEXC
Definition: builtins.hpp:1032
multi_ptr< ElementType, access::address_space::ext_intel_global_host_space, IsDecorated > host_ptr
Definition: pointers.hpp:40
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
pi_result piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM Memset API.
#define PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT
Extension to denote native support of assert feature by an arbitrary device piDeviceGetInfo call shou...
Definition: pi.h:740
pi_result piKernelCreate(pi_program program, const char *kernel_name, pi_kernel *ret_kernel)
int32_t pi_int32
Definition: pi.h:102
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS
Definition: pi.h:512
pi_result piQueueFinish(pi_queue command_queue)
uintptr_t pi_native_handle
Definition: pi.h:107
pi_result piProgramGetBuildInfo(pi_program program, pi_device device, _pi_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_bitfield pi_map_flags
Definition: pi.h:552
pi_result piextDeviceGetNativeHandle(pi_device device, pi_native_handle *nativeHandle)
Gets the native handle of a PI device object.
pi_result piEnqueueMemBufferCopyRect(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
_pi_result
Definition: pi.h:114
pi_result piMemImageGetInfo(pi_mem image, pi_image_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piKernelRelease(pi_kernel kernel)
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE
Definition: pi.h:514
pi_uint32 pi_bool
Definition: pi.h:105
pi_result piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, pi_device device, bool pluginOwnsNativeHandle, pi_queue *queue)
Creates PI queue object from a native handle.
pi_result piextUSMFree(pi_context context, void *ptr)
Indicates that the allocated USM memory is no longer needed on the runtime side.
pi_bitfield pi_usm_mem_properties
Definition: pi.h:564
_pi_device_info
Definition: pi.h:183
@ PI_DEVICE_INFO_PRINTF_BUFFER_SIZE
Definition: pi.h:253
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR
Definition: pi.h:194
@ PI_DEVICE_INFO_MAX_MEM_BANDWIDTH
Definition: pi.h:278
@ PI_DEVICE_INFO_GPU_SLICES
Definition: pi.h:275
@ PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC
Definition: pi.h:254
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT
Definition: pi.h:205
@ PI_DEVICE_INFO_BUILD_ON_SUBDEVICE
Definition: pi.h:281
@ PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE
Definition: pi.h:189
@ PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH
Definition: pi.h:216
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT
Definition: pi.h:195
@ PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT
Definition: pi.h:217
@ PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY
Definition: pi.h:208
@ PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT
Definition: pi.h:265
@ PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS
Definition: pi.h:187
@ PI_DEVICE_INFO_EXECUTION_CAPABILITIES
Definition: pi.h:239
@ PI_DEVICE_INFO_GPU_EU_COUNT
Definition: pi.h:273
@ PI_DEVICE_INFO_COMPILER_AVAILABLE
Definition: pi.h:237
@ PI_DEVICE_INFO_IMAGE_SUPPORT
Definition: pi.h:211
@ PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
Definition: pi.h:232
@ PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY
Definition: pi.h:282
@ PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
Definition: pi.h:291
@ PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES
Definition: pi.h:240
@ PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES
Definition: pi.h:257
@ PI_DEVICE_INFO_BUILT_IN_KERNELS
Definition: pi.h:242
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT
Definition: pi.h:198
@ PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS
Definition: pi.h:260
@ PI_DEVICE_INFO_AVAILABLE
Definition: pi.h:236
@ PI_DEVICE_INFO_HALF_FP_CONFIG
Definition: pi.h:191
@ PI_DEVICE_INFO_PARENT_DEVICE
Definition: pi.h:255
@ PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL
Definition: pi.h:262
@ PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES
Definition: pi.h:290
@ PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH
Definition: pi.h:274
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT
Definition: pi.h:202
@ PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES
Definition: pi.h:241
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE
Definition: pi.h:206
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE
Definition: pi.h:224
@ PI_DEVICE_INFO_DRIVER_VERSION
Definition: pi.h:248
@ PI_DEVICE_INFO_MAX_PARAMETER_SIZE
Definition: pi.h:222
@ PI_DEVICE_INFO_PARTITION_PROPERTIES
Definition: pi.h:256
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT
Definition: pi.h:196
@ PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
Definition: pi.h:266
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF
Definition: pi.h:200
@ PI_DEVICE_INFO_VERSION
Definition: pi.h:250
@ PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH
Definition: pi.h:218
@ PI_DEVICE_INFO_MAX_COMPUTE_UNITS
Definition: pi.h:186
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR
Definition: pi.h:201
@ PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT
Definition: pi.h:215
@ PI_DEVICE_INFO_VENDOR_ID
Definition: pi.h:185
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE
Definition: pi.h:225
@ PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU
Definition: pi.h:292
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG
Definition: pi.h:197
@ PI_DEVICE_INFO_PLATFORM
Definition: pi.h:243
@ PI_DEVICE_INFO_LOCAL_MEM_SIZE
Definition: pi.h:231
@ PI_DEVICE_INFO_REFERENCE_COUNT
Definition: pi.h:244
@ PI_DEVICE_INFO_MAX_SAMPLERS
Definition: pi.h:221
@ PI_DEVICE_INFO_DEVICE_ID
Definition: pi.h:271
@ PI_DEVICE_INFO_PROFILE
Definition: pi.h:249
@ PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
Definition: pi.h:188
@ PI_DEVICE_INFO_EXTENSIONS
Definition: pi.h:252
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT
Definition: pi.h:203
@ PI_DEVICE_INFO_HOST_UNIFIED_MEMORY
Definition: pi.h:233
@ PI_DEVICE_INFO_PCI_ADDRESS
Definition: pi.h:272
@ PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE
Definition: pi.h:210
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE
Definition: pi.h:226
@ PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE
Definition: pi.h:220
@ PI_DEVICE_INFO_LINKER_AVAILABLE
Definition: pi.h:238
@ PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D
Definition: pi.h:299
@ PI_DEVICE_INFO_ADDRESS_BITS
Definition: pi.h:209
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE
Definition: pi.h:199
@ PI_DEVICE_INFO_ATOMIC_64
Definition: pi.h:289
@ PI_DEVICE_INFO_USM_HOST_SUPPORT
Definition: pi.h:263
@ PI_DEVICE_INFO_USM_DEVICE_SUPPORT
Definition: pi.h:264
@ PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
Definition: pi.h:261
@ PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
Definition: pi.h:277
@ PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS
Definition: pi.h:213
@ PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION
Definition: pi.h:234
@ PI_DEVICE_INFO_VENDOR
Definition: pi.h:247
@ PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN
Definition: pi.h:223
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG
Definition: pi.h:204
@ PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS
Definition: pi.h:212
@ PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN
Definition: pi.h:258
@ PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT
Definition: pi.h:267
@ PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE
Definition: pi.h:219
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE
Definition: pi.h:285
@ PI_DEVICE_INFO_DOUBLE_FP_CONFIG
Definition: pi.h:192
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH
Definition: pi.h:288
@ PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE
Definition: pi.h:228
@ PI_DEVICE_INFO_PARTITION_TYPE
Definition: pi.h:259
@ PI_DEVICE_INFO_TYPE
Definition: pi.h:184
@ PI_DEVICE_INFO_SINGLE_FP_CONFIG
Definition: pi.h:190
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF
Definition: pi.h:207
@ PI_DEVICE_INFO_GLOBAL_MEM_SIZE
Definition: pi.h:227
@ PI_DEVICE_INFO_NAME
Definition: pi.h:246
@ PI_DEVICE_INFO_LOCAL_MEM_TYPE
Definition: pi.h:230
@ PI_DEVICE_INFO_MAX_CONSTANT_ARGS
Definition: pi.h:229
@ PI_DEVICE_INFO_OPENCL_C_VERSION
Definition: pi.h:251
@ PI_DEVICE_INFO_ENDIAN_LITTLE
Definition: pi.h:235
@ PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH
Definition: pi.h:214
@ PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE
Definition: pi.h:276
@ PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS
Definition: pi.h:295
pi_uint64 pi_bitfield
Definition: pi.h:106
static constexpr pi_device_fp_config PI_FP_DENORM
Definition: pi.h:642
pi_result piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event)
USM Memadvise API.
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_ALLOC
Definition: pi.h:549
@ PI_PROGRAM_BUILD_STATUS_SUCCESS
Definition: pi.h:147
@ PI_PROGRAM_BUILD_STATUS_ERROR
Definition: pi.h:146
_pi_queue_info
Definition: pi.h:326
@ PI_QUEUE_INFO_PROPERTIES
Definition: pi.h:330
@ PI_QUEUE_INFO_DEVICE
Definition: pi.h:328
@ PI_QUEUE_INFO_CONTEXT
Definition: pi.h:327
@ PI_QUEUE_INFO_REFERENCE_COUNT
Definition: pi.h:331
_pi_mem_advice
Definition: pi.h:422
pi_result piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_bool blocking_read, pi_image_offset origin, pi_image_region region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
@ PI_IMAGE_CHANNEL_TYPE_FLOAT
Definition: pi.h:471
@ PI_IMAGE_CHANNEL_TYPE_UNORM_INT8
Definition: pi.h:459
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16
Definition: pi.h:465
@ PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
Definition: pi.h:467
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8
Definition: pi.h:464
@ PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
Definition: pi.h:469
@ PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
Definition: pi.h:468
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32
Definition: pi.h:466
@ PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT
Definition: pi.h:470
@ PI_IMAGE_CHANNEL_TYPE_UNORM_INT16
Definition: pi.h:460
_pi_kernel_sub_group_info
Definition: pi.h:365
@ PI_KERNEL_COMPILE_NUM_SUB_GROUPS
Definition: pi.h:368
@ PI_KERNEL_MAX_NUM_SUB_GROUPS
Definition: pi.h:367
@ PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL
Definition: pi.h:369
@ PI_KERNEL_MAX_SUB_GROUP_SIZE
Definition: pi.h:366
pi_result piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool pluginOwnsNativeHandle, pi_program *program)
Creates PI program object from a native handle.
pi_bitfield pi_mem_properties
Definition: pi.h:558
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
_pi_kernel_group_info
Definition: pi.h:344
@ PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
Definition: pi.h:346
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:347
@ PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE
Definition: pi.h:350
@ PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
Definition: pi.h:348
@ PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
Definition: pi.h:349
@ PI_KERNEL_GROUP_INFO_NUM_REGS
Definition: pi.h:352
pi_result piEnqueueMemBufferRead(pi_queue queue, pi_mem buffer, pi_bool blocking_read, size_t offset, size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piProgramRetain(pi_program program)
const pi_bool PI_TRUE
Definition: pi.h:476
pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem)
pi_result piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, size_t size, pi_usm_migration_flags flags, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Hint to migrate memory to the device.
_pi_kernel_exec_info
Definition: pi.h:1304
pi_result piProgramRelease(pi_program program)
_pi_platform_info
Definition: pi.h:129
@ PI_PLATFORM_INFO_VENDOR
Definition: pi.h:133
@ PI_PLATFORM_INFO_EXTENSIONS
Definition: pi.h:130
@ PI_PLATFORM_INFO_PROFILE
Definition: pi.h:132
@ PI_PLATFORM_INFO_NAME
Definition: pi.h:131
@ PI_PLATFORM_INFO_VERSION
Definition: pi.h:134
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
pi_result piSamplerRetain(pi_sampler sampler)
pi_result piMemGetInfo(pi_mem mem, pi_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piextUSMHostAlloc(void **result_ptr, pi_context context, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates host memory accessible by the device.
pi_result piDeviceGetInfo(pi_device device, pi_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Returns requested info for provided native device Return PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT fo...
pi_result piKernelRetain(pi_kernel kernel)
pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms)
_pi_image_info
Definition: pi.h:355
pi_result piextEventGetNativeHandle(pi_event event, pi_native_handle *nativeHandle)
Gets the native handle of a PI event object.
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
@ PI_USM_ATOMIC_ACCESS
Definition: pi.h:1617
@ PI_USM_ACCESS
Definition: pi.h:1616
@ PI_USM_CONCURRENT_ACCESS
Definition: pi.h:1618
@ PI_USM_CONCURRENT_ATOMIC_ACCESS
Definition: pi.h:1619
@ PI_IMAGE_CHANNEL_ORDER_RGBA
Definition: pi.h:444
_pi_profiling_info
Definition: pi.h:531
@ PI_PROFILING_INFO_COMMAND_END
Definition: pi.h:535
@ PI_PROFILING_INFO_COMMAND_START
Definition: pi.h:534
@ PI_PROFILING_INFO_COMMAND_SUBMIT
Definition: pi.h:533
@ PI_PROFILING_INFO_COMMAND_QUEUED
Definition: pi.h:532
@ PI_MEM_TYPE_SHARED
Definition: pi.h:1633
@ PI_MEM_TYPE_UNKNOWN
Definition: pi.h:1630
@ PI_MEM_TYPE_DEVICE
Definition: pi.h:1632
@ PI_MEM_TYPE_HOST
Definition: pi.h:1631
constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE
Definition: pi.h:579
_pi_usm_migration_flags
Definition: pi.h:1639
pi_result piTearDown(void *PluginParameter)
API to notify that the plugin should clean up its resources.
_pi_device_type
Definition: pi.h:161
@ PI_DEVICE_TYPE_DEFAULT
The default device available in the PI plugin.
Definition: pi.h:162
@ PI_DEVICE_TYPE_GPU
A PI device that is a GPU.
Definition: pi.h:166
pi_result piContextRetain(pi_context context)
@ PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE
Definition: pi.h:175
uint64_t pi_uint64
Definition: pi.h:104
pi_bitfield pi_queue_properties
Definition: pi.h:577
pi_result piMemRetain(pi_mem mem)
pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query information from the sub-group from a kernel.
pi_result piextProgramSetSpecializationConstant(pi_program prog, pi_uint32 spec_id, size_t spec_size, const void *spec_value)
Sets a specialization constant to a specific value.
const pi_bool PI_FALSE
Definition: pi.h:477
pi_result piEventGetInfo(pi_event event, pi_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
#define __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64
PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device.
Definition: pi.h:734
pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, pi_uint32 num_entries, pi_device *devices, pi_uint32 *num_devices)
pi_result piQueueRelease(pi_queue command_queue)
pi_bitfield pi_sampler_properties
Definition: pi.h:511
pi_result piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, pi_uint32 numDevices, const pi_device *devices, bool pluginOwnsNativeHandle, pi_context *context)
Creates PI context object from a native handle.
pi_result piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_COPY
Definition: pi.h:548
pi_result piEnqueueMemBufferWriteRect(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
_pi_buffer_create_type
Definition: pi.h:474
@ PI_BUFFER_CREATE_TYPE_REGION
Definition: pi.h:474
pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, void *mapped_ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
constexpr pi_queue_properties PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:578
pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, size_t param_value_size, const void *param_value)
API to set attributes controlling kernel execution.
static constexpr pi_device_fp_config PI_FP_ROUND_TO_NEAREST
Definition: pi.h:644
pi_result piProgramGetInfo(pi_program program, pi_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piProgramBuild(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
uint32_t pi_uint32
Definition: pi.h:103
_pi_event_info
Definition: pi.h:372
@ PI_EVENT_INFO_COMMAND_EXECUTION_STATUS
Definition: pi.h:376
@ PI_EVENT_INFO_REFERENCE_COUNT
Definition: pi.h:377
@ PI_EVENT_INFO_COMMAND_TYPE
Definition: pi.h:375
@ PI_EVENT_INFO_CONTEXT
Definition: pi.h:374
@ PI_EVENT_INFO_COMMAND_QUEUE
Definition: pi.h:373
pi_result piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_event *event)
Creates PI event object from a native handle.
static constexpr pi_device_fp_config PI_FP_ROUND_TO_INF
Definition: pi.h:646
#define __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN
Definition: pi.h:735
intptr_t pi_context_properties
Definition: pi.h:503
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
pi_bitfield pi_mem_flags
Definition: pi.h:542
pi_result piEnqueueNativeKernel(pi_queue queue, void(*user_func)(void *), void *args, size_t cb_args, pi_uint32 num_mem_objects, const pi_mem *mem_list, const void **args_mem_loc, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piContextCreate(const pi_context_properties *properties, pi_uint32 num_devices, const pi_device *devices, void(*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data), void *user_data, pi_context *ret_context)
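A minimal sketch of enumerating a GPU device on a platform and creating a context around it; `platform` is assumed to have been obtained earlier (e.g. via piPlatformsGet, not shown):
  pi_device dev = nullptr;
  pi_uint32 num_devices = 0;
  piDevicesGet(platform, PI_DEVICE_TYPE_GPU, 1, &dev, &num_devices);

  pi_context ctx = nullptr;
  pi_result res = piContextCreate(/*properties=*/nullptr, /*num_devices=*/1,
                                  &dev, /*pfn_notify=*/nullptr,
                                  /*user_data=*/nullptr, &ctx);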
pi_result piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, pi_bool blocking_map, pi_map_flags map_flags, size_t offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event, void **ret_map)
pi_result piEventSetCallback(pi_event event, pi_int32 command_exec_callback_type, void(*pfn_notify)(pi_event event, pi_int32 event_command_status, void *user_data), void *user_data)
pi_result piQueueFlush(pi_queue command_queue)
pi_result piMemRelease(pi_mem mem)
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_USE
Definition: pi.h:547
void(* pi_context_extended_deleter)(void *user_data)
Definition: pi.h:1070
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:554
constexpr pi_map_flags PI_MAP_WRITE_INVALIDATE_REGION
Definition: pi.h:555
_pi_mem_type
Definition: pi.h:412
@ PI_MEM_TYPE_IMAGE1D
Definition: pi.h:417
@ PI_MEM_TYPE_IMAGE2D
Definition: pi.h:414
@ PI_MEM_TYPE_IMAGE3D
Definition: pi.h:415
pi_result piSamplerCreate(pi_context context, const pi_sampler_properties *sampler_properties, pi_sampler *result_sampler)
pi_result piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, pi_mem dst_image, pi_image_offset src_origin, pi_image_offset dst_origin, pi_image_region region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle)
Gets the native handle of a PI queue object.
_pi_sampler_info
Definition: pi.h:479
@ PI_SAMPLER_INFO_CONTEXT
Definition: pi.h:481
@ PI_SAMPLER_INFO_NORMALIZED_COORDS
Definition: pi.h:482
@ PI_SAMPLER_INFO_FILTER_MODE
Definition: pi.h:484
@ PI_SAMPLER_INFO_REFERENCE_COUNT
Definition: pi.h:480
@ PI_SAMPLER_INFO_ADDRESSING_MODE
Definition: pi.h:483
pi_result piProgramLink(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_programs, const pi_program *input_programs, void(*pfn_notify)(pi_program program, void *user_data), void *user_data, pi_program *ret_program)
Definition: pi_opencl.cpp:882
pi_result piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue)
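A hedged sketch of creating a profiling-enabled, out-of-order queue on an existing context/device pair (`ctx` and `dev` assumed):
  pi_queue queue = nullptr;
  pi_queue_properties props =
      PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  pi_result res = piQueueCreate(ctx, dev, props, &queue);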
pi_result piextMemGetNativeHandle(pi_mem mem, pi_native_handle *nativeHandle)
Gets the native handle of a PI mem object.
pi_result piEventSetStatus(pi_event event, pi_int32 execution_status)
static constexpr pi_device_fp_config PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT
Definition: pi.h:649
constexpr pi_map_flags PI_MAP_READ
Definition: pi.h:553
pi_result piextContextGetNativeHandle(pi_context context, pi_native_handle *nativeHandle)
Gets the native handle of a PI context object.
pi_result piPluginGetLastError(char **message)
API to get Plugin specific warning and error messages.
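A minimal sketch of retrieving the plugin's last warning/error message after a failed call; the message storage is owned by the plugin, so it is only read here:
  char *msg = nullptr;
  pi_result last = piPluginGetLastError(&msg);
  if (msg)
    fprintf(stderr, "PI plugin reported: %s (code %d)\n", msg,
            (int)last); // requires <cstdio>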
pi_result piProgramCreate(pi_context context, const void *il, size_t length, pi_program *res_program)
pi_result piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, pi_platform platform, pi_device *device)
Creates PI device object from a native handle.
pi_result piDeviceRetain(pi_device device)
pi_result piDeviceRelease(pi_device device)
pi_result piProgramCompile(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_headers, const pi_program *input_headers, const char **header_include_names, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
pi_result piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, pi_buffer_create_type buffer_create_type, void *buffer_create_info, pi_mem *ret_mem)
pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates device memory.
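A hedged sketch of allocating 1 KiB of device USM and copying host data into it; `ctx`, `dev`, and `queue` are assumed to be valid handles, and the allocation would later be released with piextUSMFree (not shown here):
  void *dev_ptr = nullptr;
  pi_result res = piextUSMDeviceAlloc(&dev_ptr, ctx, dev,
                                      /*properties=*/nullptr,
                                      /*size=*/1024, /*alignment=*/0);
  if (res == PI_SUCCESS) {
    char host_buf[1024] = {};
    pi_event copy_done = nullptr;
    piextUSMEnqueueMemcpy(queue, /*blocking=*/PI_TRUE, dev_ptr, host_buf,
                          sizeof(host_buf), /*num_events_in_waitlist=*/0,
                          /*events_waitlist=*/nullptr, &copy_done);
  }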
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_FILTER_MODE
Definition: pi.h:515
constexpr pi_mem_flags PI_MEM_FLAGS_ACCESS_RW
Definition: pi.h:544
_pi_kernel_info
Definition: pi.h:335
@ PI_KERNEL_INFO_PROGRAM
Definition: pi.h:340
@ PI_KERNEL_INFO_FUNCTION_NAME
Definition: pi.h:336
@ PI_KERNEL_INFO_REFERENCE_COUNT
Definition: pi.h:338
@ PI_KERNEL_INFO_CONTEXT
Definition: pi.h:339
@ PI_KERNEL_INFO_ATTRIBUTES
Definition: pi.h:341
@ PI_KERNEL_INFO_NUM_ARGS
Definition: pi.h:337
_pi_command_type
Definition: pi.h:380
@ PI_COMMAND_TYPE_MARKER
Definition: pi.h:399
@ PI_COMMAND_TYPE_IMAGE_WRITE
Definition: pi.h:393
@ PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT
Definition: pi.h:389
@ PI_COMMAND_TYPE_MEM_BUFFER_READ
Definition: pi.h:382
@ PI_COMMAND_TYPE_MEM_BUFFER_UNMAP
Definition: pi.h:386
@ PI_COMMAND_TYPE_IMAGE_READ
Definition: pi.h:392
@ PI_COMMAND_TYPE_IMAGE_COPY
Definition: pi.h:394
@ PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT
Definition: pi.h:387
@ PI_COMMAND_TYPE_MEM_BUFFER_COPY
Definition: pi.h:384
@ PI_COMMAND_TYPE_MEM_BUFFER_MAP
Definition: pi.h:385
@ PI_COMMAND_TYPE_MEM_BUFFER_WRITE
Definition: pi.h:383
@ PI_COMMAND_TYPE_NDRANGE_KERNEL
Definition: pi.h:381
@ PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT
Definition: pi.h:388
@ PI_COMMAND_TYPE_MEM_BUFFER_FILL
Definition: pi.h:391
@ PI_COMMAND_TYPE_USER
Definition: pi.h:390
pi_result piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
_pi_mem_alloc_info
Definition: pi.h:1622
@ PI_MEM_ALLOC_BASE_PTR
Definition: pi.h:1624
@ PI_MEM_ALLOC_TYPE
Definition: pi.h:1623
@ PI_MEM_ALLOC_DEVICE
Definition: pi.h:1626
@ PI_MEM_ALLOC_SIZE
Definition: pi.h:1625
pi_result piclProgramCreateWithSource(pi_context context, pi_uint32 count, const char **strings, const size_t *lengths, pi_program *ret_program)
@ PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
Definition: pi.h:179
pi_result piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, pi_bool blocking_write, pi_image_offset origin, pi_image_region region, size_t input_row_pitch, size_t input_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
_pi_context_info
Definition: pi.h:315
@ PI_CONTEXT_INFO_NUM_DEVICES
Definition: pi.h:318
@ PI_CONTEXT_INFO_DEVICES
Definition: pi.h:316
@ PI_CONTEXT_INFO_REFERENCE_COUNT
Definition: pi.h:320
@ PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
Definition: pi.h:323
pi_result piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, void *host_ptr, pi_mem *ret_mem, const pi_mem_properties *properties=nullptr)
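A minimal sketch of creating a read/write buffer and filling it from host memory; `ctx` and `queue` are assumed handles, and the write is blocking so the buffer can be released immediately afterwards:
  pi_mem buf = nullptr;
  pi_result res = piMemBufferCreate(ctx, PI_MEM_FLAGS_ACCESS_RW, 1024,
                                    /*host_ptr=*/nullptr, &buf);
  if (res == PI_SUCCESS) {
    char host[1024] = {};
    pi_event done = nullptr;
    piEnqueueMemBufferWrite(queue, buf, /*blocking_write=*/PI_TRUE,
                            /*offset=*/0, sizeof(host), host,
                            /*num_events_in_wait_list=*/0, nullptr, &done);
    piMemRelease(buf);
  }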
_pi_sampler_filter_mode
Definition: pi.h:498
@ PI_SAMPLER_FILTER_MODE_NEAREST
Definition: pi.h:499
pi_result piEventRelease(pi_event event)
pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piContextRelease(pi_context context)
_pi_event_status
Definition: pi.h:122
pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, const size_t *origin, const size_t *region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piProgramCreateWithBinary(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, size_t num_metadata_entries, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *ret_program)
Creates a PI program for a context and loads the given binary into it.
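A hedged sketch of wrapping a prebuilt device binary (`blob` and `blob_size` are assumed inputs) in a PI program and building it for one device:
  pi_program prog = nullptr;
  pi_int32 bin_status = 0;
  const unsigned char *binaries[] = {blob};
  pi_result res = piProgramCreateWithBinary(ctx, /*num_devices=*/1, &dev,
                                            &blob_size, binaries,
                                            /*num_metadata_entries=*/0,
                                            /*metadata=*/nullptr,
                                            &bin_status, &prog);
  if (res == PI_SUCCESS)
    res = piProgramBuild(prog, 1, &dev, /*options=*/"",
                         /*pfn_notify=*/nullptr, /*user_data=*/nullptr);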
pi_result piextProgramGetNativeHandle(pi_program program, pi_native_handle *nativeHandle)
Gets the native handle of a PI program object.
pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates memory accessible on both host and device.
pi_result piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM Memcpy API.
static constexpr pi_device_fp_config PI_FP_ROUND_TO_ZERO
Definition: pi.h:645
static constexpr pi_device_fp_config PI_FP_FMA
Definition: pi.h:647
pi_result piEnqueueMemBufferReadRect(pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
_pi_sampler_addressing_mode
Definition: pi.h:490
@ PI_SAMPLER_ADDRESSING_MODE_CLAMP
Definition: pi.h:494
@ PI_SAMPLER_ADDRESSING_MODE_NONE
Definition: pi.h:495
#define _PI_PLUGIN_VERSION_CHECK(PI_API_VERSION, PI_PLUGIN_VERSION)
Definition: pi.h:81
pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Sets up pointer arguments for CL kernels.
_pi_program_info
Definition: pi.h:303
@ PI_PROGRAM_INFO_NUM_DEVICES
Definition: pi.h:306
@ PI_PROGRAM_INFO_SOURCE
Definition: pi.h:308
@ PI_PROGRAM_INFO_BINARY_SIZES
Definition: pi.h:309
@ PI_PROGRAM_INFO_KERNEL_NAMES
Definition: pi.h:312
@ PI_PROGRAM_INFO_CONTEXT
Definition: pi.h:305
@ PI_PROGRAM_INFO_REFERENCE_COUNT
Definition: pi.h:304
@ PI_PROGRAM_INFO_BINARIES
Definition: pi.h:310
@ PI_PROGRAM_INFO_DEVICES
Definition: pi.h:307
pi_result piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
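A minimal sketch of querying how many arguments a kernel expects; `kern` is an assumed pi_kernel handle:
  pi_uint32 num_args = 0;
  piKernelGetInfo(kern, PI_KERNEL_INFO_NUM_ARGS,
                  sizeof(num_args), &num_args, nullptr);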
intptr_t pi_device_partition_property
Definition: pi.h:615
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piextUSMGetMemAllocInfo(pi_context context, const void *ptr, pi_mem_alloc_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query information about USM allocated pointers. Valid queries: PI_MEM_ALLOC_TYPE returns host/d...
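A hedged sketch of classifying a USM pointer and querying its allocation size; `ctx` and `ptr` are assumed values, and the result type used for PI_MEM_ALLOC_TYPE (pi_usm_type) is an assumption of this sketch:
  pi_usm_type kind{};     // assumed result type for the PI_MEM_ALLOC_TYPE query
  size_t alloc_size = 0;
  piextUSMGetMemAllocInfo(ctx, ptr, PI_MEM_ALLOC_TYPE,
                          sizeof(kind), &kind, nullptr);
  piextUSMGetMemAllocInfo(ctx, ptr, PI_MEM_ALLOC_SIZE,
                          sizeof(alloc_size), &alloc_size, nullptr);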
pi_result piPlatformGetInfo(pi_platform platform, pi_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piextMemCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_mem *mem)
Creates PI mem object from a native handle.
pi_result piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *function_name, pi_uint64 *function_pointer_ret)
Retrieves a device function pointer to a user-defined function.
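A minimal sketch of looking up the device-side address of a named function in a built program; the function name is hypothetical:
  pi_uint64 fn_addr = 0;
  pi_result res = piextGetDeviceFunctionPointer(dev, prog, "my_device_func",
                                                &fn_addr);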
pi_result piContextGetInfo(pi_context context, pi_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piextDeviceSelectBinary(pi_device device, pi_device_binary *binaries, pi_uint32 num_binaries, pi_uint32 *selected_binary_ind)
Selects the most appropriate device binary based on runtime information and the IR characteristics.
constexpr pi_device_exec_capabilities PI_DEVICE_EXEC_CAPABILITIES_KERNEL
Definition: pi.h:506
static constexpr pi_device_fp_config PI_FP_INF_NAN
Definition: pi.h:643
_pi_mem_info
Definition: pi.h:934
pi_result piQueueRetain(pi_queue command_queue)
pi_result piextContextSetExtendedDeleter(pi_context context, pi_context_extended_deleter func, void *user_data)
pi_result piSamplerRelease(pi_sampler sampler)
pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value)
pi_result piDevicePartition(pi_device device, const pi_device_partition_property *properties, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices)
pi_result piEventRetain(pi_event event)
_pi_program_build_info
Definition: pi.h:137
@ PI_PROGRAM_BUILD_INFO_LOG
Definition: pi.h:140
@ PI_PROGRAM_BUILD_INFO_OPTIONS
Definition: pi.h:139
@ PI_PROGRAM_BUILD_INFO_STATUS
Definition: pi.h:138
C++ wrapper of extern "C" PI interfaces.
#define __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(parameter)
Definition: pi.hpp:104
void(* pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData)
Definition: pi_cuda.hpp:604
std::unique_lock< std::mutex > _pi_stream_guard
Definition: pi_cuda.hpp:66
thread_local char ErrorMessage[MaxMessageSize]
thread_local pi_result ErrorMessageCode
constexpr size_t MaxMessageSize
static void setErrorMessage(const char *message, pi_result error_code)
pi_result hip_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, pi_buffer_create_type buffer_create_type, void *buffer_create_info, pi_mem *memObj)
Implements a buffer partition in the HIP backend.
Definition: pi_hip.cpp:2229
pi_result hip_piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_bool blocking_read, const size_t *origin, const size_t *region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:4465
pi_result hip_piextContextSetExtendedDeleter(pi_context context, pi_context_extended_deleter function, void *user_data)
Definition: pi_hip.cpp:1016
pi_result hip_piProgramBuild(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
Loads the images from a PI program into a HIP module that can be used later on to extract functions (kernels).
Definition: pi_hip.cpp:3128
pi_result hip_piextContextGetNativeHandle(pi_context context, pi_native_handle *nativeHandle)
Gets the native HIP handle of a PI context object.
Definition: pi_hip.cpp:2051
pi_result hip_piContextRetain(pi_context context)
Definition: pi_hip.cpp:1008
pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3436
pi_result hip_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_hip.cpp:3606
pi_result hip_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Gets information from a PI sampler object.
Definition: pi_hip.cpp:3938
pi_result hip_piEventCreate(pi_context context, pi_event *event)
Definition: pi_hip.cpp:3615
pi_result hip_piDevicePartition(pi_device device, const pi_device_partition_property *properties, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices)
Not applicable to HIP; devices cannot be partitioned.
Definition: pi_hip.cpp:1024
pi_result hip_piProgramGetInfo(pi_program program, pi_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3209
std::string getKernelNames(pi_program program)
Finds kernel names by searching for entry points in the PTX source, as the HIP driver API doesn't expose them.
Definition: pi_hip.cpp:741
pi_result hip_piextUSMHostAlloc(void **result_ptr, pi_context context, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
USM: Implements USM Host allocations using HIP Pinned Memory.
Definition: pi_hip.cpp:4812
pi_result hip_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, pi_uint32 num_devices, const pi_device *devices, bool ownNativeHandle, pi_context *context)
Creates a PI context object from a HIP context handle.
Definition: pi_hip.cpp:2065
pi_result hip_piPlatformGetInfo(pi_platform platform, pi_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:914
pi_result hip_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Definition: pi_hip.cpp:4907
pi_result hip_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Definition: pi_hip.cpp:4945
pi_result hip_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
Definition: pi_hip.cpp:2758
pi_result hip_piEventGetInfo(pi_event event, pi_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3622
pi_result hip_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, size_t offset, size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:2584
pi_result hip_piQueueRelease(pi_queue command_queue)
Definition: pi_hip.cpp:2438
pi_result hip_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event)
USM: memadvise API to govern behavior of automatic migration mechanisms.
Definition: pi_hip.cpp:5023
pi_result hip_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_hip.cpp:3400
pi_result hip_piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *func_name, pi_uint64 *func_pointer_ret)
Definition: pi_hip.cpp:1072
pi_result hip_piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
USM: Implements USM Shared allocations using HIP Managed Memory.
Definition: pi_hip.cpp:4858
pi_result hip_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value)
Definition: pi_hip.cpp:2722
pi_result hip_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_hip.cpp:2705
pi_result hip_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, pi_bool blocking_write, const size_t *origin, const size_t *region, size_t input_row_pitch, size_t input_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_hip.cpp:4533
pi_result hip_piDeviceRetain(pi_device device)
Definition: pi_hip.cpp:982