DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_cuda.cpp
//==---------- pi_cuda.cpp - CUDA Plugin -----------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <pi_cuda.hpp>
#include <sycl/detail/defines.hpp>
#include <sycl/detail/pi.hpp>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <regex>
#include <sstream>
#include <vector>

// Forward declarations
void enableCUDATracing();
void disableCUDATracing();
namespace {
std::string getCudaVersionString() {
  int driver_version = 0;
  cuDriverGetVersion(&driver_version);
  // The version is returned as (1000 * major + 10 * minor).
  std::stringstream stream;
  stream << "CUDA " << driver_version / 1000 << "."
         << driver_version % 1000 / 10;
  return stream.str();
}
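// For example, a driver_version of 11040 decodes to "CUDA 11.4":
// 11040 / 1000 == 11 and (11040 % 1000) / 10 == 4.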

pi_result map_error(CUresult result) {
  switch (result) {
  case CUDA_SUCCESS:
    return PI_SUCCESS;
  case CUDA_ERROR_NOT_PERMITTED:
    return PI_ERROR_INVALID_OPERATION;
  case CUDA_ERROR_INVALID_CONTEXT:
    return PI_ERROR_INVALID_CONTEXT;
  case CUDA_ERROR_INVALID_DEVICE:
    return PI_ERROR_INVALID_DEVICE;
  case CUDA_ERROR_INVALID_VALUE:
    return PI_ERROR_INVALID_VALUE;
  case CUDA_ERROR_OUT_OF_MEMORY:
    return PI_ERROR_OUT_OF_HOST_MEMORY;
  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
    return PI_ERROR_OUT_OF_RESOURCES;
  default:
    return PI_ERROR_UNKNOWN;
  }
}
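// Illustrative sketch (hypothetical caller, not part of the plugin): a call
// site that wants to tolerate a failure instead of throwing through
// check_error can translate the raw CUresult itself, e.g.
//   CUresult ret = cuCtxGetCurrent(&ctxt);
//   if (ret != CUDA_SUCCESS)
//     return map_error(ret);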

// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR
constexpr size_t MaxMessageSize = 256;
thread_local pi_result ErrorMessageCode = PI_SUCCESS;
thread_local char ErrorMessage[MaxMessageSize];

// Utility function for setting the error message and code returned by
// cuda_piPluginGetLastError
static void setErrorMessage(const char *message, pi_result error_code) {
  // Strictly less than, to leave room for the terminating null byte
  assert(strlen(message) < MaxMessageSize);
  strcpy(ErrorMessage, message);
  ErrorMessageCode = error_code;
}

// Returns plugin specific error and warning messages
pi_result cuda_piPluginGetLastError(char **message) {
  *message = &ErrorMessage[0];
  return ErrorMessageCode;
}

// Iterates over the event wait list, returns correct pi_result error codes.
// Invokes the callback for the latest event of each queue in the wait list.
// The callback must take a single pi_event argument and return a pi_result.
template <typename Func>
pi_result forLatestEvents(const pi_event *event_wait_list,
                          std::size_t num_events_in_wait_list, Func &&f) {

  if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
    return PI_ERROR_INVALID_EVENT_WAIT_LIST;
  }

  // Fast path if we only have a single event
  if (num_events_in_wait_list == 1) {
    return f(event_wait_list[0]);
  }

  std::vector<pi_event> events{event_wait_list,
                               event_wait_list + num_events_in_wait_list};
  std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
    // Tiered sort creating sublists of streams (smallest value first) in
    // which the corresponding events are sorted into a sequence of newest
    // first.
    return e0->get_stream() < e1->get_stream() ||
           (e0->get_stream() == e1->get_stream() &&
            e0->get_event_id() > e1->get_event_id());
  });

  bool first = true;
  CUstream lastSeenStream = 0;
  for (pi_event event : events) {
    if (!event || (!first && event->get_stream() == lastSeenStream)) {
      continue;
    }

    first = false;
    lastSeenStream = event->get_stream();

    auto result = f(event);
    if (result != PI_SUCCESS) {
      return result;
    }
  }

  return PI_SUCCESS;
}

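/// Converts a CUDA error into a PI error code and reports it.
/// Unless SYCL_PI_SUPPRESS_ERROR_MESSAGE is set, the error name, description
/// and source location are printed to stderr; if PI_CUDA_ABORT is set the
/// process aborts instead of throwing. CUDA_ERROR_DEINITIALIZED is treated as
/// success so that tear-down after driver shutdown does not fail.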
pi_result check_error(CUresult result, const char *function, int line,
                      const char *file) {
  if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) {
    return PI_SUCCESS;
  }

  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) {
    const char *errorString = nullptr;
    const char *errorName = nullptr;
    cuGetErrorName(result, &errorName);
    cuGetErrorString(result, &errorString);
    std::stringstream ss;
    ss << "\nPI CUDA ERROR:"
       << "\n\tValue: " << result
       << "\n\tName: " << errorName
       << "\n\tDescription: " << errorString
       << "\n\tFunction: " << function << "\n\tSource Location: " << file
       << ":" << line << "\n"
       << std::endl;
    std::cerr << ss.str();
  }

  if (std::getenv("PI_CUDA_ABORT") != nullptr) {
    std::abort();
  }

  throw map_error(result);
}

#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)

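// Usage pattern throughout this file: wrap every raw CUDA Driver API call,
// e.g. PI_CHECK_ERROR(cuCtxGetCurrent(&ctxt)), so that failures carry the
// calling function and source location and surface as pi_result exceptions.
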
/// RAII helper that makes the CUcontext of a pi_context current on the
/// calling thread. The context is only switched if it is not already active,
/// avoiding a redundant cuCtxSetCurrent call. Note that the destructor does
/// not restore the previously active context.
class ScopedContext {
public:
  ScopedContext(pi_context ctxt) {
    if (!ctxt) {
      throw PI_ERROR_INVALID_CONTEXT;
    }

    set_context(ctxt->get());
  }

  ScopedContext(CUcontext ctxt) { set_context(ctxt); }

  ~ScopedContext() {}

private:
  void set_context(CUcontext desired) {
    CUcontext original = nullptr;

    PI_CHECK_ERROR(cuCtxGetCurrent(&original));

    // Make sure the desired context is active on the current thread, setting
    // it if necessary
    if (original != desired) {
      PI_CHECK_ERROR(cuCtxSetCurrent(desired));
    }
  }
};
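// Illustrative usage (hypothetical caller): activate a context for the
// duration of a scope before issuing driver calls, as the enqueue paths
// below do:
//   {
//     ScopedContext active(command_queue->get_context());
//     PI_CHECK_ERROR(cuStreamSynchronize(stream));
//   } // the previous context is *not* restored here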

template <typename T, typename Assign>
pi_result getInfoImpl(size_t param_value_size, void *param_value,
                      size_t *param_value_size_ret, T value, size_t value_size,
                      Assign &&assign_func) {

  if (param_value != nullptr) {

    if (param_value_size < value_size) {
      return PI_ERROR_INVALID_VALUE;
    }

    assign_func(param_value, value, value_size);
  }

  if (param_value_size_ret != nullptr) {
    *param_value_size_ret = value_size;
  }

  return PI_SUCCESS;
}

template <typename T>
pi_result getInfo(size_t param_value_size, void *param_value,
                  size_t *param_value_size_ret, T value) {

  auto assignment = [](void *param_value, T value, size_t value_size) {
    // Ignore unused parameter
    (void)value_size;

    *static_cast<T *>(param_value) = value;
  };

  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, sizeof(T), assignment);
}

template <typename T>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, array_length * sizeof(T), memcpy);
}

template <>
pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret,
                                const char *value) {
  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
                      param_value_size_ret, value);
}
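// Illustrative two-step query (hypothetical values): callers first ask for
// the required size, then for the data itself.
//   size_t size = 0;
//   getInfo<const char *>(0, nullptr, &size, "FULL PROFILE"); // size == 13
//   std::vector<char> buf(size);
//   getInfo<const char *>(buf.size(), buf.data(), nullptr, "FULL PROFILE");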

int getAttribute(pi_device device, CUdevice_attribute attribute) {
  int value;
  sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS);
  return value;
}

// Determine local work sizes that result in uniform work groups.
// The default threadsPerBlock only requires handling the first work_dim
// dimension.
void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size,
                        const size_t maxThreadsPerBlock[3], pi_kernel kernel,
                        pi_uint32 local_size) {
  assert(threadsPerBlock != nullptr);
  assert(global_work_size != nullptr);
  assert(kernel != nullptr);
  int recommendedBlockSize, minGrid;

  PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
      &minGrid, &recommendedBlockSize, kernel->get(), NULL, local_size,
      maxThreadsPerBlock[0]));

  (void)minGrid; // Not used, avoid warnings

  threadsPerBlock[0] = std::min(
      maxThreadsPerBlock[0],
      std::min(global_work_size[0], static_cast<size_t>(recommendedBlockSize)));

  // Find a local work group size that is a divisor of the global
  // work group size to produce uniform work groups.
  while (0u != (global_work_size[0] % threadsPerBlock[0])) {
    --threadsPerBlock[0];
  }
}
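// Worked example (hypothetical numbers): with global_work_size[0] == 1000 and
// an occupancy-recommended block size of 256, the initial guess is 256; the
// loop then decrements to 250, the largest divisor of 1000 not exceeding 256,
// so every work group ends up uniform.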

pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream,
                            pi_uint32 num_events_in_wait_list,
                            const pi_event *event_wait_list) {
  if (!event_wait_list) {
    return PI_SUCCESS;
  }
  try {
    ScopedContext active(command_queue->get_context());

    auto result = forLatestEvents(
        event_wait_list, num_events_in_wait_list,
        [stream](pi_event event) -> pi_result {
          if (event->get_stream() == stream) {
            return PI_SUCCESS;
          } else {
            return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0));
          }
        });

    if (result != PI_SUCCESS) {
      return result;
    }
    return PI_SUCCESS;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_UNKNOWN;
  }
}

} // anonymous namespace

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {
namespace pi {

// Report error and no return (keeps compiler from printing warnings).
// TODO: Probably change that to throw a catchable exception,
// but for now it is useful to see every failure.
//
[[noreturn]] void die(const char *Message) {
  std::cerr << "pi_die: " << Message << std::endl;
  std::terminate();
}

// Reports error messages
void cuPrint(const char *Message) {
  std::cerr << "pi_print: " << Message << std::endl;
}

void assertion(bool Condition, const char *Message) {
  if (!Condition)
    die(Message);
}

} // namespace pi
} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl

//--------------
// PI object implementation

extern "C" {

// Required in a number of functions, so forward declare here
pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
                                   pi_uint32 num_events_in_wait_list,
                                   const pi_event *event_wait_list,
                                   pi_event *event);
pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
                                              pi_uint32 num_events_in_wait_list,
                                              const pi_event *event_wait_list,
                                              pi_event *event);
pi_result cuda_piEventRelease(pi_event event);
pi_result cuda_piEventRetain(pi_event event);
pi_result cuda_piQueueRetain(pi_queue queue);
pi_result cuda_piQueueRelease(pi_queue queue);
pi_result cuda_piContextRetain(pi_context context);
pi_result cuda_piContextRelease(pi_context context);
pi_result cuda_piDeviceRelease(pi_device device);
pi_result cuda_piMemRelease(pi_mem mem);
pi_result cuda_piProgramRelease(pi_program program);
pi_result cuda_piKernelRelease(pi_kernel kernel);

} // extern "C"

void _pi_queue::compute_stream_wait_for_barrier_if_needed(CUstream stream,
                                                          pi_uint32 stream_i) {
  if (barrier_event_ && !compute_applied_barrier_[stream_i]) {
    PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0));
    compute_applied_barrier_[stream_i] = true;
  }
}

void _pi_queue::transfer_stream_wait_for_barrier_if_needed(
    CUstream stream, pi_uint32 stream_i) {
  if (barrier_event_ && !transfer_applied_barrier_[stream_i]) {
    PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0));
    transfer_applied_barrier_[stream_i] = true;
  }
}

CUstream _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) {
  pi_uint32 stream_i;
  pi_uint32 token;
  while (true) {
    if (num_compute_streams_ < compute_streams_.size()) {
      // The check above is for performance: don't lock the mutex on every
      // call.
      std::lock_guard<std::mutex> guard(compute_stream_mutex_);
      // The second check is done after the mutex is locked so other threads
      // cannot change num_compute_streams_ in the meantime.
      if (num_compute_streams_ < compute_streams_.size()) {
        PI_CHECK_ERROR(
            cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_));
      }
    }
    token = compute_stream_idx_++;
    stream_i = token % compute_streams_.size();
    // If a stream has been reused before it was next selected in round-robin
    // fashion, delay its next use and instead select another one that is more
    // likely to have completed all of its enqueued work.
    if (delay_compute_[stream_i]) {
      delay_compute_[stream_i] = false;
    } else {
      break;
    }
  }
  if (stream_token) {
    *stream_token = token;
  }
  CUstream res = compute_streams_[stream_i];
  return res;
}

CUstream _pi_queue::get_next_compute_stream(pi_uint32 num_events_in_wait_list,
                                            const pi_event *event_wait_list,
                                            _pi_stream_guard &guard,
                                            pi_uint32 *stream_token) {
  for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
    pi_uint32 token = event_wait_list[i]->get_compute_stream_token();
    if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) {
      std::unique_lock<std::mutex> compute_sync_guard(
          compute_stream_sync_mutex_);
      // Redo the check after taking the lock to avoid data races on
      // last_sync_compute_streams_.
      if (can_reuse_stream(token)) {
        pi_uint32 stream_i = token % delay_compute_.size();
        delay_compute_[stream_i] = true;
        if (stream_token) {
          *stream_token = token;
        }
        guard = _pi_stream_guard{std::move(compute_sync_guard)};
        CUstream res = event_wait_list[i]->get_stream();
        return res;
      }
    }
  }
  guard = {};
  return get_next_compute_stream(stream_token);
}

CUstream _pi_queue::get_next_transfer_stream() {
  if (transfer_streams_.empty()) { // for example, in an in-order queue
    return get_next_compute_stream();
  }
  if (num_transfer_streams_ < transfer_streams_.size()) {
    // The check above is for performance: don't lock the mutex on every
    // call.
    std::lock_guard<std::mutex> guard(transfer_stream_mutex_);
    // The second check is done after the mutex is locked so other threads
    // cannot change num_transfer_streams_ in the meantime.
    if (num_transfer_streams_ < transfer_streams_.size()) {
      PI_CHECK_ERROR(
          cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_));
    }
  }
  pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size();
  CUstream res = transfer_streams_[stream_i];
  return res;
}
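// Design note: the queue maintains separate pools of compute and transfer
// streams and hands them out round-robin. delay_compute_ biases selection
// away from streams that were just reused through an event in a wait list,
// so new work is more likely to land on a stream that has already drained.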

_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue,
                     CUstream stream, pi_uint32 stream_token)
    : commandType_{type}, refCount_{1}, has_ownership_{true},
      hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false},
      streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr},
      evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} {

  bool profilingEnabled = queue_->properties_ & PI_QUEUE_PROFILING_ENABLE;

  PI_CHECK_ERROR(cuEventCreate(
      &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));

  if (profilingEnabled) {
    PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
    PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
  }

  if (queue_ != nullptr) {
    cuda_piQueueRetain(queue_);
  }
  cuda_piContextRetain(context_);
}

_pi_event::_pi_event(pi_context context, CUevent eventNative)
    : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false},
      hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false},
      streamToken_{std::numeric_limits<pi_uint32>::max()}, evEnd_{eventNative},
      evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr},
      context_{context} {
  cuda_piContextRetain(context_);
}

_pi_event::~_pi_event() {
  if (queue_ != nullptr) {
    cuda_piQueueRelease(queue_);
  }
  cuda_piContextRelease(context_);
}

pi_result _pi_event::start() {
  assert(!is_started());
  pi_result result = PI_SUCCESS;

  try {
    if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
      // NOTE: This relies on the default stream to be unused.
      result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
      result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_));
    }
  } catch (pi_result error) {
    result = error;
  }

  isStarted_ = true;
  return result;
}

bool _pi_event::is_completed() const noexcept {
  if (!isRecorded_) {
    return false;
  }
  if (!hasBeenWaitedOn_) {
    const CUresult ret = cuEventQuery(evEnd_);
    if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) {
      PI_CHECK_ERROR(ret);
      return false;
    }
    if (ret == CUDA_ERROR_NOT_READY) {
      return false;
    }
  }
  return true;
}

pi_uint64 _pi_event::get_queued_time() const {
  float milliSeconds = 0.0f;
  assert(is_started());

  PI_CHECK_ERROR(
      cuEventElapsedTime(&milliSeconds, _pi_platform::evBase_, evQueued_));
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}

pi_uint64 _pi_event::get_start_time() const {
  float milliSeconds = 0.0f;
  assert(is_started());

  PI_CHECK_ERROR(
      cuEventElapsedTime(&milliSeconds, _pi_platform::evBase_, evStart_));
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}

pi_uint64 _pi_event::get_end_time() const {
  float milliSeconds = 0.0f;
  assert(is_started() && is_recorded());

  PI_CHECK_ERROR(
      cuEventElapsedTime(&milliSeconds, _pi_platform::evBase_, evEnd_));
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}
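// cuEventElapsedTime reports the time between two events in milliseconds;
// multiplying by 1.0e6 converts that to the nanoseconds expected by PI
// profiling queries. All three timestamps above are measured relative to the
// platform-wide base event evBase_.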

pi_result _pi_event::record() {

  if (is_recorded() || !is_started()) {
    return PI_ERROR_INVALID_EVENT;
  }

  pi_result result = PI_ERROR_INVALID_OPERATION;

  if (!queue_) {
    return PI_ERROR_INVALID_QUEUE;
  }

  try {
    eventId_ = queue_->get_next_event_id();
    if (eventId_ == 0) {
      sycl::detail::pi::die(
          "Unrecoverable program state reached in event identifier overflow");
    }
    result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_));
  } catch (pi_result error) {
    result = error;
  }

  if (result == PI_SUCCESS) {
    isRecorded_ = true;
  }

  return result;
}

pi_result _pi_event::wait() {
  pi_result retErr;
  try {
    retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
    hasBeenWaitedOn_ = true;
  } catch (pi_result error) {
    retErr = error;
  }

  return retErr;
}

pi_result _pi_event::release() {
  if (!backend_has_ownership())
    return PI_SUCCESS;

  assert(queue_ != nullptr);

  PI_CHECK_ERROR(cuEventDestroy(evEnd_));

  if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
    PI_CHECK_ERROR(cuEventDestroy(evQueued_));
    PI_CHECK_ERROR(cuEventDestroy(evStart_));
  }

  return PI_SUCCESS;
}

// Makes all future work submitted to queue wait for all work captured in
// event.
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
  // For native events, the cuStreamWaitEvent call is used.
  // This makes all future work submitted to stream wait for all
  // work captured in event.
  queue->for_each_stream([e = event->get()](CUstream s) {
    PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0));
  });
  return PI_SUCCESS;
}

_pi_program::_pi_program(pi_context ctxt)
    : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1},
      context_{ctxt}, kernelReqdWorkGroupSizeMD_{} {
  cuda_piContextRetain(context_);
}

_pi_program::~_pi_program() { cuda_piContextRelease(context_); }

bool get_kernel_metadata(std::string metadataName, const char *tag,
                         std::string &kernelName) {
  const size_t tagLength = strlen(tag);
  const size_t metadataNameLength = metadataName.length();
  if (metadataNameLength >= tagLength &&
      metadataName.compare(metadataNameLength - tagLength, tagLength, tag) ==
          0) {
    kernelName = metadataName.substr(0, metadataNameLength - tagLength);
    return true;
  }
  return false;
}
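// For example, with a tag of "@reqd_work_group_size" and a metadataName of
// "myKernel@reqd_work_group_size", kernelName becomes "myKernel" and the
// function returns true; a name without that suffix returns false.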

pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata,
                                    size_t length) {
  for (size_t i = 0; i < length; ++i) {
    const pi_device_binary_property metadataElement = metadata[i];
    std::string metadataElementName{metadataElement->Name};
    std::string kernelName;

    // If metadata is reqd_work_group_size, record it for the corresponding
    // kernel name.
    if (get_kernel_metadata(metadataElementName,
                            __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE,
                            kernelName)) {
      assert(metadataElement->ValSize ==
                 sizeof(std::uint64_t) + sizeof(std::uint32_t) * 3 &&
             "Unexpected size for reqd_work_group_size metadata");

      // Get pointer to data, skipping 64-bit size at the start of the data.
      const auto *reqdWorkGroupElements =
          reinterpret_cast<const std::uint32_t *>(metadataElement->ValAddr) + 2;
      kernelReqdWorkGroupSizeMD_[kernelName] =
          std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1],
                          reqdWorkGroupElements[2]);
    }
  }
  return PI_SUCCESS;
}

pi_result _pi_program::set_binary(const char *source, size_t length) {
  assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
         "Re-setting program binary data which has already been set");
  binary_ = source;
  binarySizeInBytes_ = length;
  return PI_SUCCESS;
}

pi_result _pi_program::build_program(const char *build_options) {

  this->buildOptions_ = build_options;

  constexpr const unsigned int numberOfOptions = 4u;

  CUjit_option options[numberOfOptions];
  void *optionVals[numberOfOptions];

  // Pass a buffer for info messages
  options[0] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[0] = (void *)infoLog_;
  // Pass the size of the info buffer
  options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[1] = (void *)(long)MAX_LOG_SIZE;
  // Pass a buffer for error message
  options[2] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[2] = (void *)errorLog_;
  // Pass the size of the error buffer
  options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[3] = (void *)(long)MAX_LOG_SIZE;

  auto result = PI_CHECK_ERROR(
      cuModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
                         numberOfOptions, options, optionVals));

  const auto success = (result == PI_SUCCESS);

  buildStatus_ = success ? PI_PROGRAM_BUILD_STATUS_SUCCESS
                         : PI_PROGRAM_BUILD_STATUS_ERROR;

  // If no exception, result is correct
  return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
}
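// Design note: cuModuleLoadDataEx takes parallel arrays of CUjit_option keys
// and type-erased values; the info and error log buffers registered above
// are what later surface in the program build log.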

/// Finds kernel names by searching for entry points in the PTX source.
/// Not implemented for CUDA: the driver API does not expose an operation to
/// enumerate the kernels in a module.
std::string getKernelNames(pi_program) {
  sycl::detail::pi::die("getKernelNames not implemented");
  return {};
}

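/// RAII wrapper that calls the matching cuda_pi*Release function on the held
/// PI object when the guard is destroyed, unless dismiss() has been called.
/// Used to drop a reference on early-exit error paths.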
template <typename T> class ReleaseGuard {
private:
  T Captive;

  static pi_result callRelease(pi_device Captive) {
    return cuda_piDeviceRelease(Captive);
  }

  static pi_result callRelease(pi_context Captive) {
    return cuda_piContextRelease(Captive);
  }

  static pi_result callRelease(pi_mem Captive) {
    return cuda_piMemRelease(Captive);
  }

  static pi_result callRelease(pi_program Captive) {
    return cuda_piProgramRelease(Captive);
  }

  static pi_result callRelease(pi_kernel Captive) {
    return cuda_piKernelRelease(Captive);
  }

  static pi_result callRelease(pi_queue Captive) {
    return cuda_piQueueRelease(Captive);
  }

  static pi_result callRelease(pi_event Captive) {
    return cuda_piEventRelease(Captive);
  }

public:
  ReleaseGuard() = delete;
  /// Obj can be `nullptr`.
  explicit ReleaseGuard(T Obj) : Captive(Obj) {}
  ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) {
    Other.Captive = nullptr;
  }

  ReleaseGuard(const ReleaseGuard &) = delete;

  /// Calls the related PI object release function if the object held is not
  /// `nullptr` and dismiss has not been called.
  ~ReleaseGuard() {
    if (Captive != nullptr) {
      pi_result ret = callRelease(Captive);
      if (ret != PI_SUCCESS) {
        // A reported CUDA error is either an implementation or an asynchronous
        // CUDA error for which it is unclear if the function that reported it
        // succeeded or not. Either way, the state of the program is compromised
        // and likely unrecoverable.
        sycl::detail::pi::die(
            "Unrecoverable program state reached in cuda_piMemRelease");
      }
    }
  }

  ReleaseGuard &operator=(const ReleaseGuard &) = delete;

  ReleaseGuard &operator=(ReleaseGuard &&Other) {
    Captive = Other.Captive;
    Other.Captive = nullptr;
    return *this;
  }

  /// End the guard and do not release the captive object.
  void dismiss() { Captive = nullptr; }
};
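// Illustrative usage (hypothetical): keep a retained object safe while a
// fallible setup step runs, then hand the reference off on success.
//   ReleaseGuard<pi_context> guard(ctxt); // released if we exit early
//   /* ... work that may throw or return an error ... */
//   guard.dismiss();                      // success: caller keeps the ref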

//-- PI API implementation
extern "C" {

pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                               size_t param_value_size, void *param_value,
                               size_t *param_value_size_ret);

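/// Obtains the available CUDA platforms.
/// Triggers CUDA Driver initialization (cuInit) on the first call, so this
/// must be the first PI API invoked. Because multiple devices in a single
/// context are not currently supported, each CUDA device is exposed through
/// its own platform.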
pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
                              pi_uint32 *num_platforms) {

  try {
    static std::once_flag initFlag;
    static pi_uint32 numPlatforms = 1;
    static std::vector<_pi_platform> platformIds;

    if (num_entries == 0 && platforms != nullptr) {
      return PI_ERROR_INVALID_VALUE;
    }
    if (platforms == nullptr && num_platforms == nullptr) {
      return PI_ERROR_INVALID_VALUE;
    }

    pi_result err = PI_SUCCESS;

    std::call_once(
        initFlag,
        [](pi_result &err) {
          if (cuInit(0) != CUDA_SUCCESS) {
            numPlatforms = 0;
            return;
          }
          int numDevices = 0;
          err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
          if (numDevices == 0) {
            numPlatforms = 0;
            return;
          }
          try {
            // make one platform per device
            numPlatforms = numDevices;
            platformIds.resize(numDevices);

            for (int i = 0; i < numDevices; ++i) {
              CUdevice device;
              err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
              platformIds[i].devices_.emplace_back(
                  new _pi_device{device, &platformIds[i]});

              {
                const auto &dev = platformIds[i].devices_.back().get();
                size_t maxWorkGroupSize = 0u;
                size_t maxThreadsPerBlock[3] = {};
                pi_result retError = cuda_piDeviceGetInfo(
                    dev, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
                    sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
                assert(retError == PI_SUCCESS);
                (void)retError;

                retError = cuda_piDeviceGetInfo(
                    dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
                    sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
                assert(retError == PI_SUCCESS);

                dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock),
                                              maxThreadsPerBlock);
                dev->save_max_work_group_size(maxWorkGroupSize);
              }
            }
          } catch (const std::bad_alloc &) {
            // Signal out-of-memory situation
            for (int i = 0; i < numDevices; ++i) {
              platformIds[i].devices_.clear();
            }
            platformIds.clear();
            err = PI_ERROR_OUT_OF_HOST_MEMORY;
          } catch (...) {
            // Clear and rethrow to allow retry
            for (int i = 0; i < numDevices; ++i) {
              platformIds[i].devices_.clear();
            }
            platformIds.clear();
            throw;
          }
        },
        err);

    if (num_platforms != nullptr) {
      *num_platforms = numPlatforms;
    }

    if (platforms != nullptr) {
      for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
        platforms[i] = &platformIds[i];
      }
    }

    return err;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_OUT_OF_RESOURCES;
  }
}

pi_result cuda_piPlatformGetInfo(pi_platform platform,
                                 pi_platform_info param_name,
                                 size_t param_value_size, void *param_value,
                                 size_t *param_value_size_ret) {
  assert(platform != nullptr);

  switch (param_name) {
  case PI_PLATFORM_INFO_NAME:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA CUDA BACKEND");
  case PI_PLATFORM_INFO_VENDOR:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA Corporation");
  case PI_PLATFORM_INFO_PROFILE:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "FULL PROFILE");
  case PI_PLATFORM_INFO_VERSION: {
    auto version = getCudaVersionString();
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   version.c_str());
  }
  case PI_PLATFORM_INFO_EXTENSIONS: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }
  sycl::detail::pi::die("Platform info request not implemented");
  return {};
}

pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type,
                            pi_uint32 num_entries, pi_device *devices,
                            pi_uint32 *num_devices) {

  pi_result err = PI_SUCCESS;
  const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT;
  const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU;
  const bool returnDevices = askingForDefault || askingForGPU;

  size_t numDevices = returnDevices ? platform->devices_.size() : 0;

  try {
    if (num_devices) {
      *num_devices = numDevices;
    }

    if (returnDevices && devices) {
      for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) {
        devices[i] = platform->devices_[i].get();
      }
    }

    return err;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_OUT_OF_RESOURCES;
  }
}

pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; }

pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name,
                                size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret) {

  switch (param_name) {
  case PI_CONTEXT_INFO_NUM_DEVICES:
    return getInfo(param_value_size, param_value, param_value_size_ret, 1);
  case PI_CONTEXT_INFO_DEVICES:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   context->get_device());
  case PI_CONTEXT_INFO_REFERENCE_COUNT:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   context->get_reference_count());
  case PI_EXT_ONEAPI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
    pi_memory_order_capabilities capabilities =
        PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
        PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_EXT_ONEAPI_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             context->get_device()->get()) == CUDA_SUCCESS);
    pi_memory_order_capabilities capabilities =
        (major >= 7) ? PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
                           PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL |
                           PI_MEMORY_ORDER_SEQ_CST
                     : PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
                           PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }

  return PI_ERROR_OUT_OF_RESOURCES;
}

pi_result cuda_piContextRetain(pi_context context) {
  assert(context != nullptr);
  assert(context->get_reference_count() > 0);

  context->increment_reference_count();
  return PI_SUCCESS;
}

pi_result cuda_piextContextSetExtendedDeleter(
    pi_context context, pi_context_extended_deleter function, void *user_data) {
  context->set_extended_deleter(function, user_data);
  return PI_SUCCESS;
}

/// Not applicable to CUDA, devices cannot be partitioned.
pi_result cuda_piDevicePartition(pi_device,
                                 const pi_device_partition_property *,
                                 pi_uint32, pi_device *, pi_uint32 *) {
  return {};
}

pi_result cuda_piextDeviceSelectBinary(pi_device device,
                                       pi_device_binary *binaries,
                                       pi_uint32 num_binaries,
                                       pi_uint32 *selected_binary) {
  // Ignore unused parameter
  (void)device;

  if (!binaries) {
    sycl::detail::pi::die("No list of device images provided");
  }
  if (num_binaries < 1) {
    sycl::detail::pi::die("No binary images in the list");
  }

  // Look for an image for the NVPTX64 target, and return the first one that is
  // found
  for (pi_uint32 i = 0; i < num_binaries; i++) {
    if (strcmp(binaries[i]->DeviceTargetSpec,
               __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) {
      *selected_binary = i;
      return PI_SUCCESS;
    }
  }

  // No image can be loaded for the given device
  return PI_ERROR_INVALID_BINARY;
}

pi_result cuda_piextGetDeviceFunctionPointer(pi_device device,
                                             pi_program program,
                                             const char *func_name,
                                             pi_uint64 *func_pointer_ret) {
  // Check that the passed-in device is the device bound to the context
  assert(device == program->get_context()->get_device());
  assert(func_pointer_ret != nullptr);

  CUfunction func;
  CUresult ret = cuModuleGetFunction(&func, program->get(), func_name);
  *func_pointer_ret = reinterpret_cast<pi_uint64>(func);
  pi_result retError = PI_SUCCESS;

  if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND)
    retError = PI_CHECK_ERROR(ret);
  if (ret == CUDA_ERROR_NOT_FOUND) {
    *func_pointer_ret = 0;
    retError = PI_ERROR_INVALID_KERNEL_NAME;
  }

  return retError;
}

pi_result cuda_piDeviceRelease(pi_device) { return PI_SUCCESS; }

pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                               size_t param_value_size, void *param_value,
                               size_t *param_value_size_ret) {

  static constexpr pi_uint32 max_work_item_dimensions = 3u;

  assert(device != nullptr);

  switch (param_name) {
  case PI_DEVICE_INFO_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_TYPE_GPU);
  }
  case PI_DEVICE_INFO_VENDOR_ID: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 4318u);
  }
  case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: {
    int compute_units = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&compute_units,
                             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(compute_units >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint32(compute_units));
  }
  case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   max_work_item_dimensions);
  }
  case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: {
    size_t return_sizes[max_work_item_dimensions];

    int max_x = 0, max_y = 0, max_z = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_x >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_y >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_z >= 0);

    return_sizes[0] = size_t(max_x);
    return_sizes[1] = size_t(max_y);
    return_sizes[2] = size_t(max_z);
    return getInfoArray(max_work_item_dimensions, param_value_size,
                        param_value, param_value_size_ret, return_sizes);
  }

  case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: {
    size_t return_sizes[max_work_item_dimensions];
    int max_x = 0, max_y = 0, max_z = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_x >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_y >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_z >= 0);

    return_sizes[0] = size_t(max_x);
    return_sizes[1] = size_t(max_y);
    return_sizes[2] = size_t(max_z);
    return getInfoArray(max_work_item_dimensions, param_value_size,
                        param_value, param_value_size_ret, return_sizes);
  }

  case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: {
    int max_work_group_size = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_work_group_size,
                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);

    sycl::detail::pi::assertion(max_work_group_size >= 0);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t(max_work_group_size));
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
    // Number of sub-groups = max block size / warp size + possible remainder
    int max_threads = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_threads,
                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);
    int warpSize = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                             device->get()) == CUDA_SUCCESS);
    int maxWarps = (max_threads + warpSize - 1) / warpSize;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<uint32_t>(maxWarps));
  }
  case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
    // Volta provides independent thread scheduling
    // TODO: Revisit for previous generation GPUs
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    bool ifp = (major >= 7);
    return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
  }

  case PI_DEVICE_INFO_ATOMIC_64: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);

    bool atomic64 = (major >= 6) ? true : false;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   atomic64);
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
    pi_memory_order_capabilities capabilities =
        PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
        PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    pi_memory_order_capabilities capabilities =
        (major >= 7) ? PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
                           PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL |
                           PI_MEMORY_ORDER_SEQ_CST
                     : PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
                           PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);

    bool bfloat16 = (major >= 8) ? true : false;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   bfloat16);
  }
  case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
    // NVIDIA devices only support one sub-group size (the warp size)
    int warpSize = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                             device->get()) == CUDA_SUCCESS);
    size_t sizes[1] = {static_cast<size_t>(warpSize)};
    return getInfoArray<size_t>(1, param_value_size, param_value,
                                param_value_size_ret, sizes);
  }
  case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
    int clock_freq = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(clock_freq >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint32(clock_freq) / 1000u);
  }
  case PI_DEVICE_INFO_ADDRESS_BITS: {
    auto bits = pi_uint32{std::numeric_limits<uintptr_t>::digits};
    return getInfo(param_value_size, param_value, param_value_size_ret, bits);
  }
  case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: {
    // Max size of memory object allocation in bytes.
    // The minimum value is max(min(1024 × 1024 × 1024,
    // 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE),
    // 32 × 1024 × 1024) for devices that are not of type
    // CL_DEVICE_TYPE_CUSTOM.

    size_t global = 0;
    sycl::detail::pi::assertion(cuDeviceTotalMem(&global, device->get()) ==
                                CUDA_SUCCESS);

    auto quarter_global = static_cast<pi_uint32>(global / 4u);

    auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
                              32u * 1024u * 1024u);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64{max_alloc});
  }
  case PI_DEVICE_INFO_IMAGE_SUPPORT: {
    pi_bool enabled = PI_FALSE;

    if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) {
      enabled = PI_TRUE;
    } else {
      sycl::detail::pi::cuPrint(
          "Images are not fully supported by the CUDA BE, their support is "
          "disabled by default. Their partial support can be activated by "
          "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at "
          "runtime.");
    }

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   enabled);
  }
  case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
    // This call doesn't match to CUDA as it doesn't have images, but instead
    // surfaces and textures. No clear call in the CUDA API to determine this,
    // but some searching found as of SM 2.x 128 are supported.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
    // This call doesn't match to CUDA as it doesn't have images, but instead
    // surfaces and textures. No clear call in the CUDA API to determine this,
    // but some searching found as of SM 2.x 128 are supported.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: {
    // Take the smaller of maximum surface and maximum texture height.
    int tex_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_height >= 0);
    int surf_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_height >= 0);

    int min = std::min(tex_height, surf_height);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: {
    // Take the smaller of maximum surface and maximum texture height.
    int tex_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_height >= 0);
    int surf_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_height >= 0);

    int min = std::min(tex_height, surf_height);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: {
    // Take the smaller of maximum surface and maximum texture depth.
    int tex_depth = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_depth,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_depth >= 0);
    int surf_depth = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_depth,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_depth >= 0);

    int min = std::min(tex_depth, surf_depth);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t(0));
  }
  case PI_DEVICE_INFO_MAX_SAMPLERS: {
    // This call is kind of meaningless for CUDA, as samplers don't exist.
    // The closest thing is textures, which is 128.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: {
    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
    // __global__ function parameters are passed to the device via constant
    // memory and are limited to 4 KB.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{4000u});
  }
  case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: {
    int mem_base_addr_align = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&mem_base_addr_align,
                             CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
                             device->get()) == CUDA_SUCCESS);
    // Multiply by 8 as clGetDeviceInfo returns this value in bits
    mem_base_addr_align *= 8;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   mem_base_addr_align);
  }
  case PI_DEVICE_INFO_HALF_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_SINGLE_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST |
                  PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA |
                  PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
    return getInfo(param_value_size, param_value, param_value_size_ret, config);
  }
  case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST |
                  PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA;
    return getInfo(param_value_size, param_value, param_value_size_ret, config);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: {
    // The value is documented for all existing GPUs in the CUDA programming
    // guidelines, section "H.3.2. Global Memory".
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: {
    int cache_size = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(cache_size >= 0);
    // The L2 cache is global to the GPU.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(cache_size));
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: {
    size_t bytes = 0;
    // The Runtime API has easy access to this value; driver API info is
    // scarce.
    sycl::detail::pi::assertion(cuDeviceTotalMem(&bytes, device->get()) ==
                                CUDA_SUCCESS);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64{bytes});
  }
  case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: {
    int constant_memory = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&constant_memory,
                             CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(constant_memory >= 0);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(constant_memory));
  }
  case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: {
    // TODO: is there a way to retrieve this from the CUDA driver API?
    // Hard coded to the value returned by clinfo for OpenCL 1.2 CUDA |
    // GeForce GTX 1060 3GB
    return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
  }
  case PI_DEVICE_INFO_LOCAL_MEM_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_LOCAL_MEM_TYPE_LOCAL);
  }
  case PI_DEVICE_INFO_LOCAL_MEM_SIZE: {
    // OpenCL's "local memory" maps most closely to CUDA's "shared memory".
    // CUDA has its own definition of "local memory", which maps to OpenCL's
    // "private memory".
    int local_mem_size = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&local_mem_size,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(local_mem_size >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(local_mem_size));
  }
  case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: {
    int ecc_enabled = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
                             device->get()) == CUDA_SUCCESS);

    sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1));
    auto result = static_cast<pi_bool>(ecc_enabled);
    return getInfo(param_value_size, param_value, param_value_size_ret, result);
  }
  case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: {
    int is_integrated = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED,
                             device->get()) == CUDA_SUCCESS);

    sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1));
    auto result = static_cast<pi_bool>(is_integrated);
    return getInfo(param_value_size, param_value, param_value_size_ret, result);
  }
  case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: {
    // Hard coded to the value returned by clinfo for OpenCL 1.2 CUDA |
    // GeForce GTX 1060 3GB
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{1000u});
  }
  case PI_DEVICE_INFO_ENDIAN_LITTLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_LINKER_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: {
    auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
    // The mandated minimum capability:
    auto capability =
        PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: {
    // The mandated minimum capability:
    auto capability = PI_QUEUE_PROFILING_ENABLE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_BUILT_IN_KERNELS: {
    // An empty string is returned if no built-in kernels are supported by the
    // device.
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  case PI_DEVICE_INFO_PLATFORM: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   device->get_platform());
  }
  case PI_DEVICE_INFO_NAME: {
    static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u;
    char name[MAX_DEVICE_NAME_LENGTH];
    sycl::detail::pi::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH,
                                                device->get()) == CUDA_SUCCESS);
    return getInfoArray(strlen(name) + 1, param_value_size, param_value,
                        param_value_size_ret, name);
  }
  case PI_DEVICE_INFO_VENDOR: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA Corporation");
  }
  case PI_DEVICE_INFO_DRIVER_VERSION: {
    auto version = getCudaVersionString();
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   version.c_str());
  }
  case PI_DEVICE_INFO_PROFILE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "CUDA");
  }
  case PI_DEVICE_INFO_REFERENCE_COUNT: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   device->get_reference_count());
  }
  case PI_DEVICE_INFO_VERSION: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "PI 0.0");
  }
  case PI_DEVICE_INFO_OPENCL_C_VERSION: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  case PI_DEVICE_INFO_EXTENSIONS: {

    std::string SupportedExtensions = "cl_khr_fp64 ";
    SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT;
    SupportedExtensions += " ";

    int major = 0;
    int minor = 0;

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&minor,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                             device->get()) == CUDA_SUCCESS);

    if ((major >= 6) || ((major == 5) && (minor >= 3))) {
      SupportedExtensions += "cl_khr_fp16 ";
    }

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   SupportedExtensions.c_str());
  }
  case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
    // The minimum value for the FULL profile is 1 MB.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{1024u});
  }
  case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_PARENT_DEVICE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   nullptr);
  }
  case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<pi_device_partition_property>(0u));
  }
  case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_PARTITION_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<pi_device_partition_property>(0u));
  }

  // Intel USM extensions

  case PI_DEVICE_INFO_USM_HOST_SUPPORT: {
    // from cl_intel_unified_shared_memory: "The host memory access
    // capabilities apply to any host allocation."
    //
    // query if/how the device can access page-locked host memory, possibly
    // through PCIe, using the same pointer as the host
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
      // the device shares a unified address space with the host
      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
          6) {
        // compute capability 6.x introduces operations that are atomic with
        // respect to other CPUs and GPUs in the system
        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      } else {
        // on GPU architectures with compute capability lower than 6.x, atomic
        // operations from the GPU to CPU memory will not be atomic with
        // respect to CPU initiated atomic operations
        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The device memory access capabilities apply to any device allocation
    // associated with this device."
    //
    // query how the device can access memory allocated on the device itself
    pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                        PI_USM_CONCURRENT_ACCESS |
                        PI_USM_CONCURRENT_ATOMIC_ACCESS;
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The single device shared memory access capabilities apply to any shared
    // allocation associated with this device."
    //
    // query if/how the device can access managed memory associated to it
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
      // the device can allocate managed memory on this system
      value |= PI_USM_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
      // the device can coherently access managed memory concurrently with the
      // CPU
      value |= PI_USM_CONCURRENT_ACCESS;
      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
          6) {
        // compute capability 6.x introduces operations that are atomic with
        // respect to other CPUs and GPUs in the system
        value |= PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The cross-device shared memory access capabilities apply to any shared
    // allocation associated with this device, or to any shared memory
    // allocation on another device that also supports the same cross-device
    // shared memory access capability."
    //
    // query if/how the device can access managed memory associated to other
    // devices
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
      // the device can allocate managed memory on this system
      value |= PI_USM_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
      // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
      // attribute can coherently access managed memory concurrently with the
      // CPU
      value |= PI_USM_CONCURRENT_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
        6) {
      // compute capability 6.x introduces operations that are atomic with
      // respect to other CPUs and GPUs in the system
      if (value & PI_USM_ACCESS)
        value |= PI_USM_ATOMIC_ACCESS;
      if (value & PI_USM_CONCURRENT_ACCESS)
        value |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The shared system memory access capabilities apply to any allocations
    // made by a system allocator, such as malloc or new."
    //
    // query if/how the device can access pageable host memory allocated by the
    // system allocator
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
      // the device supports coherently accessing pageable memory without
      // calling cuMemHostRegister/cudaHostRegister on it
      if (getAttribute(device,
                       CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
        // the link between the device and the host supports native atomic
        // operations
        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      } else {
        // the link between the device and the host does not support native
        // atomic operations
        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER: {
    int value =
        getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_BACKEND_VERSION: {
    int major =
        getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    int minor =
        getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    std::string result = std::to_string(major) + "." + std::to_string(minor);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   result.c_str());
  }
1919 
1921  size_t FreeMemory = 0;
1922  size_t TotalMemory = 0;
1923  sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
1924  CUDA_SUCCESS,
1925  "failed cuMemGetInfo() API.");
1926  return getInfo(param_value_size, param_value, param_value_size_ret,
1927  FreeMemory);
1928  }
1929  case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: {
1930  int value = 0;
1931  sycl::detail::pi::assertion(
1932  cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
1933  device->get()) == CUDA_SUCCESS);
1934  sycl::detail::pi::assertion(value >= 0);
1935  // Convert kilohertz to megahertz when returning.
1936  return getInfo(param_value_size, param_value, param_value_size_ret,
1937  value / 1000);
1938  }
1939  case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: {
1940  int value = 0;
1941  sycl::detail::pi::assertion(
1942  cuDeviceGetAttribute(&value,
1943  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
1944  device->get()) == CUDA_SUCCESS);
1945  sycl::detail::pi::assertion(value >= 0);
1946  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1947  }
1948 
1949  // TODO: Investigate if this information is available on CUDA.
1950  case PI_DEVICE_INFO_PCI_ADDRESS:
1951  case PI_DEVICE_INFO_GPU_EU_COUNT:
1952  case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
1953  case PI_DEVICE_INFO_GPU_SLICES:
1954  case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
1955  case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
1956  case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
1957  case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
1959  // TODO: Check if Intel device UUID extension is utilized for CUDA.
1960  // For details about this extension, see
1961  // sycl/doc/extensions/supported/sycl_ext_intel_device_info.md
1962  case PI_DEVICE_INFO_UUID:
1963  return PI_ERROR_INVALID_VALUE;
1964 
1965  default:
1966  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
1967  }
1968  sycl::detail::pi::die("Device info request not implemented");
1969  return {};
1970 }
1971 
1978  pi_result cuda_piextDeviceGetNativeHandle(pi_device device,
1979  pi_native_handle *nativeHandle) {
1980  *nativeHandle = static_cast<pi_native_handle>(device->get());
1981  return PI_SUCCESS;
1982 }
1983 
1992  pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle,
1993  pi_platform platform,
1994  pi_device *piDevice) {
1995  assert(piDevice != nullptr);
1996 
1997  CUdevice cu_device = static_cast<CUdevice>(nativeHandle);
1998 
1999  auto is_device = [=](std::unique_ptr<_pi_device> &dev) {
2000  return dev->get() == cu_device;
2001  };
2002 
2003  // If a platform is provided just check if the device is in it
2004  if (platform) {
2005  auto search_res = std::find_if(begin(platform->devices_),
2006  end(platform->devices_), is_device);
2007  if (search_res != end(platform->devices_)) {
2008  *piDevice = (*search_res).get();
2009  return PI_SUCCESS;
2010  }
2011  }
2012 
2013  // Get list of platforms
2014  pi_uint32 num_platforms;
2015  pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms);
2016  if (result != PI_SUCCESS)
2017  return result;
2018 
2019  pi_platform *plat =
2020  static_cast<pi_platform *>(malloc(num_platforms * sizeof(pi_platform)));
2021  result = cuda_piPlatformsGet(num_platforms, plat, nullptr);
2022  if (result != PI_SUCCESS)
2023  return result;
2024 
2025  // Iterate through platforms to find device that matches nativeHandle
2026  for (pi_uint32 j = 0; j < num_platforms; ++j) {
2027  auto search_res = std::find_if(begin(plat[j]->devices_),
2028  end(plat[j]->devices_), is_device);
2029  if (search_res != end(plat[j]->devices_)) {
2030  *piDevice = (*search_res).get();
2031  return PI_SUCCESS;
2032  }
2033  }
2034 
2035  // If the provided nativeHandle cannot be matched to an
2036  // existing device return error
2037  return PI_ERROR_INVALID_OPERATION;
2038 }
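// Interop note: the native handle of a CUDA PI device is just its CUdevice
// ordinal, so the lookup above only has to scan the platforms' cached
// `devices_` lists. A minimal sketch of the round trip (hypothetical usage,
// error checks omitted):
//
//   CUdevice native;
//   cuDeviceGet(&native, /*ordinal=*/0);
//   pi_device dev = nullptr;
//   cuda_piextDeviceCreateWithNativeHandle(native, /*platform=*/nullptr, &dev);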
2039 
2040 /* Context APIs */
2041 
2060  pi_result cuda_piContextCreate(const pi_context_properties *properties,
2061  pi_uint32 num_devices, const pi_device *devices,
2062  void (*pfn_notify)(const char *errinfo,
2063  const void *private_info,
2064  size_t cb, void *user_data),
2065  void *user_data, pi_context *retcontext) {
2066 
2067  assert(devices != nullptr);
2068  // TODO: How to implement context callback?
2069  assert(pfn_notify == nullptr);
2070  assert(user_data == nullptr);
2071  assert(num_devices == 1);
2072  // Need a valid pointer for returning the created context
2073  assert(retcontext != nullptr);
2074  pi_result errcode_ret = PI_SUCCESS;
2075 
2076  // Parse properties.
2077  bool property_cuda_primary = false;
2078  while (properties && (0 != *properties)) {
2079  // Consume property ID.
2080  pi_context_properties id = *properties;
2081  ++properties;
2082  // Consume property value.
2083  pi_context_properties value = *properties;
2084  ++properties;
2085  switch (id) {
2086  case __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
2087  assert(value == PI_FALSE || value == PI_TRUE);
2088  property_cuda_primary = static_cast<bool>(value);
2089  break;
2090  default:
2091  // Unknown property.
2093  "Unknown piContextCreate property in property list");
2094  return PI_ERROR_INVALID_VALUE;
2095  }
2096  }
2097 
2098  std::unique_ptr<_pi_context> piContextPtr{nullptr};
2099  try {
2100  CUcontext current = nullptr;
2101 
2102  if (property_cuda_primary) {
2103  // Use the CUDA primary context and assume that we want to use it
2104  // immediately as we want to forgo context switches.
2105  CUcontext Ctxt;
2106  errcode_ret =
2107  PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->get()));
2108  piContextPtr = std::unique_ptr<_pi_context>(
2109  new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
2110  errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
2111  } else {
2112  // Create a scoped context.
2113  CUcontext newContext;
2114  PI_CHECK_ERROR(cuCtxGetCurrent(&current));
2115  errcode_ret = PI_CHECK_ERROR(
2116  cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->get()));
2117  piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
2118  _pi_context::kind::user_defined, newContext, *devices});
2119  }
2120 
2121  static std::once_flag initFlag;
2122  std::call_once(
2123  initFlag,
2124  [](pi_result &err) {
2125  // Use default stream to record base event counter
2126  PI_CHECK_ERROR(
2127  cuEventCreate(&_pi_platform::evBase_, CU_EVENT_DEFAULT));
2128  PI_CHECK_ERROR(cuEventRecord(_pi_platform::evBase_, 0));
2129  },
2130  errcode_ret);
2131 
2132  // For non-primary scoped contexts keep the last active on top of the stack
2133  // as `cuCtxCreate` replaces it implicitly otherwise.
2134  // Primary contexts are kept on top of the stack, so the previous context
2135  // is not queried and therefore not recovered.
2136  if (current != nullptr) {
2137  PI_CHECK_ERROR(cuCtxSetCurrent(current));
2138  }
2139 
2140  *retcontext = piContextPtr.release();
2141  } catch (pi_result err) {
2142  errcode_ret = err;
2143  } catch (...) {
2144  errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
2145  }
2146  return errcode_ret;
2147 }
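// A sketch of how a caller can request the CUDA primary context through this
// entry point (hypothetical usage, assuming `device` is a valid pi_device):
//
//   pi_context_properties props[] = {
//       __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, PI_TRUE, 0};
//   pi_context ctx = nullptr;
//   cuda_piContextCreate(props, 1, &device, nullptr, nullptr, &ctx);
//
// With the property set, the plugin retains the device's primary context via
// cuDevicePrimaryCtxRetain instead of creating a fresh one with cuCtxCreate.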
2148 
2149  pi_result cuda_piContextRelease(pi_context ctxt) {
2150 
2151  assert(ctxt != nullptr);
2152 
2153  if (ctxt->decrement_reference_count() > 0) {
2154  return PI_SUCCESS;
2155  }
2156  ctxt->invoke_extended_deleters();
2157 
2158  std::unique_ptr<_pi_context> context{ctxt};
2159 
2160  if (!ctxt->backend_has_ownership())
2161  return PI_SUCCESS;
2162 
2163  if (!ctxt->is_primary()) {
2164  CUcontext cuCtxt = ctxt->get();
2165  CUcontext current = nullptr;
2166  cuCtxGetCurrent(&current);
2167  if (cuCtxt != current) {
2168  PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt));
2169  }
2170  PI_CHECK_ERROR(cuCtxSynchronize());
2171  cuCtxGetCurrent(&current);
2172  if (cuCtxt == current) {
2173  PI_CHECK_ERROR(cuCtxPopCurrent(&current));
2174  }
2175  return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt));
2176  }
2177 
2178  // Primary context is not destroyed, but released
2179  CUdevice cuDev = ctxt->get_device()->get();
2180  CUcontext current;
2181  cuCtxPopCurrent(&current);
2182  return PI_CHECK_ERROR(cuDevicePrimaryCtxRelease(cuDev));
2183 }
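// Design note: user-defined contexts are synchronized with cuCtxSynchronize
// before cuCtxDestroy, since destroying a context with work still in flight
// leaves its resources in an undefined state. Primary contexts are only
// released back to the driver with cuDevicePrimaryCtxRelease, because the
// driver owns their lifetime.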
2184 
2191  pi_result cuda_piextContextGetNativeHandle(pi_context context,
2192  pi_native_handle *nativeHandle) {
2193  *nativeHandle = reinterpret_cast<pi_native_handle>(context->get());
2194  return PI_SUCCESS;
2195 }
2196 
2204  pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
2205  pi_uint32 num_devices,
2206  const pi_device *devices,
2207  bool ownNativeHandle,
2208  pi_context *piContext) {
2209  (void)num_devices;
2210  (void)devices;
2211  (void)ownNativeHandle;
2212  assert(piContext != nullptr);
2213  assert(ownNativeHandle == false);
2214 
2215  CUcontext newContext = reinterpret_cast<CUcontext>(nativeHandle);
2216 
2217  ScopedContext active(newContext);
2218 
2219  // Get context's native device
2220  CUdevice cu_device;
2221  pi_result retErr = PI_CHECK_ERROR(cuCtxGetDevice(&cu_device));
2222 
2223  // Create a SYCL device from the ctx device
2224  pi_device device = nullptr;
2225  retErr = cuda_piextDeviceCreateWithNativeHandle(cu_device, nullptr, &device);
2226 
2227  // Create sycl context
2228  *piContext = new _pi_context{_pi_context::kind::user_defined, newContext,
2229  device, /*backend_owns*/ false};
2230 
2231  return retErr;
2232 }
2233 
2238  pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags,
2239  size_t size, void *host_ptr, pi_mem *ret_mem,
2240  const pi_mem_properties *properties) {
2241  // Need input memory object
2242  assert(ret_mem != nullptr);
2243  assert((properties == nullptr || *properties == 0) &&
2244  "no mem properties goes to cuda RT yet");
2245  // Currently, USE_HOST_PTR is not implemented using host register
2246  // since this triggers a weird segfault after program ends.
2247  // Setting this constant to true enables testing that behavior.
2248  const bool enableUseHostPtr = false;
2249  const bool performInitialCopy =
2250  (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
2251  ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr);
2252  pi_result retErr = PI_SUCCESS;
2253  pi_mem retMemObj = nullptr;
2254 
2255  try {
2256  ScopedContext active(context);
2257  CUdeviceptr ptr;
2258  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
2259  _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
2260 
2261  if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
2262  retErr = PI_CHECK_ERROR(
2263  cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
2264  retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
2265  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr;
2266  } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
2267  retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size));
2268  retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
2269  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
2270  } else {
2271  retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size));
2272  if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
2273  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in;
2274  }
2275  }
2276 
2277  if (retErr == PI_SUCCESS) {
2278  pi_mem parentBuffer = nullptr;
2279 
2280  auto piMemObj = std::unique_ptr<_pi_mem>(
2281  new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size});
2282  if (piMemObj != nullptr) {
2283  retMemObj = piMemObj.release();
2284  if (performInitialCopy) {
2285  // Operates on the default stream of the current CUDA context.
2286  retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr, host_ptr, size));
2287  // Synchronize with default stream implicitly used by cuMemcpyHtoD
2288  // to make buffer data available on device before any other PI call
2289  // uses it.
2290  if (retErr == PI_SUCCESS) {
2291  CUstream defaultStream = 0;
2292  retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream));
2293  }
2294  }
2295  } else {
2296  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2297  }
2298  }
2299  } catch (pi_result err) {
2300  retErr = err;
2301  } catch (...) {
2302  retErr = PI_ERROR_OUT_OF_RESOURCES;
2303  }
2304 
2305  *ret_mem = retMemObj;
2306 
2307  return retErr;
2308 }
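// Summary of how the host-pointer flags map onto driver allocations above
// (a restatement of the branches, not additional behavior):
//   PI_MEM_FLAGS_HOST_PTR_USE   -> cuMemHostRegister + cuMemHostGetDevicePointer
//                                  (disabled by default; falls back to a copy)
//   PI_MEM_FLAGS_HOST_PTR_ALLOC -> cuMemAllocHost + cuMemHostGetDevicePointer
//   PI_MEM_FLAGS_HOST_PTR_COPY  -> cuMemAlloc, then cuMemcpyHtoD on the default
//                                  stream followed by cuStreamSynchronize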
2309 
2314  pi_result cuda_piMemRelease(pi_mem memObj) {
2315  assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS");
2316 
2317  pi_result ret = PI_SUCCESS;
2318 
2319  try {
2320 
2321  // Do nothing if there are other references
2322  if (memObj->decrement_reference_count() > 0) {
2323  return PI_SUCCESS;
2324  }
2325 
2326  // make sure memObj is released in case PI_CHECK_ERROR throws
2327  std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2328 
2329  if (memObj->is_sub_buffer()) {
2330  return PI_SUCCESS;
2331  }
2332 
2333  ScopedContext active(uniqueMemObj->get_context());
2334 
2335  if (memObj->mem_type_ == _pi_mem::mem_type::buffer) {
2336  switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2337  case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in:
2338  case _pi_mem::mem_::buffer_mem_::alloc_mode::classic:
2339  ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_));
2340  break;
2341  case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr:
2342  ret = PI_CHECK_ERROR(
2343  cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2344  break;
2345  case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr:
2346  ret = PI_CHECK_ERROR(
2347  cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2348  };
2349  } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) {
2350  ret = PI_CHECK_ERROR(
2351  cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface()));
2352  ret = PI_CHECK_ERROR(
2353  cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array()));
2354  }
2355 
2356  } catch (pi_result err) {
2357  ret = err;
2358  } catch (...) {
2359  ret = PI_ERROR_OUT_OF_RESOURCES;
2360  }
2361 
2362  if (ret != PI_SUCCESS) {
2363  // A reported CUDA error is either an implementation or an asynchronous CUDA
2364  // error for which it is unclear if the function that reported it succeeded
2365  // or not. Either way, the state of the program is compromised and likely
2366  // unrecoverable.
2368  "Unrecoverable program state reached in cuda_piMemRelease");
2369  }
2370 
2371  return PI_SUCCESS;
2372 }
2373 
2378  pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,
2379  pi_buffer_create_type buffer_create_type,
2380  void *buffer_create_info, pi_mem *memObj) {
2381  assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT");
2382  assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS");
2383  assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT");
2384 
2385  // Default value for flags means PI_MEM_FLAGS_ACCESS_RW.
2386  if (flags == 0) {
2387  flags = PI_MEM_FLAGS_ACCESS_RW;
2388  }
2389 
2390  assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE");
2391  assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) &&
2392  "PI_ERROR_INVALID_VALUE");
2393  assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE");
2394  assert(memObj != nullptr);
2395 
2396  const auto bufferRegion =
2397  *reinterpret_cast<pi_buffer_region>(buffer_create_info);
2398  assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE");
2399 
2400  assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2401  "Overflow");
2402  assert(((bufferRegion.origin + bufferRegion.size) <=
2403  parent_buffer->mem_.buffer_mem_.get_size()) &&
2404  "PI_ERROR_INVALID_BUFFER_SIZE");
2405  // Retained indirectly due to retaining parent buffer below.
2406  pi_context context = parent_buffer->context_;
2407  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
2408  _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
2409 
2410  assert(parent_buffer->mem_.buffer_mem_.ptr_ !=
2411  _pi_mem::mem_::buffer_mem_::native_type{0});
2412  _pi_mem::mem_::buffer_mem_::native_type ptr =
2413  parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin;
2414 
2415  void *hostPtr = nullptr;
2416  if (parent_buffer->mem_.buffer_mem_.hostPtr_) {
2417  hostPtr = static_cast<char *>(parent_buffer->mem_.buffer_mem_.hostPtr_) +
2418  bufferRegion.origin;
2419  }
2420 
2421  ReleaseGuard<pi_mem> releaseGuard(parent_buffer);
2422 
2423  std::unique_ptr<_pi_mem> retMemObj{nullptr};
2424  try {
2425  ScopedContext active(context);
2426 
2427  retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{
2428  context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2429  } catch (pi_result err) {
2430  *memObj = nullptr;
2431  return err;
2432  } catch (...) {
2433  *memObj = nullptr;
2434  return PI_ERROR_OUT_OF_HOST_MEMORY;
2435  }
2436 
2437  releaseGuard.dismiss();
2438  *memObj = retMemObj.release();
2439  return PI_SUCCESS;
2440 }
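// Note: a sub-buffer is pure pointer arithmetic on the parent allocation
// (parent ptr_ + bufferRegion.origin); no new CUDA memory is allocated, which
// is why the parent buffer must stay retained for the sub-buffer's lifetime.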
2441 
2442 pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *) {
2443  sycl::detail::pi::die("cuda_piMemGetInfo not implemented");
2444 }
2445 
2452  pi_result cuda_piextMemGetNativeHandle(pi_mem mem,
2453  pi_native_handle *nativeHandle) {
2454  *nativeHandle = static_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2455  return PI_SUCCESS;
2456 }
2457 
2469  pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle,
2470  pi_context context,
2471  bool ownNativeHandle,
2472  pi_mem *mem) {
2474  "Creation of PI mem from native handle not implemented");
2475  return {};
2476 }
2477 
2484  pi_result cuda_piQueueCreate(pi_context context, pi_device device,
2485  pi_queue_properties properties, pi_queue *queue) {
2486  try {
2487  std::unique_ptr<_pi_queue> queueImpl{nullptr};
2488 
2489  if (context->get_device() != device) {
2490  *queue = nullptr;
2491  return PI_ERROR_INVALID_DEVICE;
2492  }
2493 
2494  unsigned int flags = 0;
2495  if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) {
2496  flags = CU_STREAM_DEFAULT;
2497  } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) {
2498  flags = 0;
2499  } else {
2500  flags = CU_STREAM_NON_BLOCKING;
2501  }
2502 
2503  const bool is_out_of_order =
2504  properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
2505 
2506  std::vector<CUstream> computeCuStreams(
2507  is_out_of_order ? _pi_queue::default_num_compute_streams : 1);
2508  std::vector<CUstream> transferCuStreams(
2509  is_out_of_order ? _pi_queue::default_num_transfer_streams : 0);
2510 
2511  queueImpl = std::unique_ptr<_pi_queue>(
2512  new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams),
2513  context, device, properties, flags});
2514 
2515  *queue = queueImpl.release();
2516 
2517  return PI_SUCCESS;
2518  } catch (pi_result err) {
2519 
2520  return err;
2521 
2522  } catch (...) {
2523 
2524  return PI_ERROR_OUT_OF_RESOURCES;
2525  }
2526 }
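// Mapping of queue properties to CUstream flags used above:
//   __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT
//   __SYCL_PI_CUDA_SYNC_WITH_DEFAULT  -> 0 (synchronizes with the default stream)
//   otherwise                         -> CU_STREAM_NON_BLOCKING
// Out-of-order queues are emulated with pools of compute and transfer streams;
// in-order queues get a single compute stream and no transfer streams.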
2527 
2528  pi_result cuda_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name,
2529  size_t param_value_size, void *param_value,
2530  size_t *param_value_size_ret) {
2531  assert(command_queue != nullptr);
2532 
2533  switch (param_name) {
2534  case PI_QUEUE_INFO_CONTEXT:
2535  return getInfo(param_value_size, param_value, param_value_size_ret,
2536  command_queue->context_);
2537  case PI_QUEUE_INFO_DEVICE:
2538  return getInfo(param_value_size, param_value, param_value_size_ret,
2539  command_queue->device_);
2540  case PI_QUEUE_INFO_REFERENCE_COUNT:
2541  return getInfo(param_value_size, param_value, param_value_size_ret,
2542  command_queue->get_reference_count());
2543  case PI_QUEUE_INFO_PROPERTIES:
2544  return getInfo(param_value_size, param_value, param_value_size_ret,
2545  command_queue->properties_);
2546  default:
2547  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2548  }
2549  sycl::detail::pi::die("Queue info request not implemented");
2550  return {};
2551 }
2552 
2553  pi_result cuda_piQueueRetain(pi_queue command_queue) {
2554  assert(command_queue != nullptr);
2555  assert(command_queue->get_reference_count() > 0);
2556 
2557  command_queue->increment_reference_count();
2558  return PI_SUCCESS;
2559 }
2560 
2561  pi_result cuda_piQueueRelease(pi_queue command_queue) {
2562  assert(command_queue != nullptr);
2563 
2564  if (command_queue->decrement_reference_count() > 0) {
2565  return PI_SUCCESS;
2566  }
2567 
2568  try {
2569  std::unique_ptr<_pi_queue> queueImpl(command_queue);
2570 
2571  if (!command_queue->backend_has_ownership())
2572  return PI_SUCCESS;
2573 
2574  ScopedContext active(command_queue->get_context());
2575 
2576  command_queue->for_each_stream([](CUstream s) {
2577  PI_CHECK_ERROR(cuStreamSynchronize(s));
2578  PI_CHECK_ERROR(cuStreamDestroy(s));
2579  });
2580 
2581  return PI_SUCCESS;
2582  } catch (pi_result err) {
2583  return err;
2584  } catch (...) {
2585  return PI_ERROR_OUT_OF_RESOURCES;
2586  }
2587 }
2588 
2589  pi_result cuda_piQueueFinish(pi_queue command_queue) {
2590  pi_result result = PI_SUCCESS;
2591 
2592  try {
2593 
2594  assert(command_queue !=
2595  nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code
2596  ScopedContext active(command_queue->get_context());
2597 
2598  command_queue->sync_streams</*ResetUsed=*/true>([&result](CUstream s) {
2599  result = PI_CHECK_ERROR(cuStreamSynchronize(s));
2600  });
2601 
2602  } catch (pi_result err) {
2603 
2604  result = err;
2605 
2606  } catch (...) {
2607 
2608  result = PI_ERROR_OUT_OF_RESOURCES;
2609  }
2610 
2611  return result;
2612 }
2613 
2614 // There is no CUDA counterpart for queue flushing and we don't run into the
2615 // same problem of having to flush cross-queue dependencies as some of the
2616 // other plugins, so it can be left as no-op.
2617  pi_result cuda_piQueueFlush(pi_queue command_queue) {
2618  (void)command_queue;
2619  return PI_SUCCESS;
2620 }
2621 
2628  pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
2629  pi_native_handle *nativeHandle) {
2630  ScopedContext active(queue->get_context());
2631  *nativeHandle =
2632  reinterpret_cast<pi_native_handle>(queue->get_next_compute_stream());
2633  return PI_SUCCESS;
2634 }
2635 
2646  pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle,
2647  pi_context context,
2648  pi_device device,
2649  bool ownNativeHandle,
2650  pi_queue *queue) {
2651  (void)device;
2652  (void)ownNativeHandle;
2653  assert(ownNativeHandle == false);
2654 
2655  unsigned int flags;
2656  CUstream cuStream = reinterpret_cast<CUstream>(nativeHandle);
2657 
2658  auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));
2659 
2660  pi_queue_properties properties = 0;
2661  if (flags == CU_STREAM_DEFAULT)
2662  properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
2663  else if (flags == CU_STREAM_NON_BLOCKING)
2664  properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT;
2665  else
2666  sycl::detail::pi::die("Unknown cuda stream");
2667 
2668  std::vector<CUstream> computeCuStreams(1, cuStream);
2669  std::vector<CUstream> transferCuStreams(0);
2670 
2671  // Create the queue and set num_compute_streams to 1, as computeCuStreams
2672  // contains a single valid stream
2673  *queue = new _pi_queue{std::move(computeCuStreams),
2674  std::move(transferCuStreams),
2675  context,
2676  context->get_device(),
2677  properties,
2678  flags,
2679  /*backend_owns*/ false};
2680  (*queue)->num_compute_streams_ = 1;
2681 
2682  return retErr;
2683 }
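// A sketch of wrapping an existing driver stream (hypothetical usage; the
// stream must belong to `ctx` and remains owned by the caller, which is why
// ownNativeHandle is required to be false):
//
//   CUstream s;
//   cuStreamCreate(&s, CU_STREAM_NON_BLOCKING);
//   pi_queue q = nullptr;
//   cuda_piextQueueCreateWithNativeHandle(
//       reinterpret_cast<pi_native_handle>(s), ctx, dev, false, &q);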
2684 
2685  pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
2686  pi_bool blocking_write, size_t offset,
2687  size_t size, const void *ptr,
2688  pi_uint32 num_events_in_wait_list,
2689  const pi_event *event_wait_list,
2690  pi_event *event) {
2691 
2692  assert(buffer != nullptr);
2693  assert(command_queue != nullptr);
2694  pi_result retErr = PI_SUCCESS;
2695  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
2696  std::unique_ptr<_pi_event> retImplEv{nullptr};
2697 
2698  try {
2699  ScopedContext active(command_queue->get_context());
2700  CUstream cuStream = command_queue->get_next_transfer_stream();
2701 
2702  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2703  event_wait_list);
2704 
2705  if (event) {
2706  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2707  PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream));
2708  retImplEv->start();
2709  }
2710 
2711  retErr =
2712  PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream));
2713 
2714  if (event) {
2715  retErr = retImplEv->record();
2716  }
2717 
2718  if (blocking_write) {
2719  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2720  }
2721 
2722  if (event) {
2723  *event = retImplEv.release();
2724  }
2725  } catch (pi_result err) {
2726  retErr = err;
2727  }
2728  return retErr;
2729 }
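// Note: the copy itself is always issued asynchronously (cuMemcpyHtoDAsync on
// a transfer stream); `blocking_write` only appends a cuStreamSynchronize, so
// a blocking write returns with the data already resident on the device.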
2730 
2731  pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2732  pi_bool blocking_read, size_t offset,
2733  size_t size, void *ptr,
2734  pi_uint32 num_events_in_wait_list,
2735  const pi_event *event_wait_list,
2736  pi_event *event) {
2737 
2738  assert(buffer != nullptr);
2739  assert(command_queue != nullptr);
2740  pi_result retErr = PI_SUCCESS;
2741  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
2742  std::unique_ptr<_pi_event> retImplEv{nullptr};
2743 
2744  try {
2745  ScopedContext active(command_queue->get_context());
2746  CUstream cuStream = command_queue->get_next_transfer_stream();
2747 
2748  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2749  event_wait_list);
2750 
2751  if (event) {
2752  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2753  PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream));
2754  retImplEv->start();
2755  }
2756 
2757  retErr =
2758  PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream));
2759 
2760  if (event) {
2761  retErr = retImplEv->record();
2762  }
2763 
2764  if (blocking_read) {
2765  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2766  }
2767 
2768  if (event) {
2769  *event = retImplEv.release();
2770  }
2771 
2772  } catch (pi_result err) {
2773  retErr = err;
2774  }
2775  return retErr;
2776 }
2777 
2778 pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
2779 
2780  try {
2781  assert(num_events != 0);
2782  assert(event_list);
2783  if (num_events == 0) {
2784  return PI_ERROR_INVALID_VALUE;
2785  }
2786 
2787  if (!event_list) {
2788  return PI_ERROR_INVALID_EVENT;
2789  }
2790 
2791  auto context = event_list[0]->get_context();
2792  ScopedContext active(context);
2793 
2794  auto waitFunc = [context](pi_event event) -> pi_result {
2795  if (!event) {
2796  return PI_ERROR_INVALID_EVENT;
2797  }
2798 
2799  if (event->get_context() != context) {
2800  return PI_ERROR_INVALID_CONTEXT;
2801  }
2802 
2803  return event->wait();
2804  };
2805  return forLatestEvents(event_list, num_events, waitFunc);
2806  } catch (pi_result err) {
2807  return err;
2808  } catch (...) {
2809  return PI_ERROR_OUT_OF_RESOURCES;
2810  }
2811 }
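// Design note: forLatestEvents sorts the wait list by stream and keeps only
// the newest event per stream. Since CUDA streams execute in order, waiting on
// the latest event of a stream implies every earlier event on that stream has
// completed, so the reduced wait set is equivalent and cheaper.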
2812 
2813 pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name,
2814  pi_kernel *kernel) {
2815  assert(kernel != nullptr);
2816  assert(program != nullptr);
2817 
2818  pi_result retErr = PI_SUCCESS;
2819  std::unique_ptr<_pi_kernel> retKernel{nullptr};
2820 
2821  try {
2822  ScopedContext active(program->get_context());
2823 
2824  CUfunction cuFunc;
2825  retErr = PI_CHECK_ERROR(
2826  cuModuleGetFunction(&cuFunc, program->get(), kernel_name));
2827 
2828  std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset";
2829  CUfunction cuFuncWithOffsetParam;
2830  CUresult offsetRes = cuModuleGetFunction(
2831  &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str());
2832 
2833  // If there is no kernel with global offset parameter we mark it as missing
2834  if (offsetRes == CUDA_ERROR_NOT_FOUND) {
2835  cuFuncWithOffsetParam = nullptr;
2836  } else {
2837  retErr = PI_CHECK_ERROR(offsetRes);
2838  }
2839 
2840  retKernel = std::unique_ptr<_pi_kernel>(
2841  new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program,
2842  program->get_context()});
2843  } catch (pi_result err) {
2844  retErr = err;
2845  } catch (...) {
2846  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2847  }
2848 
2849  *kernel = retKernel.release();
2850  return retErr;
2851 }
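// Note: the "<kernel_name>_with_offset" lookup relies on the device compiler
// emitting a second entry point that takes the global work offset as an
// implicit parameter; if it is absent (CUDA_ERROR_NOT_FOUND), the plugin
// stores a null handle and the launch path keeps using the plain variant.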
2852 
2853  pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index,
2854  size_t arg_size, const void *arg_value) {
2855 
2856  assert(kernel != nullptr);
2857  pi_result retErr = PI_SUCCESS;
2858  try {
2859  if (arg_value) {
2860  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2861  } else {
2862  kernel->set_kernel_local_arg(arg_index, arg_size);
2863  }
2864  } catch (pi_result err) {
2865  retErr = err;
2866  }
2867  return retErr;
2868 }
2869 
2870  pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,
2871  const pi_mem *arg_value) {
2872 
2873  assert(kernel != nullptr);
2874  assert(arg_value != nullptr);
2875 
2876  pi_result retErr = PI_SUCCESS;
2877  try {
2878  pi_mem arg_mem = *arg_value;
2879  if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) {
2880  CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
2881  PI_CHECK_ERROR(cuArray3DGetDescriptor(
2882  &arrayDesc, arg_mem->mem_.surface_mem_.get_array()));
2883  if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
2884  arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
2885  arrayDesc.Format != CU_AD_FORMAT_HALF &&
2886  arrayDesc.Format != CU_AD_FORMAT_FLOAT) {
2887  setErrorMessage("PI CUDA kernels only support images with channel "
2888  "types int32, uint32, float, and half.",
2889  PI_ERROR_PLUGIN_SPECIFIC_ERROR);
2890  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
2891  }
2892  CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface();
2893  kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf);
2894  } else {
2895  CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get();
2896  kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr);
2897  }
2898  } catch (pi_result err) {
2899  retErr = err;
2900  }
2901  return retErr;
2902 }
2903 
2904  pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index,
2905  const pi_sampler *arg_value) {
2906 
2907  assert(kernel != nullptr);
2908  assert(arg_value != nullptr);
2909 
2910  pi_result retErr = PI_SUCCESS;
2911  try {
2912  pi_uint32 samplerProps = (*arg_value)->props_;
2913  kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps);
2914  } catch (pi_result err) {
2915  retErr = err;
2916  }
2917  return retErr;
2918 }
2919 
2920  pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
2921  pi_kernel_group_info param_name,
2922  size_t param_value_size, void *param_value,
2923  size_t *param_value_size_ret) {
2924 
2925  // Here we query the kernel's CUDA block-related properties.
2926 
2927  if (kernel != nullptr) {
2928 
2929  switch (param_name) {
2930  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
2931  int max_threads = 0;
2932  sycl::detail::pi::assertion(
2933  cuFuncGetAttribute(&max_threads,
2934  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
2935  kernel->get()) == CUDA_SUCCESS);
2936  return getInfo(param_value_size, param_value, param_value_size_ret,
2937  size_t(max_threads));
2938  }
2939  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
2940  size_t group_size[3] = {0, 0, 0};
2941  const auto &reqd_wg_size_md_map =
2942  kernel->program_->kernelReqdWorkGroupSizeMD_;
2943  const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
2944  if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
2945  const auto reqd_wg_size = reqd_wg_size_md->second;
2946  group_size[0] = std::get<0>(reqd_wg_size);
2947  group_size[1] = std::get<1>(reqd_wg_size);
2948  group_size[2] = std::get<2>(reqd_wg_size);
2949  }
2950  return getInfoArray(3, param_value_size, param_value,
2951  param_value_size_ret, group_size);
2952  }
2953  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
2954  // OpenCL LOCAL == CUDA SHARED
2955  int bytes = 0;
2956  sycl::detail::pi::assertion(
2957  cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
2958  kernel->get()) == CUDA_SUCCESS);
2959  return getInfo(param_value_size, param_value, param_value_size_ret,
2960  pi_uint64(bytes));
2961  }
2962  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
2963  // Work groups should be multiples of the warp size
2964  int warpSize = 0;
2965  sycl::detail::pi::assertion(
2966  cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
2967  device->get()) == CUDA_SUCCESS);
2968  return getInfo(param_value_size, param_value, param_value_size_ret,
2969  static_cast<size_t>(warpSize));
2970  }
2971  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
2972  // OpenCL PRIVATE == CUDA LOCAL
2973  int bytes = 0;
2974  sycl::detail::pi::assertion(
2975  cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
2976  kernel->get()) == CUDA_SUCCESS);
2977  return getInfo(param_value_size, param_value, param_value_size_ret,
2978  pi_uint64(bytes));
2979  }
2980  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
2981  int numRegs = 0;
2982  sycl::detail::pi::assertion(
2983  cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
2984  kernel->get()) == CUDA_SUCCESS);
2985  return getInfo(param_value_size, param_value, param_value_size_ret,
2986  pi_uint32(numRegs));
2987  }
2988  default:
2989  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2990  }
2991  }
2992 
2993  return PI_ERROR_INVALID_KERNEL;
2994 }
2995 
2996  pi_result cuda_piEnqueueKernelLaunch(
2997  pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim,
2998  const size_t *global_work_offset, const size_t *global_work_size,
2999  const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
3000  const pi_event *event_wait_list, pi_event *event) {
3001 
3002  // Preconditions
3003  assert(command_queue != nullptr);
3004  assert(command_queue->get_context() == kernel->get_context());
3005  assert(kernel != nullptr);
3006  assert(global_work_offset != nullptr);
3007  assert(work_dim > 0);
3008  assert(work_dim < 4);
3009 
3010  if (*global_work_size == 0) {
3011  return cuda_piEnqueueEventsWaitWithBarrier(
3012  command_queue, num_events_in_wait_list, event_wait_list, event);
3013  }
3014 
3015  // Set the number of threads per block to the number of threads per warp
3016  // by default unless user has provided a better number
3017  size_t threadsPerBlock[3] = {32u, 1u, 1u};
3018  size_t maxWorkGroupSize = 0u;
3019  size_t maxThreadsPerBlock[3] = {};
3020  bool providedLocalWorkGroupSize = (local_work_size != nullptr);
3021  pi_uint32 local_size = kernel->get_local_size();
3022  pi_result retError = PI_SUCCESS;
3023 
3024  try {
3025  // Set the active context here as guessLocalWorkSize needs an active context
3026  ScopedContext active(command_queue->get_context());
3027  {
3028  size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_;
3029  maxWorkGroupSize = command_queue->device_->get_max_work_group_size();
3030  command_queue->device_->get_max_work_item_sizes(
3031  sizeof(maxThreadsPerBlock), maxThreadsPerBlock);
3032 
3033  if (providedLocalWorkGroupSize) {
3034  auto isValid = [&](int dim) {
3035  if (reqdThreadsPerBlock[dim] != 0 &&
3036  local_work_size[dim] != reqdThreadsPerBlock[dim])
3037  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3038 
3039  if (local_work_size[dim] > maxThreadsPerBlock[dim])
3040  return PI_ERROR_INVALID_WORK_ITEM_SIZE;
3041  // Checks that the local work sizes divide the global work sizes,
3042  // which implies that they are neither larger than the global work
3043  // sizes nor zero.
3044  if (0u == local_work_size[dim])
3045  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3046  if (0u != (global_work_size[dim] % local_work_size[dim]))
3047  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3048  threadsPerBlock[dim] = local_work_size[dim];
3049  return PI_SUCCESS;
3050  };
3051 
3052  for (size_t dim = 0; dim < work_dim; dim++) {
3053  auto err = isValid(dim);
3054  if (err != PI_SUCCESS)
3055  return err;
3056  }
3057  } else {
3058  guessLocalWorkSize(threadsPerBlock, global_work_size,
3059  maxThreadsPerBlock, kernel, local_size);
3060  }
3061  }
3062 
3063  if (maxWorkGroupSize <
3064  size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
3065  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3066  }
3067 
3068  size_t blocksPerGrid[3] = {1u, 1u, 1u};
3069 
3070  for (size_t i = 0; i < work_dim; i++) {
3071  blocksPerGrid[i] =
3072  (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
3073  }
3074 
3075  std::unique_ptr<_pi_event> retImplEv{nullptr};
3076 
3077  pi_uint32 stream_token;
3078  _pi_stream_guard guard;
3079  CUstream cuStream = command_queue->get_next_compute_stream(
3080  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3081  CUfunction cuFunc = kernel->get();
3082 
3083  retError = enqueueEventsWait(command_queue, cuStream,
3084  num_events_in_wait_list, event_wait_list);
3085 
3086  // Set the implicit global offset parameter if kernel has offset variant
3087  if (kernel->get_with_offset_parameter()) {
3088  std::uint32_t cuda_implicit_offset[3] = {0, 0, 0};
3089  if (global_work_offset) {
3090  for (size_t i = 0; i < work_dim; i++) {
3091  cuda_implicit_offset[i] =
3092  static_cast<std::uint32_t>(global_work_offset[i]);
3093  if (global_work_offset[i] != 0) {
3094  cuFunc = kernel->get_with_offset_parameter();
3095  }
3096  }
3097  }
3098  kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset),
3099  cuda_implicit_offset);
3100  }
3101 
3102  auto &argIndices = kernel->get_arg_indices();
3103 
3104  if (event) {
3105  retImplEv = std::unique_ptr<_pi_event>(
3106  _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue,
3107  cuStream, stream_token));
3108  retImplEv->start();
3109  }
3110 
3111  // Set local mem max size if env var is present
3112  static const char *local_mem_sz_ptr =
3113  std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE");
3114 
3115  if (local_mem_sz_ptr) {
3116  int device_max_local_mem = 0;
3117  cuDeviceGetAttribute(
3118  &device_max_local_mem,
3119  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
3120  command_queue->get_device()->get());
3121 
3122  static const int env_val = std::atoi(local_mem_sz_ptr);
3123  if (env_val <= 0 || env_val > device_max_local_mem) {
3124  setErrorMessage("Invalid value specified for "
3125  "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
3126  PI_ERROR_PLUGIN_SPECIFIC_ERROR);
3127  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3128  }
3129  PI_CHECK_ERROR(cuFuncSetAttribute(
3130  cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val));
3131  }
3132 
3133  retError = PI_CHECK_ERROR(cuLaunchKernel(
3134  cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
3135  threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size,
3136  cuStream, const_cast<void **>(argIndices.data()), nullptr));
3137  if (local_size != 0)
3138  kernel->clear_local_size();
3139 
3140  if (event) {
3141  retError = retImplEv->record();
3142  *event = retImplEv.release();
3143  }
3144  } catch (pi_result err) {
3145  retError = err;
3146  }
3147  return retError;
3148 }
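// Worked example of the grid math above: with global_work_size = {1000, 1, 1}
// and threadsPerBlock = {250, 1, 1}, blocksPerGrid[0] = (1000 + 250 - 1) / 250
// = 4, so cuLaunchKernel dispatches a 4 x 1 x 1 grid of 250-thread blocks.
// When local_work_size is user-provided, divisibility is validated above, so
// the ceiling division never yields partially filled trailing blocks.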
3149 
3151 pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t,
3152  pi_uint32, const pi_mem *, const void **,
3153  pi_uint32, const pi_event *, pi_event *) {
3154  sycl::detail::pi::die("Not implemented in CUDA backend");
3155  return {};
3156 }
3157 
3158  pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context,
3159  pi_program, bool,
3160  pi_kernel *) {
3161  sycl::detail::pi::die("Unsupported operation");
3162  return PI_SUCCESS;
3163 }
3164 
3166  pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags,
3167  const pi_image_format *image_format,
3168  const pi_image_desc *image_desc, void *host_ptr,
3169  pi_mem *ret_mem) {
3170  // Need input memory object
3171  assert(ret_mem != nullptr);
3172  const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
3173  ((flags & PI_MEM_FLAGS_HOST_PTR_USE));
3174  pi_result retErr = PI_SUCCESS;
3175 
3176  // We only support RGBA channel order
3177  // TODO: check SYCL CTS and spec. May also have to support BGRA
3178  if (image_format->image_channel_order !=
3179  pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) {
3180  sycl::detail::pi::die(
3181  "cuda_piMemImageCreate only supports RGBA channel order");
3182  }
3183 
3184  // We have to use cuArray3DCreate, which has some caveats. The height and
3185  // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc gives
3186  // a minimum value of 1, so we need to convert the answer.
3187  CUDA_ARRAY3D_DESCRIPTOR array_desc;
3188  array_desc.NumChannels = 4; // Only support 4 channel image
3189  array_desc.Flags = 0; // No flags required
3190  array_desc.Width = image_desc->image_width;
3191  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3192  array_desc.Height = 0;
3193  array_desc.Depth = 0;
3194  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3195  array_desc.Height = image_desc->image_height;
3196  array_desc.Depth = 0;
3197  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3198  array_desc.Height = image_desc->image_height;
3199  array_desc.Depth = image_desc->image_depth;
3200  }
3201 
3202  // We need to get this now in bytes for calculating the total image size later
3203  size_t pixel_type_size_bytes;
3204 
3205  switch (image_format->image_channel_data_type) {
3206  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8:
3207  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
3208  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
3209  pixel_type_size_bytes = 1;
3210  break;
3211  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
3212  array_desc.Format = CU_AD_FORMAT_SIGNED_INT8;
3213  pixel_type_size_bytes = 1;
3214  break;
3215  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16:
3216  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
3217  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
3218  pixel_type_size_bytes = 2;
3219  break;
3220  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
3221  array_desc.Format = CU_AD_FORMAT_SIGNED_INT16;
3222  pixel_type_size_bytes = 2;
3223  break;
3224  case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
3225  array_desc.Format = CU_AD_FORMAT_HALF;
3226  pixel_type_size_bytes = 2;
3227  break;
3228  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
3229  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
3230  pixel_type_size_bytes = 4;
3231  break;
3232  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
3233  array_desc.Format = CU_AD_FORMAT_SIGNED_INT32;
3234  pixel_type_size_bytes = 4;
3235  break;
3236  case PI_IMAGE_CHANNEL_TYPE_FLOAT:
3237  array_desc.Format = CU_AD_FORMAT_FLOAT;
3238  pixel_type_size_bytes = 4;
3239  break;
3240  default:
3241  sycl::detail::pi::die(
3242  "cuda_piMemImageCreate given unsupported image_channel_data_type");
3243  }
3244 
3245  // When a dimension isn't used image_desc has the size set to 1
3246  size_t pixel_size_bytes =
3247  pixel_type_size_bytes * 4; // 4 is the only number of channels we support
3248  size_t image_size_bytes = pixel_size_bytes * image_desc->image_width *
3249  image_desc->image_height * image_desc->image_depth;
3250 
3251  ScopedContext active(context);
3252  CUarray image_array;
3253  retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc));
3254 
3255  try {
3256  if (performInitialCopy) {
3257  // We have to use a different copy function for each image dimensionality
3258  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3259  retErr = PI_CHECK_ERROR(
3260  cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes));
3261  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3262  CUDA_MEMCPY2D cpy_desc;
3263  memset(&cpy_desc, 0, sizeof(cpy_desc));
3264  cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3265  cpy_desc.srcHost = host_ptr;
3266  cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3267  cpy_desc.dstArray = image_array;
3268  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3269  cpy_desc.Height = image_desc->image_height;
3270  retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc));
3271  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3272  CUDA_MEMCPY3D cpy_desc;
3273  memset(&cpy_desc, 0, sizeof(cpy_desc));
3274  cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3275  cpy_desc.srcHost = host_ptr;
3276  cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3277  cpy_desc.dstArray = image_array;
3278  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3279  cpy_desc.Height = image_desc->image_height;
3280  cpy_desc.Depth = image_desc->image_depth;
3281  retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc));
3282  }
3283  }
3284 
3285  // CUDA_RESOURCE_DESC is a union of different structs, shown here
3286  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
3287  // We need to fill it as described here to use it for a surface or texture
3288  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
3289  // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
3290  // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array
3291  // handle.
3292  // CUDA_RESOURCE_DESC::flags must be set to zero
3293 
3294  CUDA_RESOURCE_DESC image_res_desc;
3295  image_res_desc.res.array.hArray = image_array;
3296  image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
3297  image_res_desc.flags = 0;
3298 
3299  CUsurfObject surface;
3300  retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
3301 
3302  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
3303  context, image_array, surface, image_desc->image_type, host_ptr});
3304 
3305  if (piMemObj == nullptr) {
3306  return PI_ERROR_OUT_OF_HOST_MEMORY;
3307  }
3308 
3309  *ret_mem = piMemObj.release();
3310  } catch (pi_result err) {
3311  cuArrayDestroy(image_array);
3312  return err;
3313  } catch (...) {
3314  cuArrayDestroy(image_array);
3315  return PI_ERROR_UNKNOWN;
3316  }
3317 
3318  return retErr;
3319 }
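// Design note: images are backed by a CUDA array (cuArray3DCreate) plus a
// surface object (cuSurfObjectCreate) rather than a texture object, so one
// handle supports both reads and writes from kernels; sampling state is passed
// separately as a kernel argument (see cuda_piextKernelSetArgSampler above).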
3320 
3322  pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *,
3323  size_t *) {
3324  sycl::detail::pi::die("cuda_piMemImageGetInfo not implemented");
3325  return {};
3326 }
3327 
3328  pi_result cuda_piMemRetain(pi_mem mem) {
3329  assert(mem != nullptr);
3330  assert(mem->get_reference_count() > 0);
3331  mem->increment_reference_count();
3332  return PI_SUCCESS;
3333 }
3334 
3338  pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **,
3339  const size_t *, pi_program *) {
3340  sycl::detail::pi::cuPrint("cuda_piclProgramCreateWithSource not implemented");
3341  return PI_ERROR_INVALID_OPERATION;
3342 }
3343 
3348  pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices,
3349  const pi_device *device_list, const char *options,
3350  void (*pfn_notify)(pi_program program,
3351  void *user_data),
3352  void *user_data) {
3353 
3354  assert(program != nullptr);
3355  assert(num_devices == 1 || num_devices == 0);
3356  assert(device_list != nullptr || num_devices == 0);
3357  assert(pfn_notify == nullptr);
3358  assert(user_data == nullptr);
3359  pi_result retError = PI_SUCCESS;
3360 
3361  try {
3362  ScopedContext active(program->get_context());
3363 
3364  program->build_program(options);
3365 
3366  } catch (pi_result err) {
3367  retError = err;
3368  }
3369  return retError;
3370 }
3371 
3374  sycl::detail::pi::die("cuda_piProgramCreate not implemented");
3375  return {};
3376 }
3377 
3384  pi_result cuda_piProgramCreateWithBinary(
3385  pi_context context, pi_uint32 num_devices, const pi_device *device_list,
3386  const size_t *lengths, const unsigned char **binaries,
3387  size_t num_metadata_entries, const pi_device_binary_property *metadata,
3388  pi_int32 *binary_status, pi_program *program) {
3389  // Ignore unused parameter
3390  (void)binary_status;
3391 
3392  assert(context != nullptr);
3393  assert(binaries != nullptr);
3394  assert(program != nullptr);
3395  assert(device_list != nullptr);
3396  assert(num_devices == 1 && "CUDA contexts are for a single device");
3397  assert((context->get_device()->get() == device_list[0]->get()) &&
3398  "Mismatch between devices context and passed context when creating "
3399  "program from binary");
3400 
3401  pi_result retError = PI_SUCCESS;
3402 
3403  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3404 
3405  retProgram->set_metadata(metadata, num_metadata_entries);
3406 
3407  const bool has_length = (lengths != nullptr);
3408  size_t length = has_length
3409  ? lengths[0]
3410  : strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
3411 
3412  assert(length != 0);
3413 
3414  retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
3415 
3416  *program = retProgram.release();
3417 
3418  return retError;
3419 }
3420 
3421  pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
3422  size_t param_value_size, void *param_value,
3423  size_t *param_value_size_ret) {
3424  assert(program != nullptr);
3425 
3426  switch (param_name) {
3427  case PI_PROGRAM_INFO_REFERENCE_COUNT:
3428  return getInfo(param_value_size, param_value, param_value_size_ret,
3429  program->get_reference_count());
3430  case PI_PROGRAM_INFO_CONTEXT:
3431  return getInfo(param_value_size, param_value, param_value_size_ret,
3432  program->context_);
3433  case PI_PROGRAM_INFO_NUM_DEVICES:
3434  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3435  case PI_PROGRAM_INFO_DEVICES:
3436  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3437  &program->context_->deviceId_);
3438  case PI_PROGRAM_INFO_SOURCE:
3439  return getInfo(param_value_size, param_value, param_value_size_ret,
3440  program->binary_);
3441  case PI_PROGRAM_INFO_BINARY_SIZES:
3442  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3443  &program->binarySizeInBytes_);
3444  case PI_PROGRAM_INFO_BINARIES:
3445  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3446  &program->binary_);
3447  case PI_PROGRAM_INFO_KERNEL_NAMES: {
3448  return getInfo(param_value_size, param_value, param_value_size_ret,
3449  getKernelNames(program).c_str());
3450  }
3451  default:
3452  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3453  }
3454  sycl::detail::pi::die("Program info request not implemented");
3455  return {};
3456 }
3457 
3462  pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
3463  const pi_device *device_list, const char *options,
3464  pi_uint32 num_input_programs,
3465  const pi_program *input_programs,
3466  void (*pfn_notify)(pi_program program,
3467  void *user_data),
3468  void *user_data, pi_program *ret_program) {
3469 
3470  assert(ret_program != nullptr);
3471  assert(num_devices == 1 || num_devices == 0);
3472  assert(device_list != nullptr || num_devices == 0);
3473  assert(pfn_notify == nullptr);
3474  assert(user_data == nullptr);
3475  pi_result retError = PI_SUCCESS;
3476 
3477  try {
3478  ScopedContext active(context);
3479 
3480  CUlinkState state;
3481  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3482 
3483  retError = PI_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state));
3484  try {
3485  for (size_t i = 0; i < num_input_programs; ++i) {
3486  pi_program program = input_programs[i];
3487  retError = PI_CHECK_ERROR(cuLinkAddData(
3488  state, CU_JIT_INPUT_PTX, const_cast<char *>(program->binary_),
3489  program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr));
3490  }
3491  void *cubin = nullptr;
3492  size_t cubinSize = 0;
3493  retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));
3494 
3495  retError =
3496  retProgram->set_binary(static_cast<const char *>(cubin), cubinSize);
3497 
3498  if (retError != PI_SUCCESS) {
3499  return retError;
3500  }
3501 
3502  retError = retProgram->build_program(options);
3503 
3504  if (retError != PI_SUCCESS) {
3505  return retError;
3506  }
3507  } catch (...) {
3508  // Upon error attempt cleanup
3509  PI_CHECK_ERROR(cuLinkDestroy(state));
3510  throw;
3511  }
3512 
3513  retError = PI_CHECK_ERROR(cuLinkDestroy(state));
3514  *ret_program = retProgram.release();
3515 
3516  } catch (pi_result err) {
3517  retError = err;
3518  }
3519  return retError;
3520 }
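// A sketch of the underlying driver JIT-link flow used above (simplified,
// error handling omitted):
//
//   CUlinkState st;
//   cuLinkCreate(0, nullptr, nullptr, &st);
//   cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx, ptxSize,
//                 nullptr, 0, nullptr, nullptr);   // once per input program
//   void *cubin; size_t cubinSize;
//   cuLinkComplete(st, &cubin, &cubinSize);        // owned by the link state
//   // consume the cubin before cuLinkDestroy(st) invalidates it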
3521 
3526  pi_result cuda_piProgramCompile(
3527  pi_program program, pi_uint32 num_devices, const pi_device *device_list,
3528  const char *options, pi_uint32 num_input_headers,
3529  const pi_program *input_headers, const char **header_include_names,
3530  void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
3531  // Ignore unused parameters
3532  (void)header_include_names;
3533  (void)input_headers;
3534 
3535  assert(program != nullptr);
3536  assert(num_devices == 1 || num_devices == 0);
3537  assert(device_list != nullptr || num_devices == 0);
3538  assert(pfn_notify == nullptr);
3539  assert(user_data == nullptr);
3540  assert(num_input_headers == 0);
3541  pi_result retError = PI_SUCCESS;
3542 
3543  try {
3544  ScopedContext active(program->get_context());
3545 
3546  program->build_program(options);
3547 
3548  } catch (pi_result err) {
3549  retError = err;
3550  }
3551  return retError;
3552 }
3553 
3554  pi_result cuda_piProgramGetBuildInfo(pi_program program, pi_device device,
3555  pi_program_build_info param_name,
3556  size_t param_value_size, void *param_value,
3557  size_t *param_value_size_ret) {
3558  // Ignore unused parameter
3559  (void)device;
3560 
3561  assert(program != nullptr);
3562 
3563  switch (param_name) {
3564  case PI_PROGRAM_BUILD_INFO_STATUS: {
3565  return getInfo(param_value_size, param_value, param_value_size_ret,
3566  program->buildStatus_);
3567  }
3568  case PI_PROGRAM_BUILD_INFO_OPTIONS:
3569  return getInfo(param_value_size, param_value, param_value_size_ret,
3570  program->buildOptions_.c_str());
3571  case PI_PROGRAM_BUILD_INFO_LOG:
3572  return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value,
3573  param_value_size_ret, program->infoLog_);
3574  default:
3575  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3576  }
3577  sycl::detail::pi::die("Program Build info request not implemented");
3578  return {};
3579 }
3580 
3581  pi_result cuda_piProgramRetain(pi_program program) {
3582  assert(program != nullptr);
3583  assert(program->get_reference_count() > 0);
3584  program->increment_reference_count();
3585  return PI_SUCCESS;
3586 }
3587 
3591  pi_result cuda_piProgramRelease(pi_program program) {
3592  assert(program != nullptr);
3593 
3594  // double delete or someone is messing with the ref count.
3595  // either way, cannot safely proceed.
3596  assert(program->get_reference_count() != 0 &&
3597  "Reference count overflow detected in cuda_piProgramRelease.");
3598 
3599  // decrement ref count. If it is 0, delete the program.
3600  if (program->decrement_reference_count() == 0) {
3601 
3602  std::unique_ptr<_pi_program> program_ptr{program};
3603 
3604  pi_result result = PI_ERROR_INVALID_PROGRAM;
3605 
3606  try {
3607  ScopedContext active(program->get_context());
3608  auto cuModule = program->get();
3609  result = PI_CHECK_ERROR(cuModuleUnload(cuModule));
3610  } catch (...) {
3611  result = PI_ERROR_OUT_OF_RESOURCES;
3612  }
3613 
3614  return result;
3615  }
3616 
3617  return PI_SUCCESS;
3618 }
3619 
3626  pi_result cuda_piextProgramGetNativeHandle(pi_program program,
3627  pi_native_handle *nativeHandle) {
3628  *nativeHandle = reinterpret_cast<pi_native_handle>(program->get());
3629  return PI_SUCCESS;
3630 }
3631 
3641  pi_result cuda_piextProgramCreateWithNativeHandle(pi_native_handle, pi_context,
3642  bool, pi_program *) {
3643  sycl::detail::pi::die(
3644  "Creation of PI program from native handle not implemented");
3645  return {};
3646 }
3647 
3648  pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name,
3649  size_t param_value_size, void *param_value,
3650  size_t *param_value_size_ret) {
3651 
3652  if (kernel != nullptr) {
3653 
3654  switch (param_name) {
3655  case PI_KERNEL_INFO_FUNCTION_NAME:
3656  return getInfo(param_value_size, param_value, param_value_size_ret,
3657  kernel->get_name());
3658  case PI_KERNEL_INFO_NUM_ARGS:
3659  return getInfo(param_value_size, param_value, param_value_size_ret,
3660  kernel->get_num_args());
3661  case PI_KERNEL_INFO_REFERENCE_COUNT:
3662  return getInfo(param_value_size, param_value, param_value_size_ret,
3663  kernel->get_reference_count());
3664  case PI_KERNEL_INFO_CONTEXT: {
3665  return getInfo(param_value_size, param_value, param_value_size_ret,
3666  kernel->get_context());
3667  }
3668  case PI_KERNEL_INFO_PROGRAM: {
3669  return getInfo(param_value_size, param_value, param_value_size_ret,
3670  kernel->get_program());
3671  }
3672  case PI_KERNEL_INFO_ATTRIBUTES: {
3673  return getInfo(param_value_size, param_value, param_value_size_ret, "");
3674  }
3675  default: {
3676  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3677  }
3678  }
3679  }
3680 
3681  return PI_ERROR_INVALID_KERNEL;
3682 }
3683 
3684  pi_result cuda_piKernelGetSubGroupInfo(
3685  pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
3686  size_t input_value_size, const void *input_value, size_t param_value_size,
3687  void *param_value, size_t *param_value_size_ret) {
3688  // Ignore unused parameters
3689  (void)input_value_size;
3690  (void)input_value;
3691 
3692  if (kernel != nullptr) {
3693  switch (param_name) {
3694  case PI_KERNEL_MAX_SUB_GROUP_SIZE: {
3695  // Sub-group size is equivalent to warp size
3696  int warpSize = 0;
3697  sycl::detail::pi::assertion(
3698  cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
3699  device->get()) == CUDA_SUCCESS);
3700  return getInfo(param_value_size, param_value, param_value_size_ret,
3701  static_cast<uint32_t>(warpSize));
3702  }
3703  case PI_KERNEL_MAX_NUM_SUB_GROUPS: {
3704  // Number of sub-groups = max block size / warp size + possible remainder
3705  int max_threads = 0;
3706  sycl::detail::pi::assertion(
3707  cuFuncGetAttribute(&max_threads,
3708  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3709  kernel->get()) == CUDA_SUCCESS);
3710  int warpSize = 0;
3711  cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
3712  0, nullptr, sizeof(uint32_t), &warpSize,
3713  nullptr);
3714  int maxWarps = (max_threads + warpSize - 1) / warpSize;
3715  return getInfo(param_value_size, param_value, param_value_size_ret,
3716  static_cast<uint32_t>(maxWarps));
3717  }
3718  case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: {
3719  // Return value of 0 => not specified
3720  // TODO: Revisit if PTX is generated for compile-time work-group sizes
3721  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3722  }
3723  case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
3724  // Return value of 0 => unspecified or "auto" sub-group size
3725  // Correct for now, since warp size may be read from special register
3726  // TODO: Return warp size once default is primary sub-group size
3727  // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
3728  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3729  }
3730  default:
3731  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3732  }
3733  }
3734  return PI_ERROR_INVALID_KERNEL;
3735 }
3736 
3737  pi_result cuda_piKernelRetain(pi_kernel kernel) {
3738  assert(kernel != nullptr);
3739  assert(kernel->get_reference_count() > 0u);
3740 
3741  kernel->increment_reference_count();
3742  return PI_SUCCESS;
3743 }
3744 
3745  pi_result cuda_piKernelRelease(pi_kernel kernel) {
3746  assert(kernel != nullptr);
3747 
3748  // double delete or someone is messing with the ref count.
3749  // either way, cannot safely proceed.
3750  assert(kernel->get_reference_count() != 0 &&
3751  "Reference count overflow detected in cuda_piKernelRelease.");
3752 
3753  // decrement ref count. If it is 0, delete the kernel.
3754  if (kernel->decrement_reference_count() == 0) {
3755  // no internal cuda resources to clean up. Just delete it.
3756  delete kernel;
3757  return PI_SUCCESS;
3758  }
3759 
3760  return PI_SUCCESS;
3761 }
3762 
3763 // A NOP for the CUDA backend
3764  pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t,
3765  const void *) {
3766  return PI_SUCCESS;
3767 }
3768 
3769  pi_result cuda_piextProgramSetSpecializationConstant(pi_program, pi_uint32,
3770  size_t, const void *) {
3771  // This entry point is only used for native specialization constants (SPIR-V),
3772  // and the CUDA plugin is AOT only so this entry point is not supported.
3773  sycl::detail::pi::die("Native specialization constants are not supported");
3774  return {};
3775 }
3776 
3777  pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index,
3778  size_t arg_size,
3779  const void *arg_value) {
3780  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3781  return PI_SUCCESS;
3782 }
3783 
3784 //
3785 // Events
3786 //
3788  sycl::detail::pi::die("PI Event Create not implemented in CUDA backend");
3789 }
3790 
3791  pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
3792  size_t param_value_size, void *param_value,
3793  size_t *param_value_size_ret) {
3794  assert(event != nullptr);
3795 
3796  switch (param_name) {
3797  case PI_EVENT_INFO_COMMAND_QUEUE:
3798  return getInfo(param_value_size, param_value, param_value_size_ret,
3799  event->get_queue());
3800  case PI_EVENT_INFO_COMMAND_TYPE:
3801  return getInfo(param_value_size, param_value, param_value_size_ret,
3802  event->get_command_type());
3803  case PI_EVENT_INFO_REFERENCE_COUNT:
3804  return getInfo(param_value_size, param_value, param_value_size_ret,
3805  event->get_reference_count());
3806  case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
3807  return getInfo(param_value_size, param_value, param_value_size_ret,
3808  static_cast<pi_event_status>(event->get_execution_status()));
3809  }
3810  case PI_EVENT_INFO_CONTEXT:
3811  return getInfo(param_value_size, param_value, param_value_size_ret,
3812  event->get_context());
3813  default:
3814  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3815  }
3816 
3817  return PI_ERROR_INVALID_EVENT;
3818 }
3819 
3822  pi_result cuda_piEventGetProfilingInfo(pi_event event,
3823  pi_profiling_info param_name,
3824  size_t param_value_size,
3825  void *param_value,
3826  size_t *param_value_size_ret) {
3827 
3828  assert(event != nullptr);
3829 
3830  pi_queue queue = event->get_queue();
3831  if (queue == nullptr || !(queue->properties_ & PI_QUEUE_PROFILING_ENABLE)) {
3832  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3833  }
3834 
3835  switch (param_name) {
3836  case PI_PROFILING_INFO_COMMAND_QUEUED:
3837  case PI_PROFILING_INFO_COMMAND_SUBMIT:
3838  return getInfo<pi_uint64>(param_value_size, param_value,
3839  param_value_size_ret, event->get_queued_time());
3840  case PI_PROFILING_INFO_COMMAND_START:
3841  return getInfo<pi_uint64>(param_value_size, param_value,
3842  param_value_size_ret, event->get_start_time());
3843  case PI_PROFILING_INFO_COMMAND_END:
3844  return getInfo<pi_uint64>(param_value_size, param_value,
3845  param_value_size_ret, event->get_end_time());
3846  default:
3847  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3848  }
3849  sycl::detail::pi::die("Event Profiling info request not implemented");
3850  return {};
3851 }
3852 
3854  sycl::detail::pi::die("Event Callback not implemented in CUDA backend");
3855  return PI_SUCCESS;
3856 }
3857 
3859  sycl::detail::pi::die("Event Set Status not implemented in CUDA backend");
3860  return PI_ERROR_INVALID_VALUE;
3861 }
3862 
3863  pi_result cuda_piEventRetain(pi_event event) {
3864  assert(event != nullptr);
3865 
3866  const auto refCount = event->increment_reference_count();
3867 
3868  sycl::detail::pi::assertion(
3869  refCount != 0,
3870  "Reference count overflow detected in cuda_piEventRetain.");
3871 
3872  return PI_SUCCESS;
3873 }
3874 
3875  pi_result cuda_piEventRelease(pi_event event) {
3876  assert(event != nullptr);
3877 
3878  // double delete or someone is messing with the ref count.
3879  // either way, cannot safely proceed.
3880  sycl::detail::pi::assertion(
3881  event->get_reference_count() != 0,
3882  "Reference count overflow detected in cuda_piEventRelease.");
3883 
3884  // decrement ref count. If it is 0, delete the event.
3885  if (event->decrement_reference_count() == 0) {
3886  std::unique_ptr<_pi_event> event_ptr{event};
3887  pi_result result = PI_ERROR_INVALID_EVENT;
3888  try {
3889  ScopedContext active(event->get_context());
3890  result = event->release();
3891  } catch (...) {
3892  result = PI_ERROR_OUT_OF_RESOURCES;
3893  }
3894  return result;
3895  }
3896 
3897  return PI_SUCCESS;
3898 }
3899 
3905  pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
3906  pi_uint32 num_events_in_wait_list,
3907  const pi_event *event_wait_list,
3908  pi_event *event) {
3909  return cuda_piEnqueueEventsWaitWithBarrier(
3910  command_queue, num_events_in_wait_list, event_wait_list, event);
3911 }
3912 
3925  pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
3926  pi_uint32 num_events_in_wait_list,
3927  const pi_event *event_wait_list,
3928  pi_event *event) {
3929  // This function makes one stream wait on the previous work (or on the work
3930  // represented by the input events) and then makes all future work wait on that stream.
3931  if (!command_queue) {
3932  return PI_ERROR_INVALID_QUEUE;
3933  }
3934 
3935  pi_result result;
3936 
3937  try {
3938  ScopedContext active(command_queue->get_context());
3939  pi_uint32 stream_token;
3940  _pi_stream_guard guard;
3941  CUstream cuStream = command_queue->get_next_compute_stream(
3942  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3943  {
3944  std::lock_guard<std::mutex> guard(command_queue->barrier_mutex_);
3945  if (command_queue->barrier_event_ == nullptr) {
3946  PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_event_,
3947  CU_EVENT_DISABLE_TIMING));
3948  }
3949  if (num_events_in_wait_list == 0) { // wait on all work
3950  if (command_queue->barrier_tmp_event_ == nullptr) {
3951  PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_tmp_event_,
3952  CU_EVENT_DISABLE_TIMING));
3953  }
3954  command_queue->sync_streams(
3955  [cuStream,
3956  tmp_event = command_queue->barrier_tmp_event_](CUstream s) {
3957  if (cuStream != s) {
3958  // record a new CUDA event on every stream and make one stream
3959  // wait for these events
3960  PI_CHECK_ERROR(cuEventRecord(tmp_event, s));
3961  PI_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0));
3962  }
3963  });
3964  } else { // wait just on given events
3965  forLatestEvents(event_wait_list, num_events_in_wait_list,
3966  [cuStream](pi_event event) -> pi_result {
3967  if (event->get_queue()->has_been_synchronized(
3968  event->get_compute_stream_token())) {
3969  return PI_SUCCESS;
3970  } else {
3971  return PI_CHECK_ERROR(
3972  cuStreamWaitEvent(cuStream, event->get(), 0));
3973  }
3974  });
3975  }
3976 
3977  result = PI_CHECK_ERROR(
3978  cuEventRecord(command_queue->barrier_event_, cuStream));
3979  for (unsigned int i = 0;
3980  i < command_queue->compute_applied_barrier_.size(); i++) {
3981  command_queue->compute_applied_barrier_[i] = false;
3982  }
3983  for (unsigned int i = 0;
3984  i < command_queue->transfer_applied_barrier_.size(); i++) {
3985  command_queue->transfer_applied_barrier_[i] = false;
3986  }
3987  }
3988  if (result != PI_SUCCESS) {
3989  return result;
3990  }
3991 
3992  if (event) {
3993  *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue,
3994  cuStream, stream_token);
3995  (*event)->start();
3996  (*event)->record();
3997  }
3998 
3999  return PI_SUCCESS;
4000  } catch (pi_result err) {
4001  return err;
4002  } catch (...) {
4003  return PI_ERROR_UNKNOWN;
4004  }
4005 }
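// For illustration, a minimal standalone sketch of the idiom used above,
// assuming two streams s0 and s1 and an event ev created with
// cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING) (names are placeholders):
//
//   cuEventRecord(ev, s0);        // capture the tail of s0's queued work
//   cuStreamWaitEvent(s1, ev, 0); // s1 stalls here until ev has completed
//
// The barrier applies this pairwise across every compute and transfer stream
// of the queue, then records barrier_event_ on the chosen stream so later
// commands only need to wait on that single event.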
4006 
4013 pi_result cuda_piextEventGetNativeHandle(pi_event event,
4014  pi_native_handle *nativeHandle) {
4015  *nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
4016  return PI_SUCCESS;
4017 }
4018 
4027 pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle,
4028  pi_context context,
4029  bool ownNativeHandle,
4030  pi_event *event) {
4031  (void)ownNativeHandle;
4032  assert(!ownNativeHandle);
4033 
4034  std::unique_ptr<_pi_event> event_ptr{nullptr};
4035 
4036  *event = _pi_event::make_with_native(context,
4037  reinterpret_cast<CUevent>(nativeHandle));
4038 
4039  return PI_SUCCESS;
4040 }
4041 
4051 pi_result cuda_piSamplerCreate(pi_context context,
4052  const pi_sampler_properties *sampler_properties,
4053  pi_sampler *result_sampler) {
4054  std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)};
4055 
4056  bool propSeen[3] = {false, false, false};
4057  for (size_t i = 0; sampler_properties[i] != 0; i += 2) {
4058  switch (sampler_properties[i]) {
4059  case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS:
4060  if (propSeen[0]) {
4061  return PI_ERROR_INVALID_VALUE;
4062  }
4063  propSeen[0] = true;
4064  retImplSampl->props_ |= sampler_properties[i + 1];
4065  break;
4066  case PI_SAMPLER_PROPERTIES_FILTER_MODE:
4067  if (propSeen[1]) {
4068  return PI_ERROR_INVALID_VALUE;
4069  }
4070  propSeen[1] = true;
4071  retImplSampl->props_ |=
4072  (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
4073  break;
4074  case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE:
4075  if (propSeen[2]) {
4076  return PI_ERROR_INVALID_VALUE;
4077  }
4078  propSeen[2] = true;
4079  retImplSampl->props_ |=
4080  (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
4081  break;
4082  default:
4083  return PI_ERROR_INVALID_VALUE;
4084  }
4085  }
4086 
4087  if (!propSeen[0]) {
4088  retImplSampl->props_ |= PI_TRUE;
4089  }
4090  // Default addressing mode to PI_SAMPLER_ADDRESSING_MODE_CLAMP
4091  if (!propSeen[2]) {
4092  retImplSampl->props_ |=
4093  (PI_SAMPLER_ADDRESSING_MODE_CLAMP - PI_SAMPLER_ADDRESSING_MODE_NONE)
4094  << 2;
4095  }
4096 
4097  *result_sampler = retImplSampl.release();
4098  return PI_SUCCESS;
4099 }
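// For reference, the bit layout packed into props_ above (and decoded again
// in cuda_piSamplerGetInfo below):
//
//   bit 0   normalized-coords flag (PI_TRUE / PI_FALSE); defaults to PI_TRUE
//   bit 1   filter mode, stored as (mode - PI_SAMPLER_FILTER_MODE_NEAREST),
//           so the NEAREST default encodes as 0 and needs no explicit write
//   bits 2+ addressing mode, stored as (mode - PI_SAMPLER_ADDRESSING_MODE_NONE)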
4100 
4110 pi_result cuda_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name,
4111  size_t param_value_size, void *param_value,
4112  size_t *param_value_size_ret) {
4113  assert(sampler != nullptr);
4114 
4115  switch (param_name) {
4116  case PI_SAMPLER_INFO_REFERENCE_COUNT:
4117  return getInfo(param_value_size, param_value, param_value_size_ret,
4118  sampler->get_reference_count());
4119  case PI_SAMPLER_INFO_CONTEXT:
4120  return getInfo(param_value_size, param_value, param_value_size_ret,
4121  sampler->context_);
4122  case PI_SAMPLER_INFO_NORMALIZED_COORDS: {
4123  pi_bool norm_coords_prop = static_cast<pi_bool>(sampler->props_ & 0x1);
4124  return getInfo(param_value_size, param_value, param_value_size_ret,
4125  norm_coords_prop);
4126  }
4127  case PI_SAMPLER_INFO_FILTER_MODE: {
4128  pi_sampler_filter_mode filter_prop = static_cast<pi_sampler_filter_mode>(
4129  ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST);
4130  return getInfo(param_value_size, param_value, param_value_size_ret,
4131  filter_prop);
4132  }
4133  case PI_SAMPLER_INFO_ADDRESSING_MODE: {
4134  pi_sampler_addressing_mode addressing_prop =
4135  static_cast<pi_sampler_addressing_mode>(
4136  (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE);
4137  return getInfo(param_value_size, param_value, param_value_size_ret,
4138  addressing_prop);
4139  }
4140  default:
4141  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
4142  }
4143  return {};
4144 }
4145 
4151 pi_result cuda_piSamplerRetain(pi_sampler sampler) {
4152  assert(sampler != nullptr);
4153  sampler->increment_reference_count();
4154  return PI_SUCCESS;
4155 }
4156 
4163 pi_result cuda_piSamplerRelease(pi_sampler sampler) {
4164  assert(sampler != nullptr);
4165 
4166  // double delete or someone is messing with the ref count.
4167  // either way, cannot safely proceed.
4168  sycl::detail::pi::assertion(
4169  sampler->get_reference_count() != 0,
4170  "Reference count overflow detected in cuda_piSamplerRelease.");
4171 
4172  // decrement ref count. If it is 0, delete the sampler.
4173  if (sampler->decrement_reference_count() == 0) {
4174  delete sampler;
4175  }
4176 
4177  return PI_SUCCESS;
4178 }
4179 
4185 static pi_result commonEnqueueMemBufferCopyRect(
4186  CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr,
4187  const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset,
4188  size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
4189  const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
4190  size_t dst_row_pitch, size_t dst_slice_pitch) {
4191 
4192  assert(region != nullptr);
4193  assert(src_offset != nullptr);
4194  assert(dst_offset != nullptr);
4195 
4196  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
4197  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
4198 
4199  src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes
4200  : src_row_pitch;
4201  src_slice_pitch =
4202  (!src_slice_pitch)
4203  ? ((region->height_scalar + src_offset->y_scalar) * src_row_pitch)
4204  : src_slice_pitch;
4205  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes + dst_offset->x_bytes
4206  : dst_row_pitch;
4207  dst_slice_pitch =
4208  (!dst_slice_pitch)
4209  ? ((region->height_scalar + dst_offset->y_scalar) * dst_row_pitch)
4210  : dst_slice_pitch;
4211 
4212  CUDA_MEMCPY3D params = {};
4213 
4214  params.WidthInBytes = region->width_bytes;
4215  params.Height = region->height_scalar;
4216  params.Depth = region->depth_scalar;
4217 
4218  params.srcMemoryType = src_type;
4219  params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
4220  ? *static_cast<const CUdeviceptr *>(src_ptr)
4221  : 0;
4222  params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
4223  params.srcXInBytes = src_offset->x_bytes;
4224  params.srcY = src_offset->y_scalar;
4225  params.srcZ = src_offset->z_scalar;
4226  params.srcPitch = src_row_pitch;
4227  params.srcHeight = src_slice_pitch / src_row_pitch;
4228 
4229  params.dstMemoryType = dst_type;
4230  params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
4231  ? *static_cast<CUdeviceptr *>(dst_ptr)
4232  : 0;
4233  params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
4234  params.dstXInBytes = dst_offset->x_bytes;
4235  params.dstY = dst_offset->y_scalar;
4236  params.dstZ = dst_offset->z_scalar;
4237  params.dstPitch = dst_row_pitch;
4238  params.dstHeight = dst_slice_pitch / dst_row_pitch;
4239 
4240  return PI_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
4241 }
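// Worked example of the pitch defaulting above: a copy with
// region = {width_bytes = 16, height_scalar = 4, depth_scalar = 2} and
// src_offset = {x_bytes = 8, y_scalar = 1, z_scalar = 0}, with both source
// pitches passed as zero, resolves to
//
//   src_row_pitch   = 16 + 8       = 24 bytes
//   src_slice_pitch = (4 + 1) * 24 = 120 bytes
//
// so params.srcHeight = 120 / 24 = 5 rows per slice, a tight layout that
// still leaves room for the X/Y offsets.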
4242 
4243 pi_result cuda_piEnqueueMemBufferReadRect(
4244  pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
4245  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4246  pi_buff_rect_region region, size_t buffer_row_pitch,
4247  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4248  void *ptr, pi_uint32 num_events_in_wait_list,
4249  const pi_event *event_wait_list, pi_event *event) {
4250 
4251  assert(buffer != nullptr);
4252  assert(command_queue != nullptr);
4253 
4254  pi_result retErr = PI_SUCCESS;
4255  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
4256  std::unique_ptr<_pi_event> retImplEv{nullptr};
4257 
4258  try {
4259  ScopedContext active(command_queue->get_context());
4260  CUstream cuStream = command_queue->get_next_transfer_stream();
4261 
4262  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4263  event_wait_list);
4264 
4265  if (event) {
4266  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4267  PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream));
4268  retImplEv->start();
4269  }
4270 
4271  retErr = commonEnqueueMemBufferCopyRect(
4272  cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4273  buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST,
4274  host_offset, host_row_pitch, host_slice_pitch);
4275 
4276  if (event) {
4277  retErr = retImplEv->record();
4278  }
4279 
4280  if (blocking_read) {
4281  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4282  }
4283 
4284  if (event) {
4285  *event = retImplEv.release();
4286  }
4287 
4288  } catch (pi_result err) {
4289  retErr = err;
4290  }
4291  return retErr;
4292 }
4293 
4294 pi_result cuda_piEnqueueMemBufferWriteRect(
4295  pi_queue command_queue, pi_mem buffer, pi_bool blocking_write,
4296  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4297  pi_buff_rect_region region, size_t buffer_row_pitch,
4298  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4299  const void *ptr, pi_uint32 num_events_in_wait_list,
4300  const pi_event *event_wait_list, pi_event *event) {
4301 
4302  assert(buffer != nullptr);
4303  assert(command_queue != nullptr);
4304 
4305  pi_result retErr = PI_SUCCESS;
4306  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
4307  std::unique_ptr<_pi_event> retImplEv{nullptr};
4308 
4309  try {
4310  ScopedContext active(command_queue->get_context());
4311  CUstream cuStream = command_queue->get_next_transfer_stream();
4312  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4313  event_wait_list);
4314 
4315  if (event) {
4316  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4317  PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream));
4318  retImplEv->start();
4319  }
4320 
4321  retErr = commonEnqueueMemBufferCopyRect(
4322  cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch,
4323  host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4324  buffer_row_pitch, buffer_slice_pitch);
4325 
4326  if (event) {
4327  retErr = retImplEv->record();
4328  }
4329 
4330  if (blocking_write) {
4331  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4332  }
4333 
4334  if (event) {
4335  *event = retImplEv.release();
4336  }
4337 
4338  } catch (pi_result err) {
4339  retErr = err;
4340  }
4341  return retErr;
4342 }
4343 
4344 pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
4345  pi_mem dst_buffer, size_t src_offset,
4346  size_t dst_offset, size_t size,
4347  pi_uint32 num_events_in_wait_list,
4348  const pi_event *event_wait_list,
4349  pi_event *event) {
4350  if (!command_queue) {
4351  return PI_ERROR_INVALID_QUEUE;
4352  }
4353 
4354  std::unique_ptr<_pi_event> retImplEv{nullptr};
4355 
4356  try {
4357  ScopedContext active(command_queue->get_context());
4358  pi_result result;
4359 
4360  auto stream = command_queue->get_next_transfer_stream();
4361  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4362  event_wait_list);
4363 
4364  if (event) {
4365  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4366  PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream));
4367  result = retImplEv->start();
4368  }
4369 
4370  auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
4371  auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;
4372 
4373  result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
4374 
4375  if (event) {
4376  result = retImplEv->record();
4377  *event = retImplEv.release();
4378  }
4379 
4380  return result;
4381  } catch (pi_result err) {
4382  return err;
4383  } catch (...) {
4384  return PI_ERROR_UNKNOWN;
4385  }
4386 }
4387 
4388 pi_result cuda_piEnqueueMemBufferCopyRect(
4389  pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer,
4390  pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
4391  pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
4392  size_t dst_row_pitch, size_t dst_slice_pitch,
4393  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
4394  pi_event *event) {
4395 
4396  assert(src_buffer != nullptr);
4397  assert(dst_buffer != nullptr);
4398  assert(command_queue != nullptr);
4399 
4400  pi_result retErr = PI_SUCCESS;
4401  CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get();
4402  CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get();
4403  std::unique_ptr<_pi_event> retImplEv{nullptr};
4404 
4405  try {
4406  ScopedContext active(command_queue->get_context());
4407  CUstream cuStream = command_queue->get_next_transfer_stream();
4408  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4409  event_wait_list);
4410 
4411  if (event) {
4412  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4413  PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream));
4414  retImplEv->start();
4415  }
4416 
4417  retErr = commonEnqueueMemBufferCopyRect(
4418  cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
4419  src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
4420  dst_origin, dst_row_pitch, dst_slice_pitch);
4421 
4422  if (event) {
4423  retImplEv->record();
4424  *event = retImplEv.release();
4425  }
4426 
4427  } catch (pi_result err) {
4428  retErr = err;
4429  }
4430  return retErr;
4431 }
4432 
4433 pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
4434  const void *pattern, size_t pattern_size,
4435  size_t offset, size_t size,
4436  pi_uint32 num_events_in_wait_list,
4437  const pi_event *event_wait_list,
4438  pi_event *event) {
4439  assert(command_queue != nullptr);
4440 
4441  auto args_are_multiples_of_pattern_size =
4442  (offset % pattern_size == 0) && (size % pattern_size == 0);
4443 
4444  auto pattern_is_valid = (pattern != nullptr);
4445 
4446  auto pattern_size_is_valid =
4447  ((pattern_size & (pattern_size - 1)) == 0) && // is power of two
4448  (pattern_size > 0) && (pattern_size <= 128); // falls within valid range
4449 
4450  assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4451  pattern_size_is_valid);
4452  (void)args_are_multiples_of_pattern_size;
4453  (void)pattern_is_valid;
4454  (void)pattern_size_is_valid;
4455 
4456  std::unique_ptr<_pi_event> retImplEv{nullptr};
4457 
4458  try {
4459  ScopedContext active(command_queue->get_context());
4460 
4461  auto stream = command_queue->get_next_transfer_stream();
4462  pi_result result;
4463  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4464  event_wait_list);
4465 
4466  if (event) {
4467  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4468  PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream));
4469  result = retImplEv->start();
4470  }
4471 
4472  auto dstDevice = buffer->mem_.buffer_mem_.get() + offset;
4473  auto N = size / pattern_size;
4474 
4475  // pattern size in bytes
4476  switch (pattern_size) {
4477  case 1: {
4478  auto value = *static_cast<const uint8_t *>(pattern);
4479  result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream));
4480  break;
4481  }
4482  case 2: {
4483  auto value = *static_cast<const uint16_t *>(pattern);
4484  result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream));
4485  break;
4486  }
4487  case 4: {
4488  auto value = *static_cast<const uint32_t *>(pattern);
4489  result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream));
4490  break;
4491  }
4492  default: {
4493  // CUDA has no memset function that can set values larger than 4
4494  // bytes. The PI API lets you pass an arbitrary "pattern" to the buffer
4495  // fill, which can be more than 4 bytes. We must break the pattern up
4496  // into 4-byte values and set the buffer using multiple strided calls.
4497  // This means that one cuMemsetD2D32Async call is made for every 4 bytes
4498  // in the pattern.
4499 
4500  auto number_of_steps = pattern_size / sizeof(uint32_t);
4501 
4502  // we walk up the pattern in 4-byte steps, and call cuMemset for each
4503  // 4-byte chunk of the pattern.
4504  for (auto step = 0u; step < number_of_steps; ++step) {
4505  // take 4 bytes of the pattern
4506  auto value = *(static_cast<const uint32_t *>(pattern) + step);
4507 
4508  // offset the pointer to the part of the buffer we want to write to
4509  auto offset_ptr = dstDevice + (step * sizeof(uint32_t));
4510 
4511  // set all of the pattern chunks
4512  result = PI_CHECK_ERROR(
4513  cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream));
4514  }
4515 
4516  break;
4517  }
4518  }
4519 
4520  if (event) {
4521  result = retImplEv->record();
4522  *event = retImplEv.release();
4523  }
4524 
4525  return result;
4526  } catch (pi_result err) {
4527  return err;
4528  } catch (...) {
4529  return PI_ERROR_UNKNOWN;
4530  }
4531 }
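// Illustration of the strided decomposition above: with pattern_size = 8 and
// N = size / 8 repetitions, the loop issues two cuMemsetD2D32Async calls,
// each writing one 32-bit word of the pattern with a pitch of pattern_size:
//
//   buffer: | w0 w1 | w0 w1 | w0 w1 | ...   (N repetitions)
//   step 0: fills every w0 slot (offset 0, pitch 8, width 1, height N)
//   step 1: fills every w1 slot (offset 4, pitch 8, width 1, height N)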
4532 
4533 static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) {
4534  switch (array_desc.Format) {
4535  case CU_AD_FORMAT_UNSIGNED_INT8:
4536  case CU_AD_FORMAT_SIGNED_INT8:
4537  return 1;
4538  case CU_AD_FORMAT_UNSIGNED_INT16:
4539  case CU_AD_FORMAT_SIGNED_INT16:
4540  case CU_AD_FORMAT_HALF:
4541  return 2;
4542  case CU_AD_FORMAT_UNSIGNED_INT32:
4543  case CU_AD_FORMAT_SIGNED_INT32:
4544  case CU_AD_FORMAT_FLOAT:
4545  return 4;
4546  default:
4547  sycl::detail::pi::die("Invalid image format.");
4548  return 0;
4549  }
4550 }
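// Example of how this feeds the image copies below: a 2D image of
// CU_AD_FORMAT_FLOAT with NumChannels = 4 (RGBA32F) carries 4 * 4 = 16 bytes
// per texel, so an X origin of 3 texels becomes byteOffsetX = 48 and a row
// of 64 texels becomes bytesToCopy = 1024.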
4551 
4557 static pi_result commonEnqueueMemImageNDCopy(
4558  CUstream cu_stream, pi_mem_type img_type, const size_t *region,
4559  const void *src_ptr, const CUmemorytype_enum src_type,
4560  const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type,
4561  const size_t *dst_offset) {
4562  assert(region != nullptr);
4563 
4564  assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
4565  assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
4566 
4567  if (img_type == PI_MEM_TYPE_IMAGE2D) {
4568  CUDA_MEMCPY2D cpyDesc;
4569  memset(&cpyDesc, 0, sizeof(cpyDesc));
4570  cpyDesc.srcMemoryType = src_type;
4571  if (src_type == CU_MEMORYTYPE_ARRAY) {
4572  cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
4573  cpyDesc.srcXInBytes = src_offset[0];
4574  cpyDesc.srcY = src_offset[1];
4575  } else {
4576  cpyDesc.srcHost = src_ptr;
4577  }
4578  cpyDesc.dstMemoryType = dst_type;
4579  if (dst_type == CU_MEMORYTYPE_ARRAY) {
4580  cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
4581  cpyDesc.dstXInBytes = dst_offset[0];
4582  cpyDesc.dstY = dst_offset[1];
4583  } else {
4584  cpyDesc.dstHost = dst_ptr;
4585  }
4586  cpyDesc.WidthInBytes = region[0];
4587  cpyDesc.Height = region[1];
4588  return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
4589  }
4590  if (img_type == PI_MEM_TYPE_IMAGE3D) {
4591  CUDA_MEMCPY3D cpyDesc;
4592  memset(&cpyDesc, 0, sizeof(cpyDesc));
4593  cpyDesc.srcMemoryType = src_type;
4594  if (src_type == CU_MEMORYTYPE_ARRAY) {
4595  cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
4596  cpyDesc.srcXInBytes = src_offset[0];
4597  cpyDesc.srcY = src_offset[1];
4598  cpyDesc.srcZ = src_offset[2];
4599  } else {
4600  cpyDesc.srcHost = src_ptr;
4601  }
4602  cpyDesc.dstMemoryType = dst_type;
4603  if (dst_type == CU_MEMORYTYPE_ARRAY) {
4604  cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
4605  cpyDesc.dstXInBytes = dst_offset[0];
4606  cpyDesc.dstY = dst_offset[1];
4607  cpyDesc.dstZ = dst_offset[2];
4608  } else {
4609  cpyDesc.dstHost = dst_ptr;
4610  }
4611  cpyDesc.WidthInBytes = region[0];
4612  cpyDesc.Height = region[1];
4613  cpyDesc.Depth = region[2];
4614  return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
4615  }
4616  return PI_ERROR_INVALID_VALUE;
4617 }
4618 
4619 pi_result cuda_piEnqueueMemImageRead(
4620  pi_queue command_queue, pi_mem image, pi_bool blocking_read,
4621  const size_t *origin, const size_t *region, size_t row_pitch,
4622  size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list,
4623  const pi_event *event_wait_list, pi_event *event) {
4624  // Ignore unused parameters
4625  (void)row_pitch;
4626  (void)slice_pitch;
4627 
4628  assert(command_queue != nullptr);
4629  assert(image != nullptr);
4630  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4631 
4632  pi_result retErr = PI_SUCCESS;
4633 
4634  try {
4635  ScopedContext active(command_queue->get_context());
4636  CUstream cuStream = command_queue->get_next_transfer_stream();
4637  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4638  event_wait_list);
4639 
4640  CUarray array = image->mem_.surface_mem_.get_array();
4641 
4642  CUDA_ARRAY_DESCRIPTOR arrayDesc;
4643  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4644 
4645  int elementByteSize = imageElementByteSize(arrayDesc);
4646 
4647  size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4648  size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4649 
4650  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4651  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4652  retErr = PI_CHECK_ERROR(
4653  cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream));
4654  } else {
4655  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4656  size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4657 
4658  retErr = commonEnqueueMemImageNDCopy(
4659  cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY,
4660  srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr);
4661 
4662  if (retErr != PI_SUCCESS) {
4663  return retErr;
4664  }
4665  }
4666 
4667  if (event) {
4668  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ,
4669  command_queue, cuStream);
4670  new_event->record();
4671  *event = new_event;
4672  }
4673 
4674  if (blocking_read) {
4675  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4676  }
4677  } catch (pi_result err) {
4678  return err;
4679  } catch (...) {
4680  return PI_ERROR_UNKNOWN;
4681  }
4682 
4683  return retErr;
4684 }
4685 
4686 pi_result
4687 cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image,
4688  pi_bool blocking_write, const size_t *origin,
4689  const size_t *region, size_t input_row_pitch,
4690  size_t input_slice_pitch, const void *ptr,
4691  pi_uint32 num_events_in_wait_list,
4692  const pi_event *event_wait_list, pi_event *event) {
4693  // Ignore unused parameters
4694  (void)blocking_write;
4695  (void)input_row_pitch;
4696  (void)input_slice_pitch;
4697 
4698  assert(command_queue != nullptr);
4699  assert(image != nullptr);
4700  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4701 
4702  pi_result retErr = PI_SUCCESS;
4703 
4704  try {
4705  ScopedContext active(command_queue->get_context());
4706  CUstream cuStream = command_queue->get_next_transfer_stream();
4707  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4708  event_wait_list);
4709 
4710  CUarray array = image->mem_.surface_mem_.get_array();
4711 
4712  CUDA_ARRAY_DESCRIPTOR arrayDesc;
4713  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4714 
4715  int elementByteSize = imageElementByteSize(arrayDesc);
4716 
4717  size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4718  size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4719 
4720  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4721  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4722  retErr = PI_CHECK_ERROR(
4723  cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream));
4724  } else {
4725  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4726  size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4727 
4728  retErr = commonEnqueueMemImageNDCopy(
4729  cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr,
4730  &array, CU_MEMORYTYPE_ARRAY, dstOffset);
4731 
4732  if (retErr != PI_SUCCESS) {
4733  return retErr;
4734  }
4735  }
4736 
4737  if (event) {
4738  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE,
4739  command_queue, cuStream);
4740  new_event->record();
4741  *event = new_event;
4742  }
4743  } catch (pi_result err) {
4744  return err;
4745  } catch (...) {
4746  return PI_ERROR_UNKNOWN;
4747  }
4748 
4749  return retErr;
4750 }
4751 
4752 pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
4753  pi_mem dst_image, const size_t *src_origin,
4754  const size_t *dst_origin,
4755  const size_t *region,
4756  pi_uint32 num_events_in_wait_list,
4757  const pi_event *event_wait_list,
4758  pi_event *event) {
4759  assert(src_image->mem_type_ == _pi_mem::mem_type::surface);
4760  assert(dst_image->mem_type_ == _pi_mem::mem_type::surface);
4761  assert(src_image->mem_.surface_mem_.get_image_type() ==
4762  dst_image->mem_.surface_mem_.get_image_type());
4763 
4764  pi_result retErr = PI_SUCCESS;
4765 
4766  try {
4767  ScopedContext active(command_queue->get_context());
4768  CUstream cuStream = command_queue->get_next_transfer_stream();
4769  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4770  event_wait_list);
4771 
4772  CUarray srcArray = src_image->mem_.surface_mem_.get_array();
4773  CUarray dstArray = dst_image->mem_.surface_mem_.get_array();
4774 
4775  CUDA_ARRAY_DESCRIPTOR srcArrayDesc;
4776  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray));
4777  CUDA_ARRAY_DESCRIPTOR dstArrayDesc;
4778  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray));
4779 
4780  assert(srcArrayDesc.Format == dstArrayDesc.Format);
4781  assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels);
4782 
4783  int elementByteSize = imageElementByteSize(srcArrayDesc);
4784 
4785  size_t dstByteOffsetX =
4786  dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels;
4787  size_t srcByteOffsetX =
4788  src_origin[0] * elementByteSize * dstArrayDesc.NumChannels;
4789  size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
4790 
4791  pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
4792  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4793  retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
4794  srcByteOffsetX, bytesToCopy));
4795  } else {
4796  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4797  size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4798  size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4799 
4800  retErr = commonEnqueueMemImageNDCopy(
4801  cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
4802  srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
4803 
4804  if (retErr != PI_SUCCESS) {
4805  return retErr;
4806  }
4807  }
4808 
4809  if (event) {
4810  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY,
4811  command_queue, cuStream);
4812  new_event->record();
4813  *event = new_event;
4814  }
4815  } catch (pi_result err) {
4816  return err;
4817  } catch (...) {
4818  return PI_ERROR_UNKNOWN;
4819  }
4820 
4821  return retErr;
4822 }
4823 
4826  const size_t *, const size_t *, pi_uint32,
4827  const pi_event *, pi_event *) {
4828  sycl::detail::pi::die("cuda_piEnqueueMemImageFill not implemented");
4829  return {};
4830 }
4831 
4837 pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
4838  pi_bool blocking_map,
4839  pi_map_flags map_flags, size_t offset,
4840  size_t size,
4841  pi_uint32 num_events_in_wait_list,
4842  const pi_event *event_wait_list,
4843  pi_event *event, void **ret_map) {
4844  assert(ret_map != nullptr);
4845  assert(command_queue != nullptr);
4846  assert(buffer != nullptr);
4847  assert(buffer->mem_type_ == _pi_mem::mem_type::buffer);
4848 
4849  pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4850  const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ ==
4851  _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4852 
4853  // Currently no support for overlapping regions
4854  if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) {
4855  return ret_err;
4856  }
4857 
4858  // Allocate a host-side pointer to hold the mapped data
4859  auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags);
4860  *ret_map = buffer->mem_.buffer_mem_.get_map_ptr();
4861  if (hostPtr) {
4862  ret_err = PI_SUCCESS;
4863  }
4864 
4865  if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) {
4866  // Pinned host memory is already on host so it doesn't need to be read.
4867  ret_err = cuda_piEnqueueMemBufferRead(
4868  command_queue, buffer, blocking_map, offset, size, hostPtr,
4869  num_events_in_wait_list, event_wait_list, event);
4870  } else {
4871  ScopedContext active(command_queue->get_context());
4872 
4873  if (is_pinned) {
4874  ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4875  event_wait_list, nullptr);
4876  }
4877 
4878  if (event) {
4879  try {
4880  *event = _pi_event::make_native(
4881  PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue,
4882  command_queue->get_next_transfer_stream());
4883  (*event)->start();
4884  (*event)->record();
4885  } catch (pi_result error) {
4886  ret_err = error;
4887  }
4888  }
4889  }
4890 
4891  return ret_err;
4892 }
4893 
4898 pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
4899  void *mapped_ptr,
4900  pi_uint32 num_events_in_wait_list,
4901  const pi_event *event_wait_list,
4902  pi_event *event) {
4903  pi_result ret_err = PI_SUCCESS;
4904 
4905  assert(command_queue != nullptr);
4906  assert(mapped_ptr != nullptr);
4907  assert(memobj != nullptr);
4908  assert(memobj->mem_type_ == _pi_mem::mem_type::buffer);
4909  assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr);
4910  assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr);
4911 
4912  const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ ==
4913  _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4914 
4915  if (!is_pinned &&
4916  ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) ||
4917  (memobj->mem_.buffer_mem_.get_map_flags() &
4918  PI_MAP_WRITE_INVALIDATE_REGION))) {
4919  // Pinned host memory is only on host so it doesn't need to be written to.
4920  ret_err = cuda_piEnqueueMemBufferWrite(
4921  command_queue, memobj, true,
4922  memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
4923  memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
4924  num_events_in_wait_list, event_wait_list, event);
4925  } else {
4926  ScopedContext active(command_queue->get_context());
4927 
4928  if (is_pinned) {
4929  ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4930  event_wait_list, nullptr);
4931  }
4932 
4933  if (event) {
4934  try {
4935  *event = _pi_event::make_native(
4936  PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue,
4937  command_queue->get_next_transfer_stream());
4938  (*event)->start();
4939  (*event)->record();
4940  } catch (pi_result error) {
4941  ret_err = error;
4942  }
4943  }
4944  }
4945 
4946  memobj->mem_.buffer_mem_.unmap(mapped_ptr);
4947  return ret_err;
4948 }
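// Read together with cuda_piEnqueueMemBufferMap above, a non-pinned map/unmap
// round trip is:
//
//   map   -> allocate a host staging pointer, then (for PI_MAP_READ/WRITE)
//            cuda_piEnqueueMemBufferRead pulls the region device -> host
//   unmap -> (for write-style map flags) cuda_piEnqueueMemBufferWrite pushes
//            the region host -> device, then the mapping is released
//
// Pinned host allocations skip both copies, since the mapped pointer already
// aliases the memory CUDA reads and writes.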
4949 
4950 /// USM: Implements USM Host allocations using CUDA Pinned Memory
4952 pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
4953  pi_usm_mem_properties *properties, size_t size,
4954  pi_uint32 alignment) {
4955  assert(result_ptr != nullptr);
4956  assert(context != nullptr);
4957  assert(properties == nullptr || *properties == 0);
4958  pi_result result = PI_SUCCESS;
4959  try {
4960  ScopedContext active(context);
4961  result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size));
4962  } catch (pi_result error) {
4963  result = error;
4964  }
4965 
4966  assert(alignment == 0 ||
4967  (result == PI_SUCCESS &&
4968  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4969  return result;
4970 }
4971 
4972 /// USM: Implements USM device allocations using a normal CUDA device pointer
4974 pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
4975  pi_device device,
4976  pi_usm_mem_properties *properties,
4977  size_t size, pi_uint32 alignment) {
4978  assert(result_ptr != nullptr);
4979  assert(context != nullptr);
4980  assert(device != nullptr);
4981  assert(properties == nullptr || *properties == 0);
4982  pi_result result = PI_SUCCESS;
4983  try {
4984  ScopedContext active(context);
4985  result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size));
4986  } catch (pi_result error) {
4987  result = error;
4988  }
4989 
4990  assert(alignment == 0 ||
4991  (result == PI_SUCCESS &&
4992  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4993  return result;
4994 }
4995 
4996 /// USM: Implements USM Shared allocations using CUDA Managed Memory
4998 pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
4999  pi_device device,
5000  pi_usm_mem_properties *properties,
5001  size_t size, pi_uint32 alignment) {
5002  assert(result_ptr != nullptr);
5003  assert(context != nullptr);
5004  assert(device != nullptr);
5005  assert(properties == nullptr || *properties == 0);
5006  pi_result result = PI_SUCCESS;
5007  try {
5008  ScopedContext active(context);
5009  result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size,
5010  CU_MEM_ATTACH_GLOBAL));
5011  } catch (pi_result error) {
5012  result = error;
5013  }
5014 
5015  assert(alignment == 0 ||
5016  (result == PI_SUCCESS &&
5017  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
5018  return result;
5019 }
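// For context, a SYCL-side sketch (illustrative, not taken from this file):
// these three entry points back sycl::malloc_host, sycl::malloc_device and
// sycl::malloc_shared respectively, e.g.
//
//   sycl::queue q{sycl::gpu_selector_v};
//   void *p = sycl::malloc_shared(1024, q.get_device(), q.get_context());
//   sycl::free(p, q.get_context());
//
// where the malloc_shared call lands in cuda_piextUSMSharedAlloc and hence
// in cuMemAllocManaged(..., CU_MEM_ATTACH_GLOBAL).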
5020 
5021 /// USM: Frees the given USM pointer associated with the context.
5023 pi_result cuda_piextUSMFree(pi_context context, void *ptr) {
5024  assert(context != nullptr);
5025  pi_result result = PI_SUCCESS;
5026  try {
5027  ScopedContext active(context);
5028  bool is_managed;
5029  unsigned int type;
5030  void *attribute_values[2] = {&is_managed, &type};
5031  CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
5032  CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
5033  result = PI_CHECK_ERROR(cuPointerGetAttributes(
5034  2, attributes, attribute_values, (CUdeviceptr)ptr));
5035  assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST);
5036  if (is_managed || type == CU_MEMORYTYPE_DEVICE) {
5037  // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
5038  // with cuMemFree
5039  result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr));
5040  } else {
5041  // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
5042  result = PI_CHECK_ERROR(cuMemFreeHost(ptr));
5043  }
5044  } catch (pi_result error) {
5045  result = error;
5046  }
5047  return result;
5048 }
5049 
5050 pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
5051  size_t count,
5052  pi_uint32 num_events_in_waitlist,
5053  const pi_event *events_waitlist,
5054  pi_event *event) {
5055  assert(queue != nullptr);
5056  assert(ptr != nullptr);
5057  pi_result result = PI_SUCCESS;
5058  std::unique_ptr<_pi_event> event_ptr{nullptr};
5059 
5060  try {
5061  ScopedContext active(queue->get_context());
5062  pi_uint32 stream_token;
5063  _pi_stream_guard guard;
5064  CUstream cuStream = queue->get_next_compute_stream(
5065  num_events_in_waitlist, events_waitlist, guard, &stream_token);
5066  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5067  events_waitlist);
5068  if (event) {
5069  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5070  PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream, stream_token));
5071  event_ptr->start();
5072  }
5073  result = PI_CHECK_ERROR(cuMemsetD8Async(
5074  (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream));
5075  if (event) {
5076  result = event_ptr->record();
5077  *event = event_ptr.release();
5078  }
5079  } catch (pi_result err) {
5080  result = err;
5081  }
5082  return result;
5083 }
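// SYCL-side sketch (illustrative): sycl::queue::memset lands here, e.g.
//
//   q.memset(ptr, 0, bytes);   // -> cuda_piextUSMEnqueueMemset
//                              // -> cuMemsetD8Async(ptr, 0, bytes, stream)
//
// Only the low byte of the pi_int32 value is used, matching the semantics
// of the C memset.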
5084 
5085 pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking,
5086  void *dst_ptr, const void *src_ptr,
5087  size_t size,
5088  pi_uint32 num_events_in_waitlist,
5089  const pi_event *events_waitlist,
5090  pi_event *event) {
5091  assert(queue != nullptr);
5092  assert(dst_ptr != nullptr);
5093  assert(src_ptr != nullptr);
5094  pi_result result = PI_SUCCESS;
5095 
5096  std::unique_ptr<_pi_event> event_ptr{nullptr};
5097 
5098  try {
5099  ScopedContext active(queue->get_context());
5100  CUstream cuStream = queue->get_next_transfer_stream();
5101  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5102  events_waitlist);
5103  if (event) {
5104  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5105  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
5106  event_ptr->start();
5107  }
5108  result = PI_CHECK_ERROR(cuMemcpyAsync(
5109  (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream));
5110  if (event) {
5111  result = event_ptr->record();
5112  }
5113  if (blocking) {
5114  result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
5115  }
5116  if (event) {
5117  *event = event_ptr.release();
5118  }
5119  } catch (pi_result err) {
5120  result = err;
5121  }
5122  return result;
5123 }
5124 
5125 pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
5126  size_t size,
5127  pi_usm_migration_flags flags,
5128  pi_uint32 num_events_in_waitlist,
5129  const pi_event *events_waitlist,
5130  pi_event *event) {
5131  pi_device device = queue->get_context()->get_device();
5132 
5133  // Certain CUDA devices and Windows do not have support for some Unified
5134  // Memory features. cuMemPrefetchAsync requires concurrent memory access
5135  // for managed memory. Therefore, ignore the prefetch hint if concurrent
5136  // managed memory access is not available.
5137  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5138  setErrorMessage("Prefetch hint ignored as device does not support "
5139  "concurrent managed access",
5140  PI_SUCCESS);
5141  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5142  }
5143 
5144  unsigned int is_managed;
5145  PI_CHECK_ERROR(cuPointerGetAttribute(
5146  &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr));
5147  if (!is_managed) {
5148  setErrorMessage("Prefetch hint ignored as prefetch only works with USM",
5149  PI_SUCCESS);
5150  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5151  }
5152 
5153  // flags is currently unused so fail if set
5154  if (flags != 0)
5155  return PI_ERROR_INVALID_VALUE;
5156  assert(queue != nullptr);
5157  assert(ptr != nullptr);
5158  pi_result result = PI_SUCCESS;
5159  std::unique_ptr<_pi_event> event_ptr{nullptr};
5160 
5161  try {
5162  ScopedContext active(queue->get_context());
5163  CUstream cuStream = queue->get_next_transfer_stream();
5164  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5165  events_waitlist);
5166  if (event) {
5167  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5168  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
5169  event_ptr->start();
5170  }
5171  result = PI_CHECK_ERROR(
5172  cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream));
5173  if (event) {
5174  result = event_ptr->record();
5175  *event = event_ptr.release();
5176  }
5177  } catch (pi_result err) {
5178  result = err;
5179  }
5180  return result;
5181 }
5182 
5184 pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
5185  size_t length, pi_mem_advice advice,
5186  pi_event *event) {
5187  assert(queue != nullptr);
5188  assert(ptr != nullptr);
5189 
5190  // Certain CUDA devices and Windows do not have support for some Unified
5191  // Memory features. Passing CU_MEM_ADVISE_[UN]SET_PREFERRED_LOCATION and
5192  // CU_MEM_ADVISE_[UN]SET_ACCESSED_BY to cuMemAdvise on a GPU device requires
5193  // the GPU device to report a non-zero value for
5194  // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore the
5195  // memory advice if concurrent managed memory access is not available.
5196  if (advice == PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION ||
5197  advice == PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION ||
5198  advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY ||
5199  advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY ||
5200  advice == PI_MEM_ADVICE_RESET) {
5201  pi_device device = queue->get_context()->get_device();
5202  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5203  setErrorMessage("Mem advise ignored as device does not support "
5204  "concurrent managed access",
5205  PI_SUCCESS);
5206  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5207  }
5208 
5209  // TODO: If ptr points to valid system-allocated pageable memory we should
5210  // check that the device also has the
5211  // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
5212  }
5213 
5214  unsigned int is_managed;
5215  PI_CHECK_ERROR(cuPointerGetAttribute(
5216  &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr));
5217  if (!is_managed) {
5218  setErrorMessage(
5219  "Memory advice ignored as memory advices only works with USM",
5220  PI_SUCCESS);
5221  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5222  }
5223 
5224  pi_result result = PI_SUCCESS;
5225  std::unique_ptr<_pi_event> event_ptr{nullptr};
5226 
5227  try {
5228  ScopedContext active(queue->get_context());
5229 
5230  if (event) {
5231  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5232  PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream()));
5233  event_ptr->start();
5234  }
5235 
5236  switch (advice) {
5237  case PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY:
5238  case PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY:
5239  case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION:
5240  case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION:
5241  case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY:
5242  case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY:
5243  result = PI_CHECK_ERROR(cuMemAdvise(
5244  (CUdeviceptr)ptr, length,
5245  (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1),
5246  queue->get_context()->get_device()->get()));
5247  break;
5248  case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST:
5249  case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST:
5250  case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST:
5251  case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST:
5252  result = PI_CHECK_ERROR(cuMemAdvise(
5253  (CUdeviceptr)ptr, length,
5254  (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1 -
5255  (PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST -
5256  PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION)),
5257  CU_DEVICE_CPU));
5258  break;
5259  case PI_MEM_ADVICE_RESET:
5260  PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length,
5261  CU_MEM_ADVISE_UNSET_READ_MOSTLY,
5262  queue->get_context()->get_device()->get()));
5263  PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length,
5264  CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
5265  queue->get_context()->get_device()->get()));
5266  PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length,
5267  CU_MEM_ADVISE_UNSET_ACCESSED_BY,
5268  queue->get_context()->get_device()->get()));
5269  break;
5270  default:
5271  sycl::detail::pi::die("Unknown advice");
5272  }
5273  if (event) {
5274  result = event_ptr->record();
5275  *event = event_ptr.release();
5276  }
5277  } catch (pi_result err) {
5278  result = err;
5279  } catch (...) {
5280  result = PI_ERROR_UNKNOWN;
5281  }
5282  return result;
5283 }
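// Note on the enum arithmetic above: the PI_MEM_ADVICE_CUDA_* values mirror
// the CUmem_advise values in declaration order, so
// (advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1) maps, for example,
//
//   PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY   -> CU_MEM_ADVISE_SET_READ_MOSTLY (1)
//   PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY -> CU_MEM_ADVISE_UNSET_READ_MOSTLY (2)
//
// The *_HOST variants additionally subtract the host/device offset and are
// applied with CU_DEVICE_CPU as the target device.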
5284 
5301 pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
5302  pi_mem_alloc_info param_name,
5303  size_t param_value_size,
5304  void *param_value,
5305  size_t *param_value_size_ret) {
5306  assert(context != nullptr);
5307  assert(ptr != nullptr);
5308  pi_result result = PI_SUCCESS;
5309 
5310  try {
5311  ScopedContext active(context);
5312  switch (param_name) {
5313  case PI_MEM_ALLOC_TYPE: {
5314  unsigned int value;
5315  // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE
5316  CUresult ret = cuPointerGetAttribute(
5317  &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
5318  if (ret == CUDA_ERROR_INVALID_VALUE) {
5319  // pointer not known to the CUDA subsystem
5320  return getInfo(param_value_size, param_value, param_value_size_ret,
5321  PI_MEM_TYPE_UNKNOWN);
5322  }
5323  result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5324  if (value) {
5325  // pointer to managed memory
5326  return getInfo(param_value_size, param_value, param_value_size_ret,
5327  PI_MEM_TYPE_SHARED);
5328  }
5329  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5330  &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr));
5331  assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST);
5332  if (value == CU_MEMORYTYPE_DEVICE) {
5333  // pointer to device memory
5334  return getInfo(param_value_size, param_value, param_value_size_ret,
5335  PI_MEM_TYPE_DEVICE);
5336  }
5337  if (value == CU_MEMORYTYPE_HOST) {
5338  // pointer to host memory
5339  return getInfo(param_value_size, param_value, param_value_size_ret,
5340  PI_MEM_TYPE_HOST);
5341  }
5342  // should never get here
5343 #ifdef _MSC_VER
5344  __assume(0);
5345 #else
5346  __builtin_unreachable();
5347 #endif
5348  return getInfo(param_value_size, param_value, param_value_size_ret,
5349  PI_MEM_TYPE_UNKNOWN);
5350  }
5351  case PI_MEM_ALLOC_BASE_PTR: {
5352 #if __CUDA_API_VERSION >= 10020
5353  // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2
5354  void *value; // the range start address is pointer-sized
5355  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5356  &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr));
5357  return getInfo(param_value_size, param_value, param_value_size_ret,
5358  value);
5359 #else
5360  return PI_ERROR_INVALID_VALUE;
5361 #endif
5362  }
5363  case PI_MEM_ALLOC_SIZE: {
5364 #if __CUDA_API_VERSION >= 10020
5365  // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2
5366  size_t value; // the range size is reported as a size_t
5367  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5368  &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr));
5369  return getInfo(param_value_size, param_value, param_value_size_ret,
5370  value);
5371 #else
5372  return PI_ERROR_INVALID_VALUE;
5373 #endif
5374  }
5375  case PI_MEM_ALLOC_DEVICE: {
5376  // get device index associated with this pointer
5377  unsigned int device_idx;
5378  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5379  &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr));
5380 
5381  // currently each device is in its own platform, so find the platform at
5382  // the same index
5383  std::vector<pi_platform> platforms;
5384  platforms.resize(device_idx + 1);
5385  result = cuda_piPlatformsGet(device_idx + 1, platforms.data(), nullptr);
5386 
5387  // get the device from the platform
5388  pi_device device = platforms[device_idx]->devices_[0].get();
5389  return getInfo(param_value_size, param_value, param_value_size_ret,
5390  device);
5391  }
5392  }
5393  } catch (pi_result error) {
5394  result = error;
5395  }
5396  return result;
5397 }
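// Summary of the PI_MEM_ALLOC_TYPE classification above:
//
//   cuPointerGetAttribute outcome            reported type
//   CUDA_ERROR_INVALID_VALUE                 PI_MEM_TYPE_UNKNOWN (foreign ptr)
//   CU_POINTER_ATTRIBUTE_IS_MANAGED nonzero  PI_MEM_TYPE_SHARED
//   memory type CU_MEMORYTYPE_DEVICE         PI_MEM_TYPE_DEVICE
//   memory type CU_MEMORYTYPE_HOST           PI_MEM_TYPE_HOST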
5398 
5399 // This API is called by the SYCL RT to notify the end of the plugin lifetime.
5400 // TODO: add global variable lifetime management code here (see
5401 // pi_level_zero.cpp for reference). Currently this is just a NOOP.
5402 pi_result cuda_piTearDown(void *) {
5403  disableCUDATracing();
5404  return PI_SUCCESS;
5405 }
5406 
5407 const char SupportedVersion[] = _PI_H_VERSION_STRING;
5408 
5409 pi_result piPluginInit(pi_plugin *PluginInit) {
5410  // Check that the major version matches in PiVersion and SupportedVersion
5411  _PI_PLUGIN_VERSION_CHECK(PluginInit->PiVersion, SupportedVersion);
5412 
5413  // PI interface supports higher version or the same version.
5414  size_t PluginVersionSize = sizeof(PluginInit->PluginVersion);
5415  if (strlen(SupportedVersion) >= PluginVersionSize)
5416  return PI_ERROR_INVALID_VALUE;
5417  strncpy(PluginInit->PluginVersion, SupportedVersion, PluginVersionSize);
5418 
5419  // Set whole function table to zero to make it easier to detect if
5420  // functions are not set up below.
5421  std::memset(&(PluginInit->PiFunctionTable), 0,
5422  sizeof(PluginInit->PiFunctionTable));
5423 
5425 
5426 // Forward calls to CUDA RT.
5427 #define _PI_CL(pi_api, cuda_api) \
5428  (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api);
5429 
5430  // Platform
5433  // Device
5444  // Context
5453  // Queue
5463  // Memory
5473  // Program
5487  // Kernel
5501  // Event
5513  // Sampler
5518  // Queue commands
5536  // USM
5546 
5549  _PI_CL(piPluginGetLastError, cuda_piPluginGetLastError)
5550  _PI_CL(piTearDown, cuda_piTearDown)
5551 
5552 #undef _PI_CL
5553 
5554  return PI_SUCCESS;
5555 }
5556 
5557 } // extern "C"
5558 
RAII object that calls the reference count release function on the held PI object on destruction.
Definition: pi_cuda.cpp:762
ReleaseGuard(ReleaseGuard &&Other) noexcept
Definition: pi_cuda.cpp:798
void dismiss()
End the guard and do not release the reference count of the held PI object.
Definition: pi_cuda.cpp:830
~ReleaseGuard()
Calls the related PI object release function if the object held is not nullptr or if dismiss has not ...
Definition: pi_cuda.cpp:806
ReleaseGuard(const ReleaseGuard &)=delete
ReleaseGuard()=delete
ReleaseGuard(T Obj)
Obj can be nullptr.
Definition: pi_cuda.cpp:797
ReleaseGuard & operator=(const ReleaseGuard &)=delete
ReleaseGuard & operator=(ReleaseGuard &&Other)
Definition: pi_cuda.cpp:822
#define __SYCL_PI_CUDA_SYNC_WITH_DEFAULT
#define __SYCL_PI_CUDA_USE_DEFAULT_STREAM
#define __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY
unsigned int CUdeviceptr
struct CUevent_st * CUevent
struct CUstream_st * CUstream
int CUdevice
struct CUctx_st * CUcontext
#define __SYCL_INLINE_VER_NAMESPACE(X)
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value, sycl::ext::intel::esimd::simd< T, SZ > > line(sycl::ext::intel::esimd::simd< T, 4 > src0, sycl::ext::intel::esimd::simd< T, SZ > src1, Sat sat={})
Linear equation.
Definition: math.hpp:921
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:199
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
void cuPrint(const char *Message)
Definition: pi_cuda.cpp:353
void die(const char *Message)
Definition: pi.cpp:536
void assertion(bool Condition, const char *Message=nullptr)
Definition: pi.cpp:541
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:36
void memcpy(void *Dst, const void *Src, std::size_t Size)
detail::enable_if_t< detail::is_genfloat< T >::value, T > step(T edge, T x) __NOEXC
Definition: builtins.hpp:589
float length(T p) __NOEXC
Definition: builtins.hpp:1032
multi_ptr< ElementType, access::address_space::ext_intel_global_host_space, IsDecorated > host_ptr
Definition: pointers.hpp:40
void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind _CODELOCPARAM(&CodeLoc))
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
pi_result piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM Memset API.
#define PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT
Extension to denote native support of assert feature by an arbitrary device piDeviceGetInfo call shou...
Definition: pi.h:740
pi_result piKernelCreate(pi_program program, const char *kernel_name, pi_kernel *ret_kernel)
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_GROUP
Definition: pi.h:527
int32_t pi_int32
Definition: pi.h:102
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS
Definition: pi.h:512
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_ITEM
Definition: pi.h:525
pi_result piQueueFinish(pi_queue command_queue)
uintptr_t pi_native_handle
Definition: pi.h:107
pi_result piProgramGetBuildInfo(pi_program program, pi_device device, _pi_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_bitfield pi_map_flags
Definition: pi.h:552
pi_result piextDeviceGetNativeHandle(pi_device device, pi_native_handle *nativeHandle)
Gets the native handle of a PI device object.
pi_result piEnqueueMemBufferCopyRect(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
_pi_result
Definition: pi.h:114
pi_result piMemImageGetInfo(pi_mem image, pi_image_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piKernelRelease(pi_kernel kernel)
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE
Definition: pi.h:514
pi_uint32 pi_bool
Definition: pi.h:105
pi_result piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, pi_device device, bool pluginOwnsNativeHandle, pi_queue *queue)
Creates PI queue object from a native handle.
pi_result piextUSMFree(pi_context context, void *ptr)
Indicates that the allocated USM memory is no longer needed on the runtime side.
pi_bitfield pi_usm_mem_properties
Definition: pi.h:564
_pi_device_info
Definition: pi.h:183
@ PI_DEVICE_INFO_PRINTF_BUFFER_SIZE
Definition: pi.h:253
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR
Definition: pi.h:194
@ PI_DEVICE_INFO_MAX_MEM_BANDWIDTH
Definition: pi.h:278
@ PI_DEVICE_INFO_GPU_SLICES
Definition: pi.h:275
@ PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC
Definition: pi.h:254
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT
Definition: pi.h:205
@ PI_DEVICE_INFO_BUILD_ON_SUBDEVICE
Definition: pi.h:281
@ PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE
Definition: pi.h:189
@ PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH
Definition: pi.h:216
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT
Definition: pi.h:195
@ PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT
Definition: pi.h:217
@ PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY
Definition: pi.h:208
@ PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT
Definition: pi.h:265
@ PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS
Definition: pi.h:187
@ PI_DEVICE_INFO_EXECUTION_CAPABILITIES
Definition: pi.h:239
@ PI_DEVICE_INFO_GPU_EU_COUNT
Definition: pi.h:273
@ PI_DEVICE_INFO_COMPILER_AVAILABLE
Definition: pi.h:237
@ PI_DEVICE_INFO_IMAGE_SUPPORT
Definition: pi.h:211
@ PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
Definition: pi.h:232
@ PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY
Definition: pi.h:282
@ PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
Definition: pi.h:291
@ PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES
Definition: pi.h:240
@ PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES
Definition: pi.h:257
@ PI_DEVICE_INFO_BUILT_IN_KERNELS
Definition: pi.h:242
@ PI_DEVICE_INFO_UUID
Definition: pi.h:269
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT
Definition: pi.h:198
@ PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS
Definition: pi.h:260
@ PI_DEVICE_INFO_AVAILABLE
Definition: pi.h:236
@ PI_DEVICE_INFO_HALF_FP_CONFIG
Definition: pi.h:191
@ PI_DEVICE_INFO_PARENT_DEVICE
Definition: pi.h:255
@ PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL
Definition: pi.h:262
@ PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES
Definition: pi.h:290
@ PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH
Definition: pi.h:274
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT
Definition: pi.h:202
@ PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES
Definition: pi.h:241
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE
Definition: pi.h:206
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE
Definition: pi.h:224
@ PI_DEVICE_INFO_DRIVER_VERSION
Definition: pi.h:248
@ PI_DEVICE_INFO_MAX_PARAMETER_SIZE
Definition: pi.h:222
@ PI_DEVICE_INFO_PARTITION_PROPERTIES
Definition: pi.h:256
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT
Definition: pi.h:196
@ PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
Definition: pi.h:266
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF
Definition: pi.h:200
@ PI_DEVICE_INFO_VERSION
Definition: pi.h:250
@ PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH
Definition: pi.h:218
@ PI_DEVICE_INFO_MAX_COMPUTE_UNITS
Definition: pi.h:186
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR
Definition: pi.h:201
@ PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT
Definition: pi.h:215
@ PI_DEVICE_INFO_VENDOR_ID
Definition: pi.h:185
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE
Definition: pi.h:225
@ PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU
Definition: pi.h:292
@ PI_DEVICE_INFO_BACKEND_VERSION
Definition: pi.h:293
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG
Definition: pi.h:197
@ PI_DEVICE_INFO_PLATFORM
Definition: pi.h:243
@ PI_DEVICE_INFO_LOCAL_MEM_SIZE
Definition: pi.h:231
@ PI_DEVICE_INFO_REFERENCE_COUNT
Definition: pi.h:244
@ PI_DEVICE_INFO_MAX_SAMPLERS
Definition: pi.h:221
@ PI_DEVICE_INFO_DEVICE_ID
Definition: pi.h:271
@ PI_DEVICE_INFO_PROFILE
Definition: pi.h:249
@ PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
Definition: pi.h:188
@ PI_DEVICE_INFO_EXTENSIONS
Definition: pi.h:252
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT
Definition: pi.h:203
@ PI_DEVICE_INFO_HOST_UNIFIED_MEMORY
Definition: pi.h:233
@ PI_DEVICE_INFO_PCI_ADDRESS
Definition: pi.h:272
@ PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE
Definition: pi.h:210
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE
Definition: pi.h:226
@ PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE
Definition: pi.h:220
@ PI_DEVICE_INFO_LINKER_AVAILABLE
Definition: pi.h:238
@ PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D
Definition: pi.h:299
@ PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER
Definition: pi.h:300
@ PI_DEVICE_INFO_ADDRESS_BITS
Definition: pi.h:209
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE
Definition: pi.h:199
@ PI_DEVICE_INFO_ATOMIC_64
Definition: pi.h:289
@ PI_DEVICE_INFO_USM_HOST_SUPPORT
Definition: pi.h:263
@ PI_DEVICE_INFO_USM_DEVICE_SUPPORT
Definition: pi.h:264
@ PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
Definition: pi.h:261
@ PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
Definition: pi.h:277
@ PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS
Definition: pi.h:213
@ PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION
Definition: pi.h:234
@ PI_DEVICE_INFO_VENDOR
Definition: pi.h:247
@ PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN
Definition: pi.h:223
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG
Definition: pi.h:204
@ PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS
Definition: pi.h:212
@ PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN
Definition: pi.h:258
@ PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT
Definition: pi.h:267
@ PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE
Definition: pi.h:219
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE
Definition: pi.h:285
@ PI_DEVICE_INFO_DOUBLE_FP_CONFIG
Definition: pi.h:192
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH
Definition: pi.h:288
@ PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE
Definition: pi.h:228
@ PI_DEVICE_INFO_PARTITION_TYPE
Definition: pi.h:259
@ PI_DEVICE_INFO_TYPE
Definition: pi.h:184
@ PI_DEVICE_INFO_SINGLE_FP_CONFIG
Definition: pi.h:190
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF
Definition: pi.h:207
@ PI_DEVICE_INFO_GLOBAL_MEM_SIZE
Definition: pi.h:227
@ PI_DEVICE_INFO_NAME
Definition: pi.h:246
@ PI_DEVICE_INFO_LOCAL_MEM_TYPE
Definition: pi.h:230
@ PI_DEVICE_INFO_MAX_CONSTANT_ARGS
Definition: pi.h:229
@ PI_DEVICE_INFO_OPENCL_C_VERSION
Definition: pi.h:251
@ PI_DEVICE_INFO_ENDIAN_LITTLE
Definition: pi.h:235
@ PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH
Definition: pi.h:214
@ PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE
Definition: pi.h:276
@ PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS
Definition: pi.h:295
pi_uint64 pi_bitfield
Definition: pi.h:106
static constexpr pi_device_fp_config PI_FP_DENORM
Definition: pi.h:642
pi_result piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event)
USM Memadvise API.
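A minimal usage sketch for the memadvise entry point, pairing it with one of the _pi_mem_advice values listed further below. The helper name is hypothetical, the queue and USM range are assumed to have been created elsewhere, and this and the later sketches assume only the <sycl/detail/pi.h> header:

  #include <sycl/detail/pi.h>

  // Hypothetical helper: mark a USM range as read-mostly so the CUDA
  // backend may duplicate read-only pages (this advice value maps onto
  // cuMemAdvise on the CUDA plugin).
  pi_result advise_read_mostly(pi_queue queue, const void *ptr, size_t len) {
    pi_event ev = nullptr;
    return piextUSMEnqueueMemAdvise(queue, ptr, len,
                                    PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY, &ev);
  }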
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_ALLOC
Definition: pi.h:549
@ PI_PROGRAM_BUILD_STATUS_SUCCESS
Definition: pi.h:147
@ PI_PROGRAM_BUILD_STATUS_ERROR
Definition: pi.h:146
_pi_queue_info
Definition: pi.h:326
@ PI_QUEUE_INFO_PROPERTIES
Definition: pi.h:330
@ PI_QUEUE_INFO_DEVICE
Definition: pi.h:328
@ PI_QUEUE_INFO_CONTEXT
Definition: pi.h:327
@ PI_QUEUE_INFO_REFERENCE_COUNT
Definition: pi.h:331
_pi_mem_advice
Definition: pi.h:422
@ PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST
Definition: pi.h:434
@ PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY
Definition: pi.h:426
@ PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION
Definition: pi.h:428
@ PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY
Definition: pi.h:425
@ PI_MEM_ADVICE_RESET
Definition: pi.h:424
@ PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST
Definition: pi.h:431
@ PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY
Definition: pi.h:430
@ PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST
Definition: pi.h:433
@ PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST
Definition: pi.h:432
@ PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION
Definition: pi.h:427
@ PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY
Definition: pi.h:429
pi_result piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_bool blocking_read, pi_image_offset origin, pi_image_region region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
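A hedged sketch of a blocking 2-D image read, assuming the pi_image_offset_struct / pi_image_region_struct pointer typedefs that pi.h uses for the origin and region parameters:

  // Read a w x h image into host memory; queue and image are assumed
  // to have been created elsewhere. Zero pitches request the
  // tightly-packed defaults.
  pi_result read_image(pi_queue queue, pi_mem image, size_t w, size_t h,
                       void *host_dst) {
    pi_image_offset_struct origin{0, 0, 0};
    pi_image_region_struct region{w, h, 1};
    return piEnqueueMemImageRead(queue, image, PI_TRUE, &origin, &region,
                                 /*row_pitch=*/0, /*slice_pitch=*/0,
                                 host_dst, 0, nullptr, nullptr);
  }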
@ PI_IMAGE_CHANNEL_TYPE_FLOAT
Definition: pi.h:471
@ PI_IMAGE_CHANNEL_TYPE_UNORM_INT8
Definition: pi.h:459
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16
Definition: pi.h:465
@ PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
Definition: pi.h:467
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8
Definition: pi.h:464
@ PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
Definition: pi.h:469
@ PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
Definition: pi.h:468
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32
Definition: pi.h:466
@ PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT
Definition: pi.h:470
@ PI_IMAGE_CHANNEL_TYPE_UNORM_INT16
Definition: pi.h:460
_pi_kernel_sub_group_info
Definition: pi.h:365
@ PI_KERNEL_COMPILE_NUM_SUB_GROUPS
Definition: pi.h:368
@ PI_KERNEL_MAX_NUM_SUB_GROUPS
Definition: pi.h:367
@ PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL
Definition: pi.h:369
@ PI_KERNEL_MAX_SUB_GROUP_SIZE
Definition: pi.h:366
pi_result piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool pluginOwnsNativeHandle, pi_program *program)
Creates PI program object from a native handle.
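On the CUDA backend the native handle is a CUmodule; a sketch of the interop wrap (requires <cuda.h>; the helper name is illustrative):

  // Wrap an existing CUmodule in a pi_program without transferring
  // ownership of the module to the plugin.
  pi_result wrap_module(pi_context ctx, CUmodule mod, pi_program *prog) {
    return piextProgramCreateWithNativeHandle(
        reinterpret_cast<pi_native_handle>(mod), ctx,
        /*pluginOwnsNativeHandle=*/false, prog);
  }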
pi_bitfield pi_mem_properties
Definition: pi.h:558
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
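The two entry points above combine into the usual set-args-then-launch sequence. A sketch with a single buffer argument and a 1-D range, passing the pi_mem handle by pointer in the OpenCL style (helper name and sizes are illustrative):

  pi_result launch_1d(pi_queue queue, pi_kernel kernel, pi_mem buf,
                      size_t global_size) {
    // Bind the buffer handle as argument 0.
    pi_result res = piKernelSetArg(kernel, 0, sizeof(pi_mem), &buf);
    if (res != PI_SUCCESS)
      return res;
    size_t offset = 0;
    // A null local size lets the plugin choose the work-group size.
    return piEnqueueKernelLaunch(queue, kernel, /*work_dim=*/1, &offset,
                                 &global_size, /*local_work_size=*/nullptr,
                                 0, nullptr, nullptr);
  }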
_pi_kernel_group_info
Definition: pi.h:344
@ PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
Definition: pi.h:346
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:347
@ PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE
Definition: pi.h:350
@ PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
Definition: pi.h:348
@ PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
Definition: pi.h:349
@ PI_KERNEL_GROUP_INFO_NUM_REGS
Definition: pi.h:352
pi_result piEnqueueMemBufferRead(pi_queue queue, pi_mem buffer, pi_bool blocking_read, size_t offset, size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
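A sketch of a blocking read-back; piEnqueueMemBufferWrite and piEnqueueMemBufferCopy below follow the same offset/size/event-list pattern:

  // Copy `size` bytes from a device buffer into host memory and wait
  // for completion (blocking_read = PI_TRUE, no wait list, no event).
  pi_result read_back(pi_queue queue, pi_mem buffer, void *dst, size_t size) {
    return piEnqueueMemBufferRead(queue, buffer, PI_TRUE, /*offset=*/0,
                                  size, dst, 0, nullptr, nullptr);
  }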
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_ACQUIRE
Definition: pi.h:519
pi_result piProgramRetain(pi_program program)
const pi_bool PI_TRUE
Definition: pi.h:476
pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem)
pi_result piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, size_t size, pi_usm_migration_flags flags, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Hint to migrate memory to the device.
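A sketch of prefetching a shared USM range before a kernel touches it. The zero flags value is a placeholder cast, since _pi_usm_migration_flags (listed below) defines no flag that is required here:

  pi_result prefetch_to_device(pi_queue queue, const void *ptr, size_t size) {
    pi_event ev = nullptr;
    // Cast 0 to the flags enum type; no migration flag is assumed.
    return piextUSMEnqueuePrefetch(queue, ptr, size,
                                   static_cast<pi_usm_migration_flags>(0),
                                   0, nullptr, &ev);
  }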
_pi_kernel_exec_info
Definition: pi.h:1304
pi_result piProgramRelease(pi_program program)
_pi_platform_info
Definition: pi.h:129
@ PI_PLATFORM_INFO_VENDOR
Definition: pi.h:133
@ PI_PLATFORM_INFO_EXTENSIONS
Definition: pi.h:130
@ PI_PLATFORM_INFO_PROFILE
Definition: pi.h:132
@ PI_PLATFORM_INFO_NAME
Definition: pi.h:131
@ PI_PLATFORM_INFO_VERSION
Definition: pi.h:134
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
pi_result piSamplerRetain(pi_sampler sampler)
pi_result piMemGetInfo(pi_mem mem, pi_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piextUSMHostAlloc(void **result_ptr, pi_context context, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates host memory accessible by the device.
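A sketch of a host USM allocation; the null properties list and zero (default) alignment are assumptions:

  void *alloc_host(pi_context ctx, size_t size) {
    void *ptr = nullptr;
    pi_result res = piextUSMHostAlloc(&ptr, ctx, /*properties=*/nullptr,
                                      size, /*alignment=*/0);
    return res == PI_SUCCESS ? ptr : nullptr;
  }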
pi_result piDeviceGetInfo(pi_device device, pi_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Returns requested info for provided native device. Return PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT fo...
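String-valued queries follow the usual two-call size-then-value pattern; a sketch using PI_DEVICE_INFO_NAME (requires <string>):

  std::string device_name(pi_device dev) {
    size_t len = 0;
    // First call reports the required buffer size, including the NUL.
    piDeviceGetInfo(dev, PI_DEVICE_INFO_NAME, 0, nullptr, &len);
    std::string name(len, '\0');
    piDeviceGetInfo(dev, PI_DEVICE_INFO_NAME, len, name.data(), nullptr);
    if (!name.empty() && name.back() == '\0')
      name.pop_back(); // drop the trailing NUL
    return name;
  }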
pi_result piKernelRetain(pi_kernel kernel)
pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms)
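Platform enumeration uses the same count-then-fetch convention; a sketch (requires <vector>):

  std::vector<pi_platform> get_platforms() {
    pi_uint32 count = 0;
    piPlatformsGet(0, nullptr, &count); // query the count
    std::vector<pi_platform> platforms(count);
    piPlatformsGet(count, platforms.data(), nullptr);
    return platforms;
  }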
_pi_image_info
Definition: pi.h:355
pi_result piextEventGetNativeHandle(pi_event event, pi_native_handle *nativeHandle)
Gets the native handle of a PI event object.
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
@ PI_USM_ATOMIC_ACCESS
Definition: pi.h:1617
@ PI_USM_ACCESS
Definition: pi.h:1616
@ PI_USM_CONCURRENT_ACCESS
Definition: pi.h:1618
@ PI_USM_CONCURRENT_ATOMIC_ACCESS
Definition: pi.h:1619
@ PI_IMAGE_CHANNEL_ORDER_RGBA
Definition: pi.h:444
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_ACQ_REL
Definition: pi.h:521
_pi_profiling_info
Definition: pi.h:531
@ PI_PROFILING_INFO_COMMAND_END
Definition: pi.h:535
@ PI_PROFILING_INFO_COMMAND_START
Definition: pi.h:534
@ PI_PROFILING_INFO_COMMAND_SUBMIT
Definition: pi.h:533
@ PI_PROFILING_INFO_COMMAND_QUEUED
Definition: pi.h:532
@ PI_MEM_TYPE_SHARED
Definition: pi.h:1633
@ PI_MEM_TYPE_UNKNOWN
Definition: pi.h:1630
@ PI_MEM_TYPE_DEVICE
Definition: pi.h:1632
@ PI_MEM_TYPE_HOST
Definition: pi.h:1631
constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE
Definition: pi.h:579
_pi_usm_migration_flags
Definition: pi.h:1639
pi_result piTearDown(void *PluginParameter)
API to notify that the plugin should clean up its resources.
_pi_device_type
Definition: pi.h:161
@ PI_DEVICE_TYPE_DEFAULT
The default device available in the PI plugin.
Definition: pi.h:162
@ PI_DEVICE_TYPE_GPU
A PI device that is a GPU.
Definition: pi.h:166
pi_result piContextRetain(pi_context context)
@ PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE
Definition: pi.h:175
uint64_t pi_uint64
Definition: pi.h:104
pi_bitfield pi_queue_properties
Definition: pi.h:577
pi_result piMemRetain(pi_mem mem)
pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query information from the sub-group from a kernel.
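A sketch querying the maximum sub-group size; PI_KERNEL_MAX_SUB_GROUP_SIZE takes no input value, and the result is assumed to be reported as a pi_uint32 (on the CUDA backend this is the warp size):

  pi_uint32 max_sub_group_size(pi_kernel kernel, pi_device dev) {
    pi_uint32 size = 0;
    piKernelGetSubGroupInfo(kernel, dev, PI_KERNEL_MAX_SUB_GROUP_SIZE,
                            /*input_value_size=*/0, /*input_value=*/nullptr,
                            sizeof(size), &size, nullptr);
    return size;
  }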
pi_result piextProgramSetSpecializationConstant(pi_program prog, pi_uint32 spec_id, size_t spec_size, const void *spec_value)
Sets a specialization constant to a specific value.
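A sketch of binding a 32-bit specialization constant before the program is built; the spec_id value is illustrative:

  pi_result set_spec_const(pi_program prog, pi_uint32 value) {
    return piextProgramSetSpecializationConstant(prog, /*spec_id=*/1,
                                                 sizeof(value), &value);
  }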
const pi_bool PI_FALSE
Definition: pi.h:477
pi_result piEventGetInfo(pi_event event, pi_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
#define __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64
PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device.
Definition: pi.h:734
pi_result piDevicesGet(pi_platform platform, pi_device_type device_type, pi_uint32 num_entries, pi_device *devices, pi_uint32 *num_devices)
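Device enumeration mirrors the piPlatformsGet pattern above; a sketch restricted to GPU devices (requires <vector>):

  std::vector<pi_device> get_gpus(pi_platform platform) {
    pi_uint32 count = 0;
    piDevicesGet(platform, PI_DEVICE_TYPE_GPU, 0, nullptr, &count);
    std::vector<pi_device> devices(count);
    piDevicesGet(platform, PI_DEVICE_TYPE_GPU, count, devices.data(),
                 nullptr);
    return devices;
  }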
pi_result piQueueRelease(pi_queue command_queue)
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_RELEASE
Definition: pi.h:520
pi_bitfield pi_sampler_properties
Definition: pi.h:511
pi_result piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, pi_uint32 numDevices, const pi_device *devices, bool pluginOwnsNativeHandle, pi_context *context)
Creates PI context object from a native handle.
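On the CUDA backend the native handle is a CUcontext; a sketch of adopting one without transferring ownership (requires <cuda.h>; helper name illustrative):

  pi_result wrap_context(CUcontext native, pi_device dev, pi_context *ctx) {
    // The device list must match the device the CUcontext targets.
    return piextContextCreateWithNativeHandle(
        reinterpret_cast<pi_native_handle>(native), /*numDevices=*/1, &dev,
        /*pluginOwnsNativeHandle=*/false, ctx);
  }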
pi_result piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
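A sketch deriving a command's device-side duration from the COMMAND_START/COMMAND_END counters listed above; the queue must have been created with PI_QUEUE_PROFILING_ENABLE, and the counters are nanosecond timestamps:

  pi_uint64 command_duration_ns(pi_event ev) {
    pi_uint64 start = 0, end = 0;
    piEventGetProfilingInfo(ev, PI_PROFILING_INFO_COMMAND_START,
                            sizeof(start), &start, nullptr);
    piEventGetProfilingInfo(ev, PI_PROFILING_INFO_COMMAND_END,
                            sizeof(end), &end, nullptr);
    return end - start;
  }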
pi_result piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_COPY
Definition: pi.h:548
pi_result piEnqueueMemBufferWriteRect(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
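A sketch of a 2-D rectangular write, assuming the pi_buff_rect_offset_struct / pi_buff_rect_region_struct pointer typedefs from pi.h; zero pitches again mean tightly packed rows:

  pi_result write_rect(pi_queue queue, pi_mem buf, const void *src,
                       size_t width_bytes, size_t height) {
    pi_buff_rect_offset_struct zero{0, 0, 0};
    pi_buff_rect_region_struct region{width_bytes, height, 1};
    return piEnqueueMemBufferWriteRect(queue, buf, PI_TRUE,
                                       /*buffer_offset=*/&zero,
                                       /*host_offset=*/&zero, &region,
                                       0, 0, 0, 0, // packed pitches
                                       src, 0, nullptr, nullptr);
  }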
_pi_buffer_create_type
Definition: pi.h:474
@ PI_BUFFER_CREATE_TYPE_REGION
Definition: pi.h:474
pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, void *mapped_ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
constexpr pi_queue_properties PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:578
pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, size_t param_value_size, const void *param_value)
API to set attributes controlling kernel execution.
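A sketch of flagging indirect USM access; PI_USM_INDIRECT_ACCESS is assumed to be the pi.h enumerator for this (the _pi_kernel_exec_info value list is not reproduced on this page):

  pi_result enable_indirect_usm(pi_kernel kernel) {
    pi_bool flag = PI_TRUE;
    // Declares that the kernel may dereference USM pointers that were
    // not passed as explicit kernel arguments.
    return piKernelSetExecInfo(kernel, PI_USM_INDIRECT_ACCESS,
                               sizeof(flag), &flag);
  }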
static constexpr pi_device_fp_config PI_FP_ROUND_TO_NEAREST
Definition: pi.h:644
pi_result piProgramGetInfo(pi_program program, pi_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
pi_result piProgramBuild(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
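A sketch of a build with empty options and no completion callback; mirroring the OpenCL convention, a null pfn_notify is assumed to make the call block until the build finishes:

  pi_result build_for(pi_program prog, pi_uint32 ndev,
                      const pi_device *devices) {
    return piProgramBuild(prog, ndev, devices, /*options=*/"",
                          /*pfn_notify=*/nullptr, /*user_data=*/nullptr);
  }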
uint32_t pi_uint32
Definition: pi.h:103
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_RELAXED
Definition: pi.h:518
_pi_event_info
Definition: pi.h:372
@ PI_EVENT_INFO_COMMAND_EXECUTION_STATUS
Definition: pi.h:376
@ PI_EVENT_INFO_REFERENCE_COUNT
Definition: pi.h:377
@ PI_EVENT_INFO_COMMAND_TYPE
Definition: pi.h:375
@ PI_EVENT_INFO_CONTEXT
Definition: pi.h:374