DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_cuda.cpp
//==---------- pi_cuda.cpp - CUDA Plugin -----------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <pi_cuda.hpp>

#include <sycl/detail/defines.hpp>
#include <sycl/detail/pi.hpp>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <limits>
#include <memory>
#include <mutex>
#include <regex>

// Standard-library headers used directly in this file; they may also be
// provided transitively through pi_cuda.hpp.
#include <array>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
// Forward declarations
void enableCUDATracing();
void disableCUDATracing();

namespace {
std::string getCudaVersionString() {
  int driver_version = 0;
  cuDriverGetVersion(&driver_version);
  // The version is returned as (1000 * major + 10 * minor).
  std::stringstream stream;
  stream << "CUDA " << driver_version / 1000 << "."
         << driver_version % 1000 / 10;
  return stream.str();
}

pi_result map_error(CUresult result) {
  switch (result) {
  case CUDA_SUCCESS:
    return PI_SUCCESS;
  case CUDA_ERROR_NOT_PERMITTED:
    return PI_ERROR_INVALID_OPERATION;
  case CUDA_ERROR_INVALID_CONTEXT:
    return PI_ERROR_INVALID_CONTEXT;
  case CUDA_ERROR_INVALID_DEVICE:
    return PI_ERROR_INVALID_DEVICE;
  case CUDA_ERROR_INVALID_VALUE:
    return PI_ERROR_INVALID_VALUE;
  case CUDA_ERROR_OUT_OF_MEMORY:
    return PI_ERROR_OUT_OF_HOST_MEMORY;
  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
    return PI_ERROR_OUT_OF_RESOURCES;
  default:
    return PI_ERROR_UNKNOWN;
  }
}
64 
65 // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR
66 constexpr size_t MaxMessageSize = 256;
67 thread_local pi_result ErrorMessageCode = PI_SUCCESS;
68 thread_local char ErrorMessage[MaxMessageSize];
69 
70 // Utility function for setting a message and warning
71 static void setErrorMessage(const char *message, pi_result error_code) {
72  assert(strlen(message) <= MaxMessageSize);
73  strcpy(ErrorMessage, message);
74  ErrorMessageCode = error_code;
75 }
76 
77 // Returns plugin specific error and warning messages
78 pi_result cuda_piPluginGetLastError(char **message) {
79  *message = &ErrorMessage[0];
80  return ErrorMessageCode;
81 }

// Iterates over the event wait list, returns correct pi_result error codes.
// Invokes the callback for the latest event of each queue in the wait list.
// The callback must take a single pi_event argument and return a pi_result.
template <typename Func>
pi_result forLatestEvents(const pi_event *event_wait_list,
                          std::size_t num_events_in_wait_list, Func &&f) {

  if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
    return PI_ERROR_INVALID_EVENT_WAIT_LIST;
  }

  // Fast path if we only have a single event
  if (num_events_in_wait_list == 1) {
    return f(event_wait_list[0]);
  }

  std::vector<pi_event> events{event_wait_list,
                               event_wait_list + num_events_in_wait_list};
  std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
    // Tiered sort creating sublists of streams (smallest value first) in
    // which the corresponding events are sorted into a sequence of newest
    // first.
    return e0->get_stream() < e1->get_stream() ||
           (e0->get_stream() == e1->get_stream() &&
            e0->get_event_id() > e1->get_event_id());
  });

  bool first = true;
  CUstream lastSeenStream = 0;
  for (pi_event event : events) {
    if (!event || (!first && event->get_stream() == lastSeenStream)) {
      continue;
    }

    first = false;
    lastSeenStream = event->get_stream();

    auto result = f(event);
    if (result != PI_SUCCESS) {
      return result;
    }
  }

  return PI_SUCCESS;
}
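
// Illustrative example (not part of the original file): given a wait list
// with events E1 (stream A, id 5), E2 (stream A, id 7) and E3 (stream B,
// id 2), the sort above yields [E2, E1, E3] (streams ascending, newest id
// first within a stream), so the callback f runs only on E2 and E3 -- the
// latest event of each stream.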

// Converts a CUDA error to a PI error code and, unless
// SYCL_PI_SUPPRESS_ERROR_MESSAGE is set, prints diagnostic information to
// stderr. If the PI_CUDA_ABORT environment variable is defined, the process
// aborts instead of throwing, which is useful for debugging. Returns
// PI_SUCCESS for CUDA_SUCCESS and throws the mapped pi_result otherwise.
pi_result check_error(CUresult result, const char *function, int line,
                      const char *file) {
  if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) {
    return PI_SUCCESS;
  }

  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) {
    const char *errorString = nullptr;
    const char *errorName = nullptr;
    cuGetErrorName(result, &errorName);
    cuGetErrorString(result, &errorString);
    std::stringstream ss;
    ss << "\nPI CUDA ERROR:"
       << "\n\tValue:           " << result
       << "\n\tName:            " << errorName
       << "\n\tDescription:     " << errorString
       << "\n\tFunction:        " << function
       << "\n\tSource Location: " << file << ":" << line << "\n"
       << std::endl;
    std::cerr << ss.str();
  }

  if (std::getenv("PI_CUDA_ABORT") != nullptr) {
    std::abort();
  }

  throw map_error(result);
}

#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)

// RAII guard that makes the given CUDA context current on the calling thread
// for the duration of a PI API call, matching the CUDA driver API's
// thread-local-context model. The context is only replaced when it differs
// from the one already active on the thread.
class ScopedContext {
public:
  ScopedContext(pi_context ctxt) {
    if (!ctxt) {
      throw PI_ERROR_INVALID_CONTEXT;
    }

    set_context(ctxt->get());
  }

  ScopedContext(CUcontext ctxt) { set_context(ctxt); }

  ~ScopedContext() {}

private:
  void set_context(CUcontext desired) {
    CUcontext original = nullptr;

    PI_CHECK_ERROR(cuCtxGetCurrent(&original));

    // Make sure the desired context is active on the current thread, setting
    // it if necessary
    if (original != desired) {
      PI_CHECK_ERROR(cuCtxSetCurrent(desired));
    }
  }
};
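
// Typical usage inside a plugin entry point (sketch, not part of the file;
// cuda_piSomething is a hypothetical name):
//
//   pi_result cuda_piSomething(pi_queue queue, ...) {
//     ScopedContext active(queue->get_context()); // make context current
//     ... // CUDA driver API calls now target the queue's CUcontext
//   }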

template <typename T, typename Assign>
pi_result getInfoImpl(size_t param_value_size, void *param_value,
                      size_t *param_value_size_ret, T value, size_t value_size,
                      Assign &&assign_func) {

  if (param_value != nullptr) {

    if (param_value_size < value_size) {
      return PI_ERROR_INVALID_VALUE;
    }

    assign_func(param_value, value, value_size);
  }

  if (param_value_size_ret != nullptr) {
    *param_value_size_ret = value_size;
  }

  return PI_SUCCESS;
}

template <typename T>
pi_result getInfo(size_t param_value_size, void *param_value,
                  size_t *param_value_size_ret, T value) {

  auto assignment = [](void *param_value, T value, size_t value_size) {
    // Ignore unused parameter
    (void)value_size;

    *static_cast<T *>(param_value) = value;
  };

  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, sizeof(T), assignment);
}

template <typename T>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  return getInfoImpl(param_value_size, param_value, param_value_size_ret,
                     value, array_length * sizeof(T), memcpy);
}

template <>
pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret,
                                const char *value) {
  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
                      param_value_size_ret, value);
}
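
// These helpers implement the usual two-call query protocol (illustrative
// sketch, not part of the file): callers first pass param_value == nullptr
// to learn the required size, then call again with a buffer of that size:
//
//   size_t size = 0;
//   cuda_piDeviceGetInfo(dev, PI_DEVICE_INFO_NAME, 0, nullptr, &size);
//   std::string name(size, '\0');
//   cuda_piDeviceGetInfo(dev, PI_DEVICE_INFO_NAME, size, name.data(),
//                        nullptr);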

int getAttribute(pi_device device, CUdevice_attribute attribute) {
  int value;
  sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS);
  return value;
}

// Determine local work sizes that result in uniform work groups.
// The default threadsPerBlock only requires handling the first work_dim
// dimension.
void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock,
                        const size_t *global_work_size,
                        const size_t maxThreadsPerBlock[3], pi_kernel kernel,
                        pi_uint32 local_size) {
  assert(threadsPerBlock != nullptr);
  assert(global_work_size != nullptr);
  assert(kernel != nullptr);
  int minGrid, maxBlockSize, gridDim[3];

  cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
                       device->get());
  cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
                       device->get());

  threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1;
  threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1;

  PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
      &minGrid, &maxBlockSize, kernel->get(), NULL, local_size,
      maxThreadsPerBlock[0]));

  gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);

  threadsPerBlock[0] =
      std::min(maxThreadsPerBlock[0],
               std::min(global_work_size[0], static_cast<size_t>(gridDim[0])));

  // Find a local work group size that is a divisor of the global
  // work group size to produce uniform work groups.
  while (0u != (global_work_size[0] % threadsPerBlock[0])) {
    --threadsPerBlock[0];
  }
}
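
// Worked example (illustrative): with global_work_size[0] == 1000 and a
// starting threadsPerBlock[0] of 256, the loop above decrements down to 250,
// the largest divisor of 1000 not exceeding 256, giving 4 uniform work
// groups of 250 work-items each.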

pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream,
                            pi_uint32 num_events_in_wait_list,
                            const pi_event *event_wait_list) {
  if (!event_wait_list) {
    return PI_SUCCESS;
  }
  try {
    ScopedContext active(command_queue->get_context());

    auto result = forLatestEvents(
        event_wait_list, num_events_in_wait_list,
        [stream](pi_event event) -> pi_result {
          if (event->get_stream() == stream) {
            return PI_SUCCESS;
          } else {
            return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0));
          }
        });

    if (result != PI_SUCCESS) {
      return result;
    }
    return PI_SUCCESS;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_UNKNOWN;
  }
}

template <typename PtrT>
void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type,
                           CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) {
  // Do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE;
  // checking with PI_CHECK_ERROR is not appropriate here.
  CUresult ret = cuPointerGetAttribute(
      out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr);
  assert((*out_mem_type != CU_MEMORYTYPE_ARRAY &&
          *out_mem_type != CU_MEMORYTYPE_UNIFIED) &&
         "ARRAY, UNIFIED types are not supported!");

  // Pointer not known to the CUDA subsystem (possibly a system allocated ptr)
  if (ret == CUDA_ERROR_INVALID_VALUE) {
    *out_mem_type = CU_MEMORYTYPE_HOST;
    *out_dev_ptr = 0;
    *out_host_ptr = usm_ptr;

    // TODO: reset the above "non-sticky" CUDA error
  } else if (ret == CUDA_SUCCESS) {
    *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE)
                       ? reinterpret_cast<CUdeviceptr>(usm_ptr)
                       : 0;
    *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr : nullptr;
  } else {
    PI_CHECK_ERROR(ret);
  }
}
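
// Classification summary (illustrative): a device allocation yields
// CU_MEMORYTYPE_DEVICE with out_dev_ptr set; a page-locked host allocation
// yields CU_MEMORYTYPE_HOST with out_host_ptr set; a plain malloc pointer is
// unknown to the driver (CUDA_ERROR_INVALID_VALUE) and is reported as host
// memory.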

} // anonymous namespace

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {
namespace pi {

// Report error and no return (keeps compiler from printing warnings).
// TODO: Probably change that to throw a catchable exception,
// but for now it is useful to see every failure.
//
[[noreturn]] void die(const char *Message) {
  std::cerr << "pi_die: " << Message << std::endl;
  std::terminate();
}

// Reports error messages
void cuPrint(const char *Message) {
  std::cerr << "pi_print: " << Message << std::endl;
}

void assertion(bool Condition, const char *Message) {
  if (!Condition)
    die(Message);
}

} // namespace pi
} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl

//--------------
// PI object implementation

extern "C" {

// Required in a number of functions, so forward declare here
pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
                                   pi_uint32 num_events_in_wait_list,
                                   const pi_event *event_wait_list,
                                   pi_event *event);
pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
                                              pi_uint32 num_events_in_wait_list,
                                              const pi_event *event_wait_list,
                                              pi_event *event);
pi_result cuda_piEventRelease(pi_event event);
pi_result cuda_piEventRetain(pi_event event);

} // extern "C"

void _pi_queue::compute_stream_wait_for_barrier_if_needed(CUstream stream,
                                                          pi_uint32 stream_i) {
  if (barrier_event_ && !compute_applied_barrier_[stream_i]) {
    PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0));
    compute_applied_barrier_[stream_i] = true;
  }
}

void _pi_queue::transfer_stream_wait_for_barrier_if_needed(
    CUstream stream, pi_uint32 stream_i) {
  if (barrier_event_ && !transfer_applied_barrier_[stream_i]) {
    PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0));
    transfer_applied_barrier_[stream_i] = true;
  }
}

CUstream _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) {
  pi_uint32 stream_i;
  pi_uint32 token;
  while (true) {
    if (num_compute_streams_ < compute_streams_.size()) {
      // the check above is for performance - so as not to lock the mutex
      // every time
      std::lock_guard<std::mutex> guard(compute_stream_mutex_);
      // The second check is done after the mutex is locked so other threads
      // cannot change num_compute_streams_ after that
      if (num_compute_streams_ < compute_streams_.size()) {
        PI_CHECK_ERROR(
            cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_));
      }
    }
    token = compute_stream_idx_++;
    stream_i = token % compute_streams_.size();
    // if a stream has been reused before it was next selected in round-robin
    // fashion, we want to delay its next use and instead select another one
    // that is more likely to have completed all the enqueued work.
    if (delay_compute_[stream_i]) {
      delay_compute_[stream_i] = false;
    } else {
      break;
    }
  }
  if (stream_token) {
    *stream_token = token;
  }
  CUstream res = compute_streams_[stream_i];
  compute_stream_wait_for_barrier_if_needed(res, stream_i);
  return res;
}
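
// Example of the round-robin policy (illustrative): with 4 compute streams
// and compute_stream_idx_ at 6, tokens 6, 7, 8 map to streams 2, 3, 0. If
// delay_compute_[2] was set because stream 2 was just reused through a wait
// list, token 6 is skipped and the next candidate, stream 3, is returned.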

CUstream _pi_queue::get_next_compute_stream(pi_uint32 num_events_in_wait_list,
                                            const pi_event *event_wait_list,
                                            _pi_stream_guard &guard,
                                            pi_uint32 *stream_token) {
  for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
    pi_uint32 token = event_wait_list[i]->get_compute_stream_token();
    if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) {
      std::unique_lock<std::mutex> compute_sync_guard(
          compute_stream_sync_mutex_);
      // redo the check after lock to avoid data races on
      // last_sync_compute_streams_
      if (can_reuse_stream(token)) {
        pi_uint32 stream_i = token % delay_compute_.size();
        delay_compute_[stream_i] = true;
        if (stream_token) {
          *stream_token = token;
        }
        guard = _pi_stream_guard{std::move(compute_sync_guard)};
        CUstream res = event_wait_list[i]->get_stream();
        compute_stream_wait_for_barrier_if_needed(res, stream_i);
        return res;
      }
    }
  }
  guard = {};
  return get_next_compute_stream(stream_token);
}

CUstream _pi_queue::get_next_transfer_stream() {
  if (transfer_streams_.empty()) { // for example in in-order queue
    return get_next_compute_stream();
  }
  if (num_transfer_streams_ < transfer_streams_.size()) {
    // the check above is for performance - so as not to lock the mutex
    // every time
    std::lock_guard<std::mutex> guard(transfer_stream_mutex_);
    // The second check is done after the mutex is locked so other threads
    // cannot change num_transfer_streams_ after that
    if (num_transfer_streams_ < transfer_streams_.size()) {
      PI_CHECK_ERROR(
          cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_));
    }
  }
  pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size();
  CUstream res = transfer_streams_[stream_i];
  transfer_stream_wait_for_barrier_if_needed(res, stream_i);
  return res;
}

_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue,
                     CUstream stream, pi_uint32 stream_token)
    : commandType_{type}, refCount_{1}, has_ownership_{true},
      hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false},
      streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr},
      evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} {

  bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE;

  PI_CHECK_ERROR(cuEventCreate(
      &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));

  if (profilingEnabled) {
    PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
    PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
  }

  if (queue_ != nullptr) {
    cuda_piQueueRetain(queue_);
  }
  cuda_piContextRetain(context_);
}

_pi_event::_pi_event(pi_context context, CUevent eventNative)
    : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false},
      hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false},
      streamToken_{std::numeric_limits<pi_uint32>::max()}, evEnd_{eventNative},
      evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr},
      context_{context} {
  cuda_piContextRetain(context_);
}

_pi_event::~_pi_event() {
  if (queue_ != nullptr) {
    cuda_piQueueRelease(queue_);
  }
  cuda_piContextRelease(context_);
}

pi_result _pi_event::start() {
  assert(!is_started());
  pi_result result = PI_SUCCESS;

  try {
    if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) {
      // NOTE: This relies on the default stream to be unused.
      result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
      result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_));
    }
  } catch (pi_result error) {
    result = error;
  }

  isStarted_ = true;
  return result;
}

bool _pi_event::is_completed() const noexcept {
  if (!isRecorded_) {
    return false;
  }
  if (!hasBeenWaitedOn_) {
    const CUresult ret = cuEventQuery(evEnd_);
    if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) {
      PI_CHECK_ERROR(ret);
      return false;
    }
    if (ret == CUDA_ERROR_NOT_READY) {
      return false;
    }
  }
  return true;
}

pi_uint64 _pi_device::get_elapsed_time(CUevent ev) const {
  float milliSeconds = 0.0f;

  PI_CHECK_ERROR(cuEventElapsedTime(&milliSeconds, evBase_, ev));

  // Convert from milliseconds to nanoseconds.
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}

pi_uint64 _pi_event::get_queued_time() const {
  assert(is_started());
  return queue_->get_device()->get_elapsed_time(evQueued_);
}

pi_uint64 _pi_event::get_start_time() const {
  assert(is_started());
  return queue_->get_device()->get_elapsed_time(evStart_);
}

pi_uint64 _pi_event::get_end_time() const {
  assert(is_started() && is_recorded());
  return queue_->get_device()->get_elapsed_time(evEnd_);
}

pi_result _pi_event::record() {

  if (is_recorded() || !is_started()) {
    return PI_ERROR_INVALID_EVENT;
  }

  pi_result result = PI_ERROR_INVALID_OPERATION;

  if (!queue_) {
    return PI_ERROR_INVALID_QUEUE;
  }

  try {
    eventId_ = queue_->get_next_event_id();
    if (eventId_ == 0) {
      sycl::detail::pi::die(
          "Unrecoverable program state reached in event identifier overflow");
    }
    result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_));
  } catch (pi_result error) {
    result = error;
  }

  if (result == PI_SUCCESS) {
    isRecorded_ = true;
  }

  return result;
}

pi_result _pi_event::wait() {
  pi_result retErr;
  try {
    retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
    hasBeenWaitedOn_ = true;
  } catch (pi_result error) {
    retErr = error;
  }

  return retErr;
}

pi_result _pi_event::release() {
  if (!backend_has_ownership())
    return PI_SUCCESS;

  assert(queue_ != nullptr);

  PI_CHECK_ERROR(cuEventDestroy(evEnd_));

  if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) {
    PI_CHECK_ERROR(cuEventDestroy(evQueued_));
    PI_CHECK_ERROR(cuEventDestroy(evStart_));
  }

  return PI_SUCCESS;
}

// Makes all future work submitted to queue wait for all work captured in
// event.
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
  // for native events, the cuStreamWaitEvent call is used.
  // This makes all future work submitted to stream wait for all
  // work captured in event.
  queue->for_each_stream([e = event->get()](CUstream s) {
    PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0));
  });
  return PI_SUCCESS;
}

_pi_program::_pi_program(pi_context ctxt)
    : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1},
      context_{ctxt}, kernelReqdWorkGroupSizeMD_{} {
  cuda_piContextRetain(context_);
}

_pi_program::~_pi_program() { cuda_piContextRelease(context_); }

std::pair<std::string, std::string>
splitMetadataName(const std::string &metadataName) {
  size_t splitPos = metadataName.rfind('@');
  if (splitPos == std::string::npos)
    return std::make_pair(metadataName, std::string{});
  return std::make_pair(metadataName.substr(0, splitPos),
                        metadataName.substr(splitPos, metadataName.length()));
}
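
// Example (illustrative): "foo@reqd_work_group_size" splits into the prefix
// "foo" and the tag "@reqd_work_group_size"; note the tag keeps the leading
// '@'. A name containing no '@' yields an empty tag.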

pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata,
                                    size_t length) {
  for (size_t i = 0; i < length; ++i) {
    const pi_device_binary_property metadataElement = metadata[i];
    std::string metadataElementName{metadataElement->Name};

    auto [prefix, tag] = splitMetadataName(metadataElementName);

    if (tag == __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
      // If metadata is reqd_work_group_size, record it for the corresponding
      // kernel name.
      size_t MDElemsSize = metadataElement->ValSize - sizeof(std::uint64_t);

      // Expect between 1 and 3 32-bit integer values.
      assert(MDElemsSize >= sizeof(std::uint32_t) &&
             MDElemsSize <= sizeof(std::uint32_t) * 3 &&
             "Unexpected size for reqd_work_group_size metadata");

      // Get pointer to data, skipping 64-bit size at the start of the data.
      const char *ValuePtr =
          reinterpret_cast<const char *>(metadataElement->ValAddr) +
          sizeof(std::uint64_t);
      // Read values and pad with 1's for values not present.
      std::uint32_t reqdWorkGroupElements[] = {1, 1, 1};
      std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize);
      kernelReqdWorkGroupSizeMD_[prefix] =
          std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1],
                          reqdWorkGroupElements[2]);
    } else if (tag == __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING) {
      const char *metadataValPtr =
          reinterpret_cast<const char *>(metadataElement->ValAddr) +
          sizeof(std::uint64_t);
      const char *metadataValPtrEnd =
          metadataValPtr + metadataElement->ValSize - sizeof(std::uint64_t);
      globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd};
    }
  }
  return PI_SUCCESS;
}

pi_result _pi_program::set_binary(const char *source, size_t length) {
  assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
         "Re-setting program binary data which has already been set");
  binary_ = source;
  binarySizeInBytes_ = length;
  return PI_SUCCESS;
}

pi_result _pi_program::build_program(const char *build_options) {

  this->buildOptions_ = build_options;

  constexpr const unsigned int numberOfOptions = 4u;

  CUjit_option options[numberOfOptions];
  void *optionVals[numberOfOptions];

  // Pass a buffer for info messages
  options[0] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[0] = (void *)infoLog_;
  // Pass the size of the info buffer
  options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[1] = (void *)(long)MAX_LOG_SIZE;
  // Pass a buffer for error message
  options[2] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[2] = (void *)errorLog_;
  // Pass the size of the error buffer
  options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[3] = (void *)(long)MAX_LOG_SIZE;

  auto result = PI_CHECK_ERROR(
      cuModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
                         numberOfOptions, options, optionVals));

  const auto success = (result == PI_SUCCESS);

  buildStatus_ =
      success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR;

  // If no exception, result is correct
  return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
}
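
// Note (illustrative): because the JIT log buffers are registered above, a
// failed cuModuleLoadDataEx leaves a human-readable diagnostic in errorLog_
// (up to MAX_LOG_SIZE bytes), which build-info queries can then surface to
// the user.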

std::string getKernelNames(pi_program) {
  sycl::detail::pi::die("getKernelNames not implemented");
  return {};
}

//-- PI API implementation
extern "C" {

pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                               size_t param_value_size, void *param_value,
                               size_t *param_value_size_ret);

// Obtains the CUDA platforms. One PI platform is reported per CUDA device.
// Triggers the CUDA driver initialization (cuInit) the first time, so this
// must be the first PI API called.
pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
                              pi_uint32 *num_platforms) {

  try {
    static std::once_flag initFlag;
    static pi_uint32 numPlatforms = 1;
    static std::vector<_pi_platform> platformIds;

    if (num_entries == 0 && platforms != nullptr) {
      return PI_ERROR_INVALID_VALUE;
    }
    if (platforms == nullptr && num_platforms == nullptr) {
      return PI_ERROR_INVALID_VALUE;
    }

    pi_result err = PI_SUCCESS;

    std::call_once(
        initFlag,
        [](pi_result &err) {
          if (cuInit(0) != CUDA_SUCCESS) {
            numPlatforms = 0;
            return;
          }
          int numDevices = 0;
          err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
          if (numDevices == 0) {
            numPlatforms = 0;
            return;
          }
          try {
            // make one platform per device
            numPlatforms = numDevices;
            platformIds.resize(numDevices);

            for (int i = 0; i < numDevices; ++i) {
              CUdevice device;
              err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
              CUcontext context;
              err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device));

              ScopedContext active(context);
              CUevent evBase;
              err = PI_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT));

              // Use default stream to record base event counter
              err = PI_CHECK_ERROR(cuEventRecord(evBase, 0));

              platformIds[i].devices_.emplace_back(
                  new _pi_device{device, context, evBase, &platformIds[i]});

              {
                const auto &dev = platformIds[i].devices_.back().get();
                size_t maxWorkGroupSize = 0u;
                size_t maxThreadsPerBlock[3] = {};
                pi_result retError = cuda_piDeviceGetInfo(
                    dev, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
                    sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
                assert(retError == PI_SUCCESS);
                (void)retError;

                retError = cuda_piDeviceGetInfo(
                    dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
                    sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
                assert(retError == PI_SUCCESS);

                dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock),
                                              maxThreadsPerBlock);
                dev->save_max_work_group_size(maxWorkGroupSize);
              }
            }
          } catch (const std::bad_alloc &) {
            // Signal out-of-memory situation
            for (int i = 0; i < numDevices; ++i) {
              platformIds[i].devices_.clear();
            }
            platformIds.clear();
            err = PI_ERROR_OUT_OF_HOST_MEMORY;
          } catch (...) {
            // Clear and rethrow to allow retry
            for (int i = 0; i < numDevices; ++i) {
              platformIds[i].devices_.clear();
            }
            platformIds.clear();
            throw;
          }
        },
        err);

    if (num_platforms != nullptr) {
      *num_platforms = numPlatforms;
    }

    if (platforms != nullptr) {
      for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
        platforms[i] = &platformIds[i];
      }
    }

    return err;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_OUT_OF_RESOURCES;
  }
}

pi_result cuda_piPlatformGetInfo(pi_platform platform,
                                 pi_platform_info param_name,
                                 size_t param_value_size, void *param_value,
                                 size_t *param_value_size_ret) {
  assert(platform != nullptr);

  switch (param_name) {
  case PI_PLATFORM_INFO_NAME:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA CUDA BACKEND");
  case PI_PLATFORM_INFO_VENDOR:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA Corporation");
  case PI_PLATFORM_INFO_PROFILE:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "FULL PROFILE");
  case PI_PLATFORM_INFO_VERSION: {
    auto version = getCudaVersionString();
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   version.c_str());
  }
  case PI_PLATFORM_INFO_EXTENSIONS: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }
  sycl::detail::pi::die("Platform info request not implemented");
  return {};
}

pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type,
                            pi_uint32 num_entries, pi_device *devices,
                            pi_uint32 *num_devices) {

  pi_result err = PI_SUCCESS;
  const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT;
  const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU;
  const bool returnDevices = askingForDefault || askingForGPU;

  size_t numDevices = returnDevices ? platform->devices_.size() : 0;

  try {
    if (num_devices) {
      *num_devices = numDevices;
    }

    if (returnDevices && devices) {
      for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) {
        devices[i] = platform->devices_[i].get();
      }
    }

    return err;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_OUT_OF_RESOURCES;
  }
}

// CUDA devices are always root devices, so retain always succeeds.
pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; }

pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name,
                                size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret) {

  switch (param_name) {
  case PI_CONTEXT_INFO_NUM_DEVICES:
    return getInfo(param_value_size, param_value, param_value_size_ret, 1);
  case PI_CONTEXT_INFO_DEVICES:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   context->get_device());
  case PI_CONTEXT_INFO_REFERENCE_COUNT:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   context->get_reference_count());
  case PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES:
  case PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
  case PI_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES:
  case PI_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: {
    // These queries should be dealt with in context_impl.cpp by calling the
    // queries of each device separately and building the intersection set.
    setErrorMessage("These queries should have never come here.",
                    PI_ERROR_INVALID_ARG_VALUE);
    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
  }
  case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT:
    return getInfo<pi_bool>(param_value_size, param_value,
                            param_value_size_ret, true);
  case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT:
  case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT:
    // 2D USM operations currently not supported.
    return getInfo<pi_bool>(param_value_size, param_value,
                            param_value_size_ret, false);
  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }

  return PI_ERROR_OUT_OF_RESOURCES;
}

pi_result cuda_piContextRetain(pi_context context) {
  assert(context != nullptr);
  assert(context->get_reference_count() > 0);

  context->increment_reference_count();
  return PI_SUCCESS;
}

pi_result cuda_piextContextSetExtendedDeleter(
    pi_context context, pi_context_extended_deleter function,
    void *user_data) {
  context->set_extended_deleter(function, user_data);
  return PI_SUCCESS;
}

// Not applicable to CUDA, devices cannot be partitioned.
pi_result cuda_piDevicePartition(pi_device,
                                 const pi_device_partition_property *,
                                 pi_uint32, pi_device *, pi_uint32 *) {
  return {};
}

pi_result cuda_piextDeviceSelectBinary(pi_device device,
                                       pi_device_binary *binaries,
                                       pi_uint32 num_binaries,
                                       pi_uint32 *selected_binary) {
  // Ignore unused parameter
  (void)device;

  if (!binaries) {
    sycl::detail::pi::die("No list of device images provided");
  }
  if (num_binaries < 1) {
    sycl::detail::pi::die("No binary images in the list");
  }

  // Look for an image for the NVPTX64 target, and return the first one that is
  // found
  for (pi_uint32 i = 0; i < num_binaries; i++) {
    if (strcmp(binaries[i]->DeviceTargetSpec,
               __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) {
      *selected_binary = i;
      return PI_SUCCESS;
    }
  }

  // No image can be loaded for the given device
  return PI_ERROR_INVALID_BINARY;
}

pi_result cuda_piextGetDeviceFunctionPointer(pi_device device,
                                             pi_program program,
                                             const char *func_name,
                                             pi_uint64 *func_pointer_ret) {
  // Check that the device passed is the same as the device bound to the
  // context
  assert(device == program->get_context()->get_device());
  assert(func_pointer_ret != nullptr);

  CUfunction func;
  CUresult ret = cuModuleGetFunction(&func, program->get(), func_name);
  *func_pointer_ret = reinterpret_cast<pi_uint64>(func);
  pi_result retError = PI_SUCCESS;

  if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND)
    retError = PI_CHECK_ERROR(ret);
  if (ret == CUDA_ERROR_NOT_FOUND) {
    *func_pointer_ret = 0;
    retError = PI_ERROR_INVALID_KERNEL_NAME;
  }

  return retError;
}

pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                               size_t param_value_size, void *param_value,
                               size_t *param_value_size_ret) {

  static constexpr pi_uint32 max_work_item_dimensions = 3u;

  assert(device != nullptr);

  ScopedContext active(device->get_context());

  switch (param_name) {
  case PI_DEVICE_INFO_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_TYPE_GPU);
  }
  case PI_DEVICE_INFO_VENDOR_ID: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 4318u);
  }
  case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: {
    int compute_units = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&compute_units,
                             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(compute_units >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint32(compute_units));
  }
  case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   max_work_item_dimensions);
  }
  case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: {
    size_t return_sizes[max_work_item_dimensions];

    int max_x = 0, max_y = 0, max_z = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_x >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_y >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_z >= 0);

    return_sizes[0] = size_t(max_x);
    return_sizes[1] = size_t(max_y);
    return_sizes[2] = size_t(max_z);
    return getInfoArray(max_work_item_dimensions, param_value_size,
                        param_value, param_value_size_ret, return_sizes);
  }

  case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: {
    size_t return_sizes[max_work_item_dimensions];
    int max_x = 0, max_y = 0, max_z = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_x >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_y >= 0);

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(max_z >= 0);

    return_sizes[0] = size_t(max_x);
    return_sizes[1] = size_t(max_y);
    return_sizes[2] = size_t(max_z);
    return getInfoArray(max_work_item_dimensions, param_value_size,
                        param_value, param_value_size_ret, return_sizes);
  }

  case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: {
    int max_work_group_size = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_work_group_size,
                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);

    sycl::detail::pi::assertion(max_work_group_size >= 0);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t(max_work_group_size));
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
    // Number of sub-groups = max block size / warp size + possible remainder
    int max_threads = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_threads,
                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);
    int warpSize = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                             device->get()) == CUDA_SUCCESS);
    int maxWarps = (max_threads + warpSize - 1) / warpSize;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<uint32_t>(maxWarps));
  }
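  // Worked example (illustrative): with a 1024-thread block limit and a warp
  // size of 32, maxWarps = (1024 + 31) / 32 = 32 sub-groups per work-group.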
  case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
    // Volta provides independent thread scheduling
    // TODO: Revisit for previous generation GPUs
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    bool ifp = (major >= 7);
    return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
  }

  case PI_DEVICE_INFO_ATOMIC_64: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);

    bool atomic64 = (major >= 6) ? true : false;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   atomic64);
  }
  case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
    pi_memory_order_capabilities capabilities =
        PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
        PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    pi_memory_order_capabilities capabilities =
        (major >= 7) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
                           PI_MEMORY_SCOPE_WORK_GROUP |
                           PI_MEMORY_SCOPE_DEVICE | PI_MEMORY_SCOPE_SYSTEM
                     : PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
                           PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES:
  case PI_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: {
    // There is no way to query this in the backend
    setErrorMessage("CUDA backend does not support this query",
                    PI_ERROR_INVALID_ARG_VALUE);
    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
    int major = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);

    bool bfloat16 = (major >= 8) ? true : false;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   bfloat16);
  }
  case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
    // NVIDIA devices only support one sub-group size (the warp size)
    int warpSize = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                             device->get()) == CUDA_SUCCESS);
    size_t sizes[1] = {static_cast<size_t>(warpSize)};
    return getInfoArray<size_t>(1, param_value_size, param_value,
                                param_value_size_ret, sizes);
  }
  case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
    int clock_freq = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(clock_freq >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint32(clock_freq) / 1000u);
  }
  case PI_DEVICE_INFO_ADDRESS_BITS: {
    auto bits = pi_uint32{std::numeric_limits<uintptr_t>::digits};
    return getInfo(param_value_size, param_value, param_value_size_ret, bits);
  }
  case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: {
    // Max size of memory object allocation in bytes.
    // The minimum value is max(min(1024 * 1024 * 1024,
    // 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), 32 * 1024 * 1024)
    // for devices that are not of type CL_DEVICE_TYPE_CUSTOM.

    size_t global = 0;
    sycl::detail::pi::assertion(cuDeviceTotalMem(&global, device->get()) ==
                                CUDA_SUCCESS);

    auto quarter_global = static_cast<pi_uint32>(global / 4u);

    auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
                              32u * 1024u * 1024u);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64{max_alloc});
  }
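  // Numeric example (illustrative): on a GPU with 12 GiB of global memory,
  // quarter_global is 3 GiB, so max_alloc = max(min(1 GiB, 3 GiB), 32 MiB)
  // = 1 GiB.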
  case PI_DEVICE_INFO_IMAGE_SUPPORT: {
    pi_bool enabled = PI_FALSE;

    if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) {
      enabled = PI_TRUE;
    } else {
      sycl::detail::pi::cuPrint(
          "Images are not fully supported by the CUDA BE, their support is "
          "disabled by default. Their partial support can be activated by "
          "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at "
          "runtime.");
    }

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   enabled);
  }
  case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
    // This call doesn't match to CUDA as it doesn't have images, but instead
    // surfaces and textures. No clear call in the CUDA API to determine this,
    // but some searching found as of SM 2.x 128 are supported.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
    // This call doesn't match to CUDA as it doesn't have images, but instead
    // surfaces and textures. No clear call in the CUDA API to determine this,
    // but some searching found as of SM 2.x 128 are supported.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: {
    // Take the smaller of maximum surface and maximum texture height.
    int tex_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_height >= 0);
    int surf_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_height >= 0);

    int min = std::min(tex_height, surf_height);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: {
    // Take the smaller of maximum surface and maximum texture height.
    int tex_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_height >= 0);
    int surf_height = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_height >= 0);

    int min = std::min(tex_height, surf_height);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: {
    // Take the smaller of maximum surface and maximum texture depth.
    int tex_depth = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_depth,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_depth >= 0);
    int surf_depth = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_depth,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_depth >= 0);

    int min = std::min(tex_depth, surf_depth);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t(0));
  }
  case PI_DEVICE_INFO_MAX_SAMPLERS: {
    // This call is kind of meaningless for cuda, as samplers don't exist.
    // Closest thing is textures, which is 128.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: {
    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
    // __global__ function parameters are passed to the device via constant
    // memory and are limited to 4 KB.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{4000u});
  }
  case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: {
    int mem_base_addr_align = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&mem_base_addr_align,
                             CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
                             device->get()) == CUDA_SUCCESS);
    // Multiply by 8 as clGetDeviceInfo returns this value in bits
    mem_base_addr_align *= 8;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   mem_base_addr_align);
  }
  case PI_DEVICE_INFO_HALF_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_SINGLE_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST |
                  PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA |
                  PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
    return getInfo(param_value_size, param_value, param_value_size_ret, config);
  }
  case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST |
                  PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA;
    return getInfo(param_value_size, param_value, param_value_size_ret, config);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: {
    // The value is documented for all existing GPUs in the CUDA programming
    // guidelines, section "H.3.2. Global Memory".
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: {
    int cache_size = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(cache_size >= 0);
    // The L2 cache is global to the GPU.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(cache_size));
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: {
    size_t bytes = 0;
    // Runtime API has easy access to this value, driver API info is scarce.
    sycl::detail::pi::assertion(cuDeviceTotalMem(&bytes, device->get()) ==
                                CUDA_SUCCESS);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64{bytes});
  }
  case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: {
    int constant_memory = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&constant_memory,
                             CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(constant_memory >= 0);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(constant_memory));
  }
  case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: {
    // TODO: is there a way to retrieve this from CUDA driver API?
    // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX
    // 1060 3GB
    return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
  }
  case PI_DEVICE_INFO_LOCAL_MEM_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_LOCAL_MEM_TYPE_LOCAL);
  }
  case PI_DEVICE_INFO_LOCAL_MEM_SIZE: {
    // OpenCL's "local memory" maps most closely to CUDA's "shared memory".
    // CUDA has its own definition of "local memory", which maps to OpenCL's
    // "private memory".
    int local_mem_size = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&local_mem_size,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(local_mem_size >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(local_mem_size));
  }
  case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: {
    int ecc_enabled = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
                             device->get()) == CUDA_SUCCESS);

    sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1));
    auto result = static_cast<pi_bool>(ecc_enabled);
    return getInfo(param_value_size, param_value, param_value_size_ret, result);
  }
  case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: {
    int is_integrated = 0;
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED,
                             device->get()) == CUDA_SUCCESS);

    sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1));
    auto result = static_cast<pi_bool>(is_integrated);
    return getInfo(param_value_size, param_value, param_value_size_ret, result);
  }
  case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: {
    // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX
    // 1060 3GB
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{1000u});
  }
  case PI_DEVICE_INFO_ENDIAN_LITTLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_LINKER_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: {
    auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
    // The mandated minimum capability:
    auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE |
                      PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: {
    // The mandated minimum capability:
    auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_BUILT_IN_KERNELS: {
    // An empty string is returned if no built-in kernels are supported by the
    // device.
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  case PI_DEVICE_INFO_PLATFORM: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   device->get_platform());
  }
  case PI_DEVICE_INFO_NAME: {
    static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u;
    char name[MAX_DEVICE_NAME_LENGTH];
    sycl::detail::pi::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH,
                                                device->get()) == CUDA_SUCCESS);
    return getInfoArray(strlen(name) + 1, param_value_size, param_value,
                        param_value_size_ret, name);
  }
  case PI_DEVICE_INFO_VENDOR: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA Corporation");
  }
  case PI_DEVICE_INFO_DRIVER_VERSION: {
    auto version = getCudaVersionString();
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   version.c_str());
  }
  case PI_DEVICE_INFO_PROFILE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "CUDA");
  }
  case PI_DEVICE_INFO_REFERENCE_COUNT: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   device->get_reference_count());
  }
  case PI_DEVICE_INFO_VERSION: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "PI 0.0");
  }
  case PI_DEVICE_INFO_OPENCL_C_VERSION: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  case PI_DEVICE_INFO_EXTENSIONS: {

    std::string SupportedExtensions = "cl_khr_fp64 ";
    SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT;
    SupportedExtensions += " ";

    int major = 0;
    int minor = 0;

    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&minor,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                             device->get()) == CUDA_SUCCESS);

    if ((major >= 6) || ((major == 5) && (minor >= 3))) {
      SupportedExtensions += "cl_khr_fp16 ";
    }

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   SupportedExtensions.c_str());
  }
  case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
    // The minimum value for the FULL profile is 1 MB.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{1024u});
  }
  case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_PARENT_DEVICE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   nullptr);
  }
  case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<pi_device_partition_property>(0u));
  }
  case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_PARTITION_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<pi_device_partition_property>(0u));
  }

  // Intel USM extensions

  case PI_DEVICE_INFO_USM_HOST_SUPPORT: {
    // from cl_intel_unified_shared_memory: "The host memory access
    // capabilities apply to any host allocation."
    //
    // query if/how the device can access page-locked host memory, possibly
    // through PCIe, using the same pointer as the host
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
      // the device shares a unified address space with the host
      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
          6) {
        // compute capability 6.x introduces operations that are atomic with
        // respect to other CPUs and GPUs in the system
        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      } else {
        // on GPU architectures with compute capability lower than 6.x, atomic
        // operations from the GPU to CPU memory will not be atomic with
        // respect to CPU initiated atomic operations
        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The device memory access capabilities apply to any device allocation
    // associated with this device."
    //
    // query how the device can access memory allocated on the device itself
    pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                        PI_USM_CONCURRENT_ACCESS |
                        PI_USM_CONCURRENT_ATOMIC_ACCESS;
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The single device shared memory access capabilities apply to any shared
    // allocation associated with this device."
    //
    // query if/how the device can access managed memory associated to it
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
      // the device can allocate managed memory on this system
      value |= PI_USM_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
      // the device can coherently access managed memory concurrently with the
      // CPU
      value |= PI_USM_CONCURRENT_ACCESS;
      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
          6) {
        // compute capability 6.x introduces operations that are atomic with
        // respect to other CPUs and GPUs in the system
        value |= PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The cross-device shared memory access capabilities apply to any shared
    // allocation associated with this device, or to any shared memory
    // allocation on another device that also supports the same cross-device
    // shared memory access capability."
    //
    // query if/how the device can access managed memory associated to other
    // devices
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
      // the device can allocate managed memory on this system
      value |= PI_USM_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
      // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
      // attribute can coherently access managed memory concurrently with the
      // CPU
      value |= PI_USM_CONCURRENT_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
        6) {
      // compute capability 6.x introduces operations that are atomic with
      // respect to other CPUs and GPUs in the system
      if (value & PI_USM_ACCESS)
        value |= PI_USM_ATOMIC_ACCESS;
      if (value & PI_USM_CONCURRENT_ACCESS)
        value |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The shared system memory access capabilities apply to any allocations
    // made by a system allocator, such as malloc or new."
    //
    // query if/how the device can access pageable host memory allocated by the
    // system allocator
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
      // the device supports coherently accessing pageable memory without
      // calling cuMemHostRegister/cudaHostRegister on it
      if (getAttribute(device,
                       CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
        // the link between the device and the host supports native atomic
        // operations
        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      } else {
        // the link between the device and the host does not support native
        // atomic operations
        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
1888  int value =
1889  getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
1890  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1891  }
1892  case PI_DEVICE_INFO_BACKEND_VERSION: {
1893  int major =
1894  getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
1895  int minor =
1896  getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
1897  std::string result = std::to_string(major) + "." + std::to_string(minor);
1898  return getInfo(param_value_size, param_value, param_value_size_ret,
1899  result.c_str());
1900  }
1901 
1902  case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: {
1903  size_t FreeMemory = 0;
1904  size_t TotalMemory = 0;
1905  sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
1906  CUDA_SUCCESS,
1907  "failed cuMemGetInfo() API.");
1908  return getInfo(param_value_size, param_value, param_value_size_ret,
1909  FreeMemory);
1910  }
1911  case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: {
1912  int value = 0;
1913  sycl::detail::pi::assertion(
1914  cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
1915  device->get()) == CUDA_SUCCESS);
1916  sycl::detail::pi::assertion(value >= 0);
1917  // Convert kilohertz to megahertz when returning.
1918  return getInfo(param_value_size, param_value, param_value_size_ret,
1919  value / 1000);
1920  }
1921  case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: {
1922  int value = 0;
1923  sycl::detail::pi::assertion(
1924  cuDeviceGetAttribute(&value,
1925  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
1926  device->get()) == CUDA_SUCCESS);
1927  sycl::detail::pi::assertion(value >= 0);
1928  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1929  }
1930  case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: {
1931  return getInfo(param_value_size, param_value, param_value_size_ret,
1932  pi_int32{1});
1933  }
1934 
1935  case PI_DEVICE_INFO_DEVICE_ID: {
1936  int value = 0;
1937  sycl::detail::pi::assertion(
1938  cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
1939  device->get()) == CUDA_SUCCESS);
1940  sycl::detail::pi::assertion(value >= 0);
1941  return getInfo(param_value_size, param_value, param_value_size_ret, value);
1942  }
1943 
1944  case PI_DEVICE_INFO_UUID: {
1945  CUuuid uuid;
1946 #if (CUDA_VERSION >= 11040)
1947  sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) ==
1948  CUDA_SUCCESS);
1949 #else
1950  sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) ==
1951  CUDA_SUCCESS);
1952 #endif
1953  std::array<unsigned char, 16> name;
1954  std::copy(uuid.bytes, uuid.bytes + 16, name.begin());
1955  return getInfoArray(16, param_value_size, param_value, param_value_size_ret,
1956  name.data());
1957  }
1958 
1959  case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: {
1960  int major = 0;
1961  sycl::detail::pi::assertion(
1962  cuDeviceGetAttribute(&major,
1963  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1964  device->get()) == CUDA_SUCCESS);
1965 
1966  int minor = 0;
1967  sycl::detail::pi::assertion(
1968  cuDeviceGetAttribute(&minor,
1969  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
1970  device->get()) == CUDA_SUCCESS);
1971 
1972  // Some specific devices seem to need special handling. See reference
1973  // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu
1974  bool is_xavier_agx = major == 7 && minor == 2;
1975  bool is_orin_agx = major == 8 && minor == 7;
1976 
1977  int memory_clock_khz = 0;
1978  if (is_xavier_agx) {
1979  memory_clock_khz = 2133000;
1980  } else if (is_orin_agx) {
1981  memory_clock_khz = 3200000;
1982  } else {
1983  sycl::detail::pi::assertion(
1984  cuDeviceGetAttribute(&memory_clock_khz,
1985  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
1986  device->get()) == CUDA_SUCCESS);
1987  }
1988 
1989  int memory_bus_width = 0;
1990  if (is_orin_agx) {
1991  memory_bus_width = 256;
1992  } else {
1993  sycl::detail::pi::assertion(
1994  cuDeviceGetAttribute(&memory_bus_width,
1995  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
1996  device->get()) == CUDA_SUCCESS);
1997  }
1998 
1999  uint64_t memory_bandwidth =
2000  uint64_t(memory_clock_khz) * memory_bus_width * 250;
2001 
2002  return getInfo(param_value_size, param_value, param_value_size_ret,
2003  memory_bandwidth);
2004  }
2005 
2006  // TODO: Investigate if this information is available on CUDA.
2014  return PI_ERROR_INVALID_VALUE;
2015 
2016  default:
2017  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2018  }
2019  sycl::detail::pi::die("Device info request not implemented");
2020  return {};
2021 }
2022 
2029 pi_result cuda_piextDeviceGetNativeHandle(pi_device device,
2030  pi_native_handle *nativeHandle) {
2031  *nativeHandle = static_cast<pi_native_handle>(device->get());
2032  return PI_SUCCESS;
2033 }
2034 
2043 pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle,
2044  pi_platform platform,
2045  pi_device *piDevice) {
2046  assert(piDevice != nullptr);
2047 
2048  CUdevice cu_device = static_cast<CUdevice>(nativeHandle);
2049 
2050  auto is_device = [=](std::unique_ptr<_pi_device> &dev) {
2051  return dev->get() == cu_device;
2052  };
2053 
2054  // If a platform is provided just check if the device is in it
2055  if (platform) {
2056  auto search_res = std::find_if(begin(platform->devices_),
2057  end(platform->devices_), is_device);
2058  if (search_res != end(platform->devices_)) {
2059  *piDevice = (*search_res).get();
2060  return PI_SUCCESS;
2061  }
2062  }
2063 
2064  // Get list of platforms
2065  pi_uint32 num_platforms;
2066  pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms);
2067  if (result != PI_SUCCESS)
2068  return result;
2069 
2070  // Use a vector so the platform list is freed on every return path
2071  std::vector<pi_platform> plat(num_platforms);
2072  result = cuda_piPlatformsGet(num_platforms, plat.data(), nullptr);
2073  if (result != PI_SUCCESS)
2074  return result;
2075 
2076  // Iterate through platforms to find device that matches nativeHandle
2077  for (pi_uint32 j = 0; j < num_platforms; ++j) {
2078  auto search_res = std::find_if(begin(plat[j]->devices_),
2079  end(plat[j]->devices_), is_device);
2080  if (search_res != end(plat[j]->devices_)) {
2081  *piDevice = (*search_res).get();
2082  return PI_SUCCESS;
2083  }
2084  }
2085 
2086  // If the provided nativeHandle cannot be matched to an
2087  // existing device return error
2088  return PI_ERROR_INVALID_OPERATION;
2089 }
2090 
2091 /* Context APIs */
2092 
2111 pi_result cuda_piContextCreate(const pi_context_properties *properties,
2112  pi_uint32 num_devices, const pi_device *devices,
2113  void (*pfn_notify)(const char *errinfo,
2114  const void *private_info,
2115  size_t cb, void *user_data),
2116  void *user_data, pi_context *retcontext) {
2117 
2118  assert(devices != nullptr);
2119  // TODO: How to implement context callback?
2120  assert(pfn_notify == nullptr);
2121  assert(user_data == nullptr);
2122  assert(num_devices == 1);
2123  // Need input context
2124  assert(retcontext != nullptr);
2125  pi_result errcode_ret = PI_SUCCESS;
2126 
2127  std::unique_ptr<_pi_context> piContextPtr{nullptr};
2128  try {
2129  piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices});
2130  *retcontext = piContextPtr.release();
2131  } catch (pi_result err) {
2132  errcode_ret = err;
2133  } catch (...) {
2134  errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
2135  }
2136  return errcode_ret;
2137 }
2138 
2139 pi_result cuda_piContextRelease(pi_context ctxt) {
2140  assert(ctxt != nullptr);
2141 
2142  if (ctxt->decrement_reference_count() > 0) {
2143  return PI_SUCCESS;
2144  }
2145  ctxt->invoke_extended_deleters();
2146 
2147  std::unique_ptr<_pi_context> context{ctxt};
2148 
2149  return PI_SUCCESS;
2150 }
2151 
2158 pi_result cuda_piextContextGetNativeHandle(pi_context context,
2159  pi_native_handle *nativeHandle) {
2160  *nativeHandle = reinterpret_cast<pi_native_handle>(context->get());
2161  return PI_SUCCESS;
2162 }
2163 
2171 pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
2172  pi_uint32 num_devices,
2173  const pi_device *devices,
2174  bool ownNativeHandle,
2175  pi_context *piContext) {
2176  (void)nativeHandle;
2177  (void)num_devices;
2178  (void)devices;
2179  (void)ownNativeHandle;
2180  (void)piContext;
2181  assert(piContext != nullptr);
2182  assert(ownNativeHandle == false);
2183 
2184  return PI_ERROR_INVALID_OPERATION;
2185 }
2186 
2191 pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags,
2192  size_t size, void *host_ptr, pi_mem *ret_mem,
2193  const pi_mem_properties *properties) {
2194  // Need input memory object
2195  assert(ret_mem != nullptr);
2196  assert((properties == nullptr || *properties == 0) &&
2197  "no mem properties goes to cuda RT yet");
2198  // Currently, USE_HOST_PTR is not implemented using host register
2199  // since this triggers a weird segfault after the program ends.
2200  // Setting this constant to true enables testing that behavior.
2201  const bool enableUseHostPtr = false;
2202  const bool performInitialCopy =
2203  (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
2204  ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr);
2205  pi_result retErr = PI_SUCCESS;
2206  pi_mem retMemObj = nullptr;
2207 
2208  try {
2209  ScopedContext active(context);
2210  CUdeviceptr ptr;
2211  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
2212  _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
2213 
2214  if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
2215  retErr = PI_CHECK_ERROR(
2216  cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
2217  retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
2218  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr;
2219  } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
2220  retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size));
2221  retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
2222  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
2223  } else {
2224  retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size));
2225  if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
2226  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in;
2227  }
2228  }
2229 
2230  if (retErr == PI_SUCCESS) {
2231  pi_mem parentBuffer = nullptr;
2232 
2233  auto piMemObj = std::unique_ptr<_pi_mem>(
2234  new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size});
2235  if (piMemObj != nullptr) {
2236  retMemObj = piMemObj.release();
2237  if (performInitialCopy) {
2238  // Operates on the default stream of the current CUDA context.
2239  retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr, host_ptr, size));
2240  // Synchronize with default stream implicitly used by cuMemcpyHtoD
2241  // to make buffer data available on device before any other PI call
2242  // uses it.
2243  if (retErr == PI_SUCCESS) {
2244  CUstream defaultStream = 0;
2245  retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream));
2246  }
2247  }
2248  } else {
2249  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2250  }
2251  }
2252  } catch (pi_result err) {
2253  retErr = err;
2254  } catch (...) {
2255  retErr = PI_ERROR_OUT_OF_RESOURCES;
2256  }
2257 
2258  *ret_mem = retMemObj;
2259 
2260  return retErr;
2261 }
2262 
2267 pi_result cuda_piMemRelease(pi_mem memObj) {
2268  assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS");
2269 
2270  pi_result ret = PI_SUCCESS;
2271 
2272  try {
2273 
2274  // Do nothing if there are other references
2275  if (memObj->decrement_reference_count() > 0) {
2276  return PI_SUCCESS;
2277  }
2278 
2279  // make sure memObj is released in case PI_CHECK_ERROR throws
2280  std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2281 
2282  if (memObj->is_sub_buffer()) {
2283  return PI_SUCCESS;
2284  }
2285 
2286  ScopedContext active(uniqueMemObj->get_context());
2287 
2288  if (memObj->mem_type_ == _pi_mem::mem_type::buffer) {
2289  switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2290  case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in:
2291  case _pi_mem::mem_::buffer_mem_::alloc_mode::classic:
2292  ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_));
2293  break;
2294  case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr:
2295  ret = PI_CHECK_ERROR(
2296  cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2297  break;
2298  case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr:
2299  ret = PI_CHECK_ERROR(
2300  cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2301  };
2302  } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) {
2303  ret = PI_CHECK_ERROR(
2304  cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface()));
2305  ret = PI_CHECK_ERROR(
2306  cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array()));
2307  }
2308 
2309  } catch (pi_result err) {
2310  ret = err;
2311  } catch (...) {
2312  ret = PI_ERROR_OUT_OF_RESOURCES;
2313  }
2314 
2315  if (ret != PI_SUCCESS) {
2316  // A reported CUDA error is either an implementation or an asynchronous CUDA
2317  // error for which it is unclear if the function that reported it succeeded
2318  // or not. Either way, the state of the program is compromised and likely
2319  // unrecoverable.
2320  sycl::detail::pi::die(
2321  "Unrecoverable program state reached in cuda_piMemRelease");
2322  }
2323 
2324  return PI_SUCCESS;
2325 }
2326 
2331 pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,
2332  pi_buffer_create_type buffer_create_type,
2333  void *buffer_create_info, pi_mem *memObj) {
2334  assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT");
2335  assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS");
2336  assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT");
2337 
2338  // Default value for flags means PI_MEM_FLAGS_ACCESS_RW.
2339  if (flags == 0) {
2340  flags = PI_MEM_FLAGS_ACCESS_RW;
2341  }
2342 
2343  assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE");
2344  assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) &&
2345  "PI_ERROR_INVALID_VALUE");
2346  assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE");
2347  assert(memObj != nullptr);
2348 
2349  const auto bufferRegion =
2350  *reinterpret_cast<pi_buffer_region>(buffer_create_info);
2351  assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE");
2352 
2353  assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2354  "Overflow");
2355  assert(((bufferRegion.origin + bufferRegion.size) <=
2356  parent_buffer->mem_.buffer_mem_.get_size()) &&
2357  "PI_ERROR_INVALID_BUFFER_SIZE");
2358  // Retained indirectly due to retaining parent buffer below.
2359  pi_context context = parent_buffer->context_;
2360  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
2361  _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
2362 
2363  assert(parent_buffer->mem_.buffer_mem_.ptr_ !=
2364  _pi_mem::mem_::buffer_mem_::native_type{0});
2365  _pi_mem::mem_::buffer_mem_::native_type ptr =
2366  parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin;
2367 
2368  void *hostPtr = nullptr;
2369  if (parent_buffer->mem_.buffer_mem_.hostPtr_) {
2370  hostPtr = static_cast<char *>(parent_buffer->mem_.buffer_mem_.hostPtr_) +
2371  bufferRegion.origin;
2372  }
2373 
2374  std::unique_ptr<_pi_mem> retMemObj{nullptr};
2375  try {
2376  retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{
2377  context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2378  } catch (pi_result err) {
2379  *memObj = nullptr;
2380  return err;
2381  } catch (...) {
2382  *memObj = nullptr;
2383  return PI_ERROR_OUT_OF_HOST_MEMORY;
2384  }
2385 
2386  *memObj = retMemObj.release();
2387  return PI_SUCCESS;
2388 }
2389 
2390 pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *) {
2391  sycl::detail::pi::die("cuda_piMemGetInfo not implemented");
2392 }
2393 
2400 pi_result cuda_piextMemGetNativeHandle(pi_mem mem,
2401  pi_native_handle *nativeHandle) {
2402  *nativeHandle = static_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2403  return PI_SUCCESS;
2404 }
2405 
2417 pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle,
2418  pi_context context,
2419  bool ownNativeHandle,
2420  pi_mem *mem) {
2421  sycl::detail::pi::die(
2422  "Creation of PI mem from native handle not implemented");
2423  return {};
2424 }
2425 
2432 pi_result cuda_piQueueCreate(pi_context context, pi_device device,
2433  pi_queue_properties properties, pi_queue *queue) {
2434  try {
2435  std::unique_ptr<_pi_queue> queueImpl{nullptr};
2436 
2437  if (context->get_device() != device) {
2438  *queue = nullptr;
2439  return PI_ERROR_INVALID_DEVICE;
2440  }
2441 
2442  unsigned int flags = 0;
2443  if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) {
2444  flags = CU_STREAM_DEFAULT;
2445  } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) {
2446  flags = 0;
2447  } else {
2448  flags = CU_STREAM_NON_BLOCKING;
2449  }
2450 
2451  const bool is_out_of_order =
2452  properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
2453 
2454  std::vector<CUstream> computeCuStreams(
2455  is_out_of_order ? _pi_queue::default_num_compute_streams : 1);
2456  std::vector<CUstream> transferCuStreams(
2457  is_out_of_order ? _pi_queue::default_num_transfer_streams : 0);
2458 
2459  queueImpl = std::unique_ptr<_pi_queue>(
2460  new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams),
2461  context, device, properties, flags});
2462 
2463  *queue = queueImpl.release();
2464 
2465  return PI_SUCCESS;
2466  } catch (pi_result err) {
2467 
2468  return err;
2469 
2470  } catch (...) {
2471 
2472  return PI_ERROR_OUT_OF_RESOURCES;
2473  }
2474 }
2475 pi_result cuda_piextQueueCreate(pi_context Context, pi_device Device,
2476  pi_queue_properties *Properties,
2477  pi_queue *Queue) {
2478  assert(Properties);
2479  // Expect flags mask to be passed first.
2480  assert(Properties[0] == PI_QUEUE_FLAGS);
2481  if (Properties[0] != PI_QUEUE_FLAGS)
2482  return PI_ERROR_INVALID_VALUE;
2483  pi_queue_properties Flags = Properties[1];
2484  // Extra data isn't supported yet.
2485  assert(Properties[2] == 0);
2486  if (Properties[2] != 0)
2487  return PI_ERROR_INVALID_VALUE;
2488  return cuda_piQueueCreate(Context, Device, Flags, Queue);
2489 }
2490 
2491 pi_result cuda_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name,
2492  size_t param_value_size, void *param_value,
2493  size_t *param_value_size_ret) {
2494  assert(command_queue != nullptr);
2495 
2496  switch (param_name) {
2497  case PI_QUEUE_INFO_CONTEXT:
2498  return getInfo(param_value_size, param_value, param_value_size_ret,
2499  command_queue->context_);
2500  case PI_QUEUE_INFO_DEVICE:
2501  return getInfo(param_value_size, param_value, param_value_size_ret,
2502  command_queue->device_);
2503  case PI_QUEUE_INFO_REFERENCE_COUNT:
2504  return getInfo(param_value_size, param_value, param_value_size_ret,
2505  command_queue->get_reference_count());
2506  case PI_QUEUE_INFO_PROPERTIES:
2507  return getInfo(param_value_size, param_value, param_value_size_ret,
2508  command_queue->properties_);
2509  case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: {
2510  try {
2511  bool IsReady = command_queue->all_of([](CUstream s) -> bool {
2512  const CUresult ret = cuStreamQuery(s);
2513  if (ret == CUDA_SUCCESS)
2514  return true;
2515 
2516  if (ret == CUDA_ERROR_NOT_READY)
2517  return false;
2518 
2519  PI_CHECK_ERROR(ret);
2520  return false;
2521  });
2522  return getInfo(param_value_size, param_value, param_value_size_ret,
2523  IsReady);
2524  } catch (pi_result err) {
2525  return err;
2526  } catch (...) {
2527  return PI_ERROR_OUT_OF_RESOURCES;
2528  }
2529  }
2530  default:
2531  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2532  }
2533  sycl::detail::pi::die("Queue info request not implemented");
2534  return {};
2535 }
2536 
2537 pi_result cuda_piQueueRetain(pi_queue command_queue) {
2538  assert(command_queue != nullptr);
2539  assert(command_queue->get_reference_count() > 0);
2540 
2541  command_queue->increment_reference_count();
2542  return PI_SUCCESS;
2543 }
2544 
2545 pi_result cuda_piQueueRelease(pi_queue command_queue) {
2546  assert(command_queue != nullptr);
2547 
2548  if (command_queue->decrement_reference_count() > 0) {
2549  return PI_SUCCESS;
2550  }
2551 
2552  try {
2553  std::unique_ptr<_pi_queue> queueImpl(command_queue);
2554 
2555  if (!command_queue->backend_has_ownership())
2556  return PI_SUCCESS;
2557 
2558  ScopedContext active(command_queue->get_context());
2559 
2560  command_queue->for_each_stream([](CUstream s) {
2561  PI_CHECK_ERROR(cuStreamSynchronize(s));
2562  PI_CHECK_ERROR(cuStreamDestroy(s));
2563  });
2564 
2565  return PI_SUCCESS;
2566  } catch (pi_result err) {
2567  return err;
2568  } catch (...) {
2569  return PI_ERROR_OUT_OF_RESOURCES;
2570  }
2571 }
2572 
2573 pi_result cuda_piQueueFinish(pi_queue command_queue) {
2574  pi_result result = PI_SUCCESS;
2575 
2576  try {
2577 
2578  assert(command_queue !=
2579  nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code
2580  ScopedContext active(command_queue->get_context());
2581 
2582  command_queue->sync_streams</*ResetUsed=*/true>([&result](CUstream s) {
2583  result = PI_CHECK_ERROR(cuStreamSynchronize(s));
2584  });
2585 
2586  } catch (pi_result err) {
2587 
2588  result = err;
2589 
2590  } catch (...) {
2591 
2592  result = PI_ERROR_OUT_OF_RESOURCES;
2593  }
2594 
2595  return result;
2596 }
2597 
2598 // There is no CUDA counterpart for queue flushing, and we don't run into the
2599 // same problem of having to flush cross-queue dependencies as some of the
2600 // other plugins, so it can be left as a no-op.
2601 pi_result cuda_piQueueFlush(pi_queue command_queue) {
2602  (void)command_queue;
2603  return PI_SUCCESS;
2604 }
2605 
2612 pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
2613  pi_native_handle *nativeHandle) {
2614  ScopedContext active(queue->get_context());
2615  *nativeHandle =
2616  reinterpret_cast<pi_native_handle>(queue->get_next_compute_stream());
2617  return PI_SUCCESS;
2618 }
2619 
2630 pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle,
2631  pi_context context,
2632  pi_device device,
2633  bool ownNativeHandle,
2634  pi_queue *queue) {
2635  (void)device;
2636  (void)ownNativeHandle;
2637  assert(ownNativeHandle == false);
2638 
2639  unsigned int flags;
2640  CUstream cuStream = reinterpret_cast<CUstream>(nativeHandle);
2641 
2642  auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));
2643 
2644  pi_queue_properties properties = 0;
2645  if (flags == CU_STREAM_DEFAULT)
2646  properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
2647  else if (flags == CU_STREAM_NON_BLOCKING)
2648  properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT;
2649  else
2650  sycl::detail::pi::die("Unknown cuda stream");
2651 
2652  std::vector<CUstream> computeCuStreams(1, cuStream);
2653  std::vector<CUstream> transferCuStreams(0);
2654 
2655  // Create queue and set num_compute_streams to 1, as computeCuStreams has a
2656  // valid stream
2657  *queue = new _pi_queue{std::move(computeCuStreams),
2658  std::move(transferCuStreams),
2659  context,
2660  context->get_device(),
2661  properties,
2662  flags,
2663  /*backend_owns*/ false};
2664  (*queue)->num_compute_streams_ = 1;
2665 
2666  return retErr;
2667 }
2668 
2669 pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
2670  pi_bool blocking_write, size_t offset,
2671  size_t size, const void *ptr,
2672  pi_uint32 num_events_in_wait_list,
2673  const pi_event *event_wait_list,
2674  pi_event *event) {
2675 
2676  assert(buffer != nullptr);
2677  assert(command_queue != nullptr);
2678  pi_result retErr = PI_SUCCESS;
2679  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
2680  std::unique_ptr<_pi_event> retImplEv{nullptr};
2681 
2682  try {
2683  ScopedContext active(command_queue->get_context());
2684  CUstream cuStream = command_queue->get_next_transfer_stream();
2685 
2686  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2687  event_wait_list);
2688 
2689  if (event) {
2690  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2691  PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream));
2692  retImplEv->start();
2693  }
2694 
2695  retErr =
2696  PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream));
2697 
2698  if (event) {
2699  retErr = retImplEv->record();
2700  }
2701 
2702  if (blocking_write) {
2703  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2704  }
2705 
2706  if (event) {
2707  *event = retImplEv.release();
2708  }
2709  } catch (pi_result err) {
2710  retErr = err;
2711  }
2712  return retErr;
2713 }
2714 
2715 pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2716  pi_bool blocking_read, size_t offset,
2717  size_t size, void *ptr,
2718  pi_uint32 num_events_in_wait_list,
2719  const pi_event *event_wait_list,
2720  pi_event *event) {
2721 
2722  assert(buffer != nullptr);
2723  assert(command_queue != nullptr);
2724  pi_result retErr = PI_SUCCESS;
2725  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
2726  std::unique_ptr<_pi_event> retImplEv{nullptr};
2727 
2728  try {
2729  ScopedContext active(command_queue->get_context());
2730  CUstream cuStream = command_queue->get_next_transfer_stream();
2731 
2732  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2733  event_wait_list);
2734 
2735  if (event) {
2736  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2737  PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream));
2738  retImplEv->start();
2739  }
2740 
2741  retErr =
2742  PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream));
2743 
2744  if (event) {
2745  retErr = retImplEv->record();
2746  }
2747 
2748  if (blocking_read) {
2749  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2750  }
2751 
2752  if (event) {
2753  *event = retImplEv.release();
2754  }
2755 
2756  } catch (pi_result err) {
2757  retErr = err;
2758  }
2759  return retErr;
2760 }
2761 
2762 pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
2763 
2764  try {
2765  assert(num_events != 0);
2766  assert(event_list);
2767  if (num_events == 0) {
2768  return PI_ERROR_INVALID_VALUE;
2769  }
2770 
2771  if (!event_list) {
2772  return PI_ERROR_INVALID_EVENT;
2773  }
2774 
2775  auto context = event_list[0]->get_context();
2776  ScopedContext active(context);
2777 
2778  auto waitFunc = [context](pi_event event) -> pi_result {
2779  if (!event) {
2780  return PI_ERROR_INVALID_EVENT;
2781  }
2782 
2783  if (event->get_context() != context) {
2784  return PI_ERROR_INVALID_CONTEXT;
2785  }
2786 
2787  return event->wait();
2788  };
2789  return forLatestEvents(event_list, num_events, waitFunc);
2790  } catch (pi_result err) {
2791  return err;
2792  } catch (...) {
2793  return PI_ERROR_OUT_OF_RESOURCES;
2794  }
2795 }
2796 
2797 pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name,
2798  pi_kernel *kernel) {
2799  assert(kernel != nullptr);
2800  assert(program != nullptr);
2801 
2802  pi_result retErr = PI_SUCCESS;
2803  std::unique_ptr<_pi_kernel> retKernel{nullptr};
2804 
2805  try {
2806  ScopedContext active(program->get_context());
2807 
2808  CUfunction cuFunc;
2809  retErr = PI_CHECK_ERROR(
2810  cuModuleGetFunction(&cuFunc, program->get(), kernel_name));
2811 
2812  std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset";
2813  CUfunction cuFuncWithOffsetParam;
2814  CUresult offsetRes = cuModuleGetFunction(
2815  &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str());
2816 
2817  // If there is no kernel with a global offset parameter, we mark it as missing
2818  if (offsetRes == CUDA_ERROR_NOT_FOUND) {
2819  cuFuncWithOffsetParam = nullptr;
2820  } else {
2821  retErr = PI_CHECK_ERROR(offsetRes);
2822  }
2823 
2824  retKernel = std::unique_ptr<_pi_kernel>(
2825  new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program,
2826  program->get_context()});
2827  } catch (pi_result err) {
2828  retErr = err;
2829  } catch (...) {
2830  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2831  }
2832 
2833  *kernel = retKernel.release();
2834  return retErr;
2835 }
2836 
2837 pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index,
2838  size_t arg_size, const void *arg_value) {
2839 
2840  assert(kernel != nullptr);
2841  pi_result retErr = PI_SUCCESS;
2842  try {
2843  if (arg_value) {
2844  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2845  } else {
2846  kernel->set_kernel_local_arg(arg_index, arg_size);
2847  }
2848  } catch (pi_result err) {
2849  retErr = err;
2850  }
2851  return retErr;
2852 }
2853 
2854 pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,
2855  const pi_mem *arg_value) {
2856 
2857  assert(kernel != nullptr);
2858  assert(arg_value != nullptr);
2859 
2860  pi_result retErr = PI_SUCCESS;
2861  try {
2862  pi_mem arg_mem = *arg_value;
2863  if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) {
2864  CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
2865  PI_CHECK_ERROR(cuArray3DGetDescriptor(
2866  &arrayDesc, arg_mem->mem_.surface_mem_.get_array()));
2867  if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
2868  arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
2869  arrayDesc.Format != CU_AD_FORMAT_HALF &&
2870  arrayDesc.Format != CU_AD_FORMAT_FLOAT) {
2871  setErrorMessage("PI CUDA kernels only support images with channel "
2872  "types int32, uint32, float, and half.",
2873  PI_ERROR_PLUGIN_SPECIFIC_ERROR);
2874  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
2875  }
2876  CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface();
2877  kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf);
2878  } else {
2879  CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get();
2880  kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr);
2881  }
2882  } catch (pi_result err) {
2883  retErr = err;
2884  }
2885  return retErr;
2886 }
2887 
2888 pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index,
2889  const pi_sampler *arg_value) {
2890 
2891  assert(kernel != nullptr);
2892  assert(arg_value != nullptr);
2893 
2894  pi_result retErr = PI_SUCCESS;
2895  try {
2896  pi_uint32 samplerProps = (*arg_value)->props_;
2897  kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps);
2898  } catch (pi_result err) {
2899  retErr = err;
2900  }
2901  return retErr;
2902 }
2903 
2904 pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
2905  pi_kernel_group_info param_name,
2906  size_t param_value_size, void *param_value,
2907  size_t *param_value_size_ret) {
2908 
2909  // Here we query the properties of the kernel's CUDA blocks.
2910 
2911  if (kernel != nullptr) {
2912 
2913  switch (param_name) {
2914  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
2915  int max_threads = 0;
2916  sycl::detail::pi::assertion(
2917  cuFuncGetAttribute(&max_threads,
2918  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
2919  kernel->get()) == CUDA_SUCCESS);
2920  return getInfo(param_value_size, param_value, param_value_size_ret,
2921  size_t(max_threads));
2922  }
2923  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
2924  size_t group_size[3] = {0, 0, 0};
2925  const auto &reqd_wg_size_md_map =
2926  kernel->program_->kernelReqdWorkGroupSizeMD_;
2927  const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
2928  if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
2929  const auto reqd_wg_size = reqd_wg_size_md->second;
2930  group_size[0] = std::get<0>(reqd_wg_size);
2931  group_size[1] = std::get<1>(reqd_wg_size);
2932  group_size[2] = std::get<2>(reqd_wg_size);
2933  }
2934  return getInfoArray(3, param_value_size, param_value,
2935  param_value_size_ret, group_size);
2936  }
2937  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
2938  // OpenCL LOCAL == CUDA SHARED
2939  int bytes = 0;
2940  sycl::detail::pi::assertion(
2941  cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
2942  kernel->get()) == CUDA_SUCCESS);
2943  return getInfo(param_value_size, param_value, param_value_size_ret,
2944  pi_uint64(bytes));
2945  }
2946  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
2947  // Work groups should be multiples of the warp size
2948  int warpSize = 0;
2949  sycl::detail::pi::assertion(
2950  cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
2951  device->get()) == CUDA_SUCCESS);
2952  return getInfo(param_value_size, param_value, param_value_size_ret,
2953  static_cast<size_t>(warpSize));
2954  }
2955  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
2956  // OpenCL PRIVATE == CUDA LOCAL
2957  int bytes = 0;
2958  sycl::detail::pi::assertion(
2959  cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
2960  kernel->get()) == CUDA_SUCCESS);
2961  return getInfo(param_value_size, param_value, param_value_size_ret,
2962  pi_uint64(bytes));
2963  }
2964  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
2965  int numRegs = 0;
2966  sycl::detail::pi::assertion(
2967  cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
2968  kernel->get()) == CUDA_SUCCESS);
2969  return getInfo(param_value_size, param_value, param_value_size_ret,
2970  pi_uint32(numRegs));
2971  }
2972  default:
2973  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2974  }
2975  }
2976 
2977  return PI_ERROR_INVALID_KERNEL;
2978 }
2979 
2980 pi_result cuda_piEnqueueKernelLaunch(
2981  pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim,
2982  const size_t *global_work_offset, const size_t *global_work_size,
2983  const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
2984  const pi_event *event_wait_list, pi_event *event) {
2985 
2986  // Preconditions
2987  assert(command_queue != nullptr);
2988  assert(command_queue->get_context() == kernel->get_context());
2989  assert(kernel != nullptr);
2990  assert(global_work_offset != nullptr);
2991  assert(work_dim > 0);
2992  assert(work_dim < 4);
2993 
2994  if (*global_work_size == 0) {
2995  return cuda_piEnqueueEventsWaitWithBarrier(
2996  command_queue, num_events_in_wait_list, event_wait_list, event);
2997  }
2998 
2999  // Set the number of threads per block to the number of threads per warp
3000  // by default unless the user has provided a better number
3001  size_t threadsPerBlock[3] = {32u, 1u, 1u};
3002  size_t maxWorkGroupSize = 0u;
3003  size_t maxThreadsPerBlock[3] = {};
3004  bool providedLocalWorkGroupSize = (local_work_size != nullptr);
3005  pi_uint32 local_size = kernel->get_local_size();
3006  pi_result retError = PI_SUCCESS;
3007 
3008  try {
3009  // Set the active context here as guessLocalWorkSize needs an active context
3010  ScopedContext active(command_queue->get_context());
3011  {
3012  size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_;
3013  maxWorkGroupSize = command_queue->device_->get_max_work_group_size();
3014  command_queue->device_->get_max_work_item_sizes(
3015  sizeof(maxThreadsPerBlock), maxThreadsPerBlock);
3016 
3017  if (providedLocalWorkGroupSize) {
3018  auto isValid = [&](int dim) {
3019  if (reqdThreadsPerBlock[dim] != 0 &&
3020  local_work_size[dim] != reqdThreadsPerBlock[dim])
3021  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3022 
3023  if (local_work_size[dim] > maxThreadsPerBlock[dim])
3024  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3025  // Check that the local work size evenly divides the global work size;
3026  // this also rules out local work sizes that are zero or larger than
3027  // the global work size.
3028  if (0u == local_work_size[dim])
3029  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3030  if (0u != (global_work_size[dim] % local_work_size[dim]))
3031  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3032  threadsPerBlock[dim] = local_work_size[dim];
3033  return PI_SUCCESS;
3034  };
3035 
3036  for (size_t dim = 0; dim < work_dim; dim++) {
3037  auto err = isValid(dim);
3038  if (err != PI_SUCCESS)
3039  return err;
3040  }
3041  } else {
3042  guessLocalWorkSize(command_queue->device_, threadsPerBlock,
3043  global_work_size, maxThreadsPerBlock, kernel,
3044  local_size);
3045  }
3046  }
3047 
3048  if (maxWorkGroupSize <
3049  size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
3050  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
3051  }
3052 
3053  size_t blocksPerGrid[3] = {1u, 1u, 1u};
3054 
3055  for (size_t i = 0; i < work_dim; i++) {
3056  blocksPerGrid[i] =
3057  (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
3058  }
3059 
3060  std::unique_ptr<_pi_event> retImplEv{nullptr};
3061 
3062  pi_uint32 stream_token;
3063  _pi_stream_guard guard;
3064  CUstream cuStream = command_queue->get_next_compute_stream(
3065  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3066  CUfunction cuFunc = kernel->get();
3067 
3068  retError = enqueueEventsWait(command_queue, cuStream,
3069  num_events_in_wait_list, event_wait_list);
3070 
3071  // Set the implicit global offset parameter if kernel has offset variant
3072  if (kernel->get_with_offset_parameter()) {
3073  std::uint32_t cuda_implicit_offset[3] = {0, 0, 0};
3074  if (global_work_offset) {
3075  for (size_t i = 0; i < work_dim; i++) {
3076  cuda_implicit_offset[i] =
3077  static_cast<std::uint32_t>(global_work_offset[i]);
3078  if (global_work_offset[i] != 0) {
3079  cuFunc = kernel->get_with_offset_parameter();
3080  }
3081  }
3082  }
3083  kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset),
3084  cuda_implicit_offset);
3085  }
3086 
3087  auto &argIndices = kernel->get_arg_indices();
3088 
3089  if (event) {
3090  retImplEv = std::unique_ptr<_pi_event>(
3091  _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue,
3092  cuStream, stream_token));
3093  retImplEv->start();
3094  }
3095 
3096  // Set local mem max size if env var is present
3097  static const char *local_mem_sz_ptr =
3098  std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE");
3099 
3100  if (local_mem_sz_ptr) {
3101  int device_max_local_mem = 0;
3102  cuDeviceGetAttribute(
3103  &device_max_local_mem,
3104  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
3105  command_queue->get_device()->get());
3106 
3107  static const int env_val = std::atoi(local_mem_sz_ptr);
3108  if (env_val <= 0 || env_val > device_max_local_mem) {
3109  setErrorMessage("Invalid value specified for "
3110  "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
3111  PI_ERROR_PLUGIN_SPECIFIC_ERROR);
3112  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3113  }
3114  PI_CHECK_ERROR(cuFuncSetAttribute(
3115  cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val));
3116  }
3117 
3118  retError = PI_CHECK_ERROR(cuLaunchKernel(
3119  cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
3120  threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size,
3121  cuStream, const_cast<void **>(argIndices.data()), nullptr));
3122  if (local_size != 0)
3123  kernel->clear_local_size();
3124 
3125  if (event) {
3126  retError = retImplEv->record();
3127  *event = retImplEv.release();
3128  }
3129  } catch (pi_result err) {
3130  retError = err;
3131  }
3132  return retError;
3133 }
3134 
3136 pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t,
3137  pi_uint32, const pi_mem *, const void **,
3138  pi_uint32, const pi_event *, pi_event *) {
3139  sycl::detail::pi::die("Not implemented in CUDA backend");
3140  return {};
3141 }
3142 
3143 pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context,
3144  pi_program, bool,
3145  pi_kernel *) {
3146  sycl::detail::pi::die("Unsupported operation");
3147  return PI_SUCCESS;
3148 }
3149 
3151 pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags,
3152  const pi_image_format *image_format,
3153  const pi_image_desc *image_desc, void *host_ptr,
3154  pi_mem *ret_mem) {
3155  // Need input memory object
3156  assert(ret_mem != nullptr);
3157  const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
3158  ((flags & PI_MEM_FLAGS_HOST_PTR_USE));
3159  pi_result retErr = PI_SUCCESS;
3160 
3161  // We only support RGBA channel order
3162  // TODO: check SYCL CTS and spec. May also have to support BGRA
3163  if (image_format->image_channel_order !=
3164  pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) {
3165  sycl::detail::pi::die(
3166  "cuda_piMemImageCreate only supports RGBA channel order");
3167  }
3168 
3169  // We have to use cuArray3DCreate, which has some caveats. The height and
3170  // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc
3171  // gives a minimum value of 1, so we need to convert the answer.
3172  CUDA_ARRAY3D_DESCRIPTOR array_desc;
3173  array_desc.NumChannels = 4; // Only support 4 channel image
3174  array_desc.Flags = 0; // No flags required
3175  array_desc.Width = image_desc->image_width;
3176  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3177  array_desc.Height = 0;
3178  array_desc.Depth = 0;
3179  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3180  array_desc.Height = image_desc->image_height;
3181  array_desc.Depth = 0;
3182  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3183  array_desc.Height = image_desc->image_height;
3184  array_desc.Depth = image_desc->image_depth;
3185  }
3186 
3187  // We need to get this now in bytes for calculating the total image size later
3188  size_t pixel_type_size_bytes;
3189 
3190  switch (image_format->image_channel_data_type) {
3191  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8:
3192  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
3193  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
3194  pixel_type_size_bytes = 1;
3195  break;
3196  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
3197  array_desc.Format = CU_AD_FORMAT_SIGNED_INT8;
3198  pixel_type_size_bytes = 1;
3199  break;
3200  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16:
3201  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
3202  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
3203  pixel_type_size_bytes = 2;
3204  break;
3205  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
3206  array_desc.Format = CU_AD_FORMAT_SIGNED_INT16;
3207  pixel_type_size_bytes = 2;
3208  break;
3209  case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
3210  array_desc.Format = CU_AD_FORMAT_HALF;
3211  pixel_type_size_bytes = 2;
3212  break;
3213  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
3214  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
3215  pixel_type_size_bytes = 4;
3216  break;
3217  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
3218  array_desc.Format = CU_AD_FORMAT_SIGNED_INT32;
3219  pixel_type_size_bytes = 4;
3220  break;
3221  case PI_IMAGE_CHANNEL_TYPE_FLOAT:
3222  array_desc.Format = CU_AD_FORMAT_FLOAT;
3223  pixel_type_size_bytes = 4;
3224  break;
3225  default:
3226  sycl::detail::pi::die(
3227  "cuda_piMemImageCreate given unsupported image_channel_data_type");
3228  }
3229 
3230  // When a dimension isn't used, image_desc has the size set to 1
3231  size_t pixel_size_bytes =
3232  pixel_type_size_bytes * 4; // 4 is the only number of channels we support
3233  size_t image_size_bytes = pixel_size_bytes * image_desc->image_width *
3234  image_desc->image_height * image_desc->image_depth;
3235 
3236  ScopedContext active(context);
3237  CUarray image_array;
3238  retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc));
3239 
3240  try {
3241  if (performInitialCopy) {
3242  // We have to use a different copy function for each image dimensionality
3243  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3244  retErr = PI_CHECK_ERROR(
3245  cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes));
3246  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3247  CUDA_MEMCPY2D cpy_desc;
3248  memset(&cpy_desc, 0, sizeof(cpy_desc));
3249  cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3250  cpy_desc.srcHost = host_ptr;
3251  cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3252  cpy_desc.dstArray = image_array;
3253  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3254  cpy_desc.Height = image_desc->image_height;
3255  retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc));
3256  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3257  CUDA_MEMCPY3D cpy_desc;
3258  memset(&cpy_desc, 0, sizeof(cpy_desc));
3259  cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3260  cpy_desc.srcHost = host_ptr;
3261  cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3262  cpy_desc.dstArray = image_array;
3263  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3264  cpy_desc.Height = image_desc->image_height;
3265  cpy_desc.Depth = image_desc->image_depth;
3266  retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc));
3267  }
3268  }
3269 
3270  // CUDA_RESOURCE_DESC is a union of different structs, shown here
3271  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
3272  // We need to fill it as described here to use it for a surface or texture
3273  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
3274  // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
3275  // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array
3276  // handle.
3277  // CUDA_RESOURCE_DESC::flags must be set to zero
3278 
3279  CUDA_RESOURCE_DESC image_res_desc;
3280  image_res_desc.res.array.hArray = image_array;
3281  image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
3282  image_res_desc.flags = 0;
3283 
3284  CUsurfObject surface;
3285  retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
3286 
3287  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
3288  context, image_array, surface, image_desc->image_type, host_ptr});
3289 
3290  if (piMemObj == nullptr) {
3291  return PI_ERROR_OUT_OF_HOST_MEMORY;
3292  }
3293 
3294  *ret_mem = piMemObj.release();
3295  } catch (pi_result err) {
3296  cuArrayDestroy(image_array);
3297  return err;
3298  } catch (...) {
3299  cuArrayDestroy(image_array);
3300  return PI_ERROR_UNKNOWN;
3301  }
3302 
3303  return retErr;
3304 }
3305 
3307 pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *,
3308  size_t *) {
3309  sycl::detail::pi::die("cuda_piMemImageGetInfo not implemented");
3310  return {};
3311 }
3312 
3313 pi_result cuda_piMemRetain(pi_mem mem) {
3314  assert(mem != nullptr);
3315  assert(mem->get_reference_count() > 0);
3316  mem->increment_reference_count();
3317  return PI_SUCCESS;
3318 }
3319 
3323 pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **,
3324  const size_t *, pi_program *) {
3325  sycl::detail::pi::cuPrint("cuda_piclProgramCreateWithSource not implemented");
3326  return PI_ERROR_INVALID_OPERATION;
3327 }
3328 
3333 pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices,
3334  const pi_device *device_list, const char *options,
3335  void (*pfn_notify)(pi_program program,
3336  void *user_data),
3337  void *user_data) {
3338 
3339  assert(program != nullptr);
3340  assert(num_devices == 1 || num_devices == 0);
3341  assert(device_list != nullptr || num_devices == 0);
3342  assert(pfn_notify == nullptr);
3343  assert(user_data == nullptr);
3344  pi_result retError = PI_SUCCESS;
3345 
3346  try {
3347  ScopedContext active(program->get_context());
3348 
3349  program->build_program(options);
3350 
3351  } catch (pi_result err) {
3352  retError = err;
3353  }
3354  return retError;
3355 }
3356 
3357 pi_result cuda_piProgramCreate(pi_context, const void *, size_t,
3358  pi_program *) {
3359  sycl::detail::pi::die("cuda_piProgramCreate not implemented");
3360  return {};
3361 }
3362 
3369 pi_result cuda_piProgramCreateWithBinary(
3370  pi_context context, pi_uint32 num_devices, const pi_device *device_list,
3371  const size_t *lengths, const unsigned char **binaries,
3372  size_t num_metadata_entries, const pi_device_binary_property *metadata,
3373  pi_int32 *binary_status, pi_program *program) {
3374  // Ignore unused parameter
3375  (void)binary_status;
3376 
3377  assert(context != nullptr);
3378  assert(binaries != nullptr);
3379  assert(program != nullptr);
3380  assert(device_list != nullptr);
3381  assert(num_devices == 1 && "CUDA contexts are for a single device");
3382  assert((context->get_device()->get() == device_list[0]->get()) &&
3383  "Mismatch between devices context and passed context when creating "
3384  "program from binary");
3385 
3386  pi_result retError = PI_SUCCESS;
3387 
3388  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3389 
3390  retProgram->set_metadata(metadata, num_metadata_entries);
3391 
3392  const bool has_length = (lengths != nullptr);
3393  size_t length = has_length
3394  ? lengths[0]
3395  : strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
3396 
3397  assert(length != 0);
3398 
3399  retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
3400 
3401  *program = retProgram.release();
3402 
3403  return retError;
3404 }
3405 
3406 pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
3407  size_t param_value_size, void *param_value,
3408  size_t *param_value_size_ret) {
3409  assert(program != nullptr);
3410 
3411  switch (param_name) {
3412  case PI_PROGRAM_INFO_REFERENCE_COUNT:
3413  return getInfo(param_value_size, param_value, param_value_size_ret,
3414  program->get_reference_count());
3415  case PI_PROGRAM_INFO_CONTEXT:
3416  return getInfo(param_value_size, param_value, param_value_size_ret,
3417  program->context_);
3418  case PI_PROGRAM_INFO_NUM_DEVICES:
3419  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3420  case PI_PROGRAM_INFO_DEVICES:
3421  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3422  &program->context_->deviceId_);
3423  case PI_PROGRAM_INFO_SOURCE:
3424  return getInfo(param_value_size, param_value, param_value_size_ret,
3425  program->binary_);
3426  case PI_PROGRAM_INFO_BINARY_SIZES:
3427  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3428  &program->binarySizeInBytes_);
3429  case PI_PROGRAM_INFO_BINARIES:
3430  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3431  &program->binary_);
3432  case PI_PROGRAM_INFO_KERNEL_NAMES: {
3433  return getInfo(param_value_size, param_value, param_value_size_ret,
3434  getKernelNames(program).c_str());
3435  }
3436  default:
3437  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3438  }
3439  sycl::detail::pi::die("Program info request not implemented");
3440  return {};
3441 }
3442 
3447 pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
3448  const pi_device *device_list, const char *options,
3449  pi_uint32 num_input_programs,
3450  const pi_program *input_programs,
3451  void (*pfn_notify)(pi_program program,
3452  void *user_data),
3453  void *user_data, pi_program *ret_program) {
3454 
3455  assert(ret_program != nullptr);
3456  assert(num_devices == 1 || num_devices == 0);
3457  assert(device_list != nullptr || num_devices == 0);
3458  assert(pfn_notify == nullptr);
3459  assert(user_data == nullptr);
3460  pi_result retError = PI_SUCCESS;
3461 
3462  try {
3463  ScopedContext active(context);
3464 
3465  CUlinkState state;
3466  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3467 
3468  retError = PI_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state));
3469  try {
3470  for (size_t i = 0; i < num_input_programs; ++i) {
3471  pi_program program = input_programs[i];
3472  retError = PI_CHECK_ERROR(cuLinkAddData(
3473  state, CU_JIT_INPUT_PTX, const_cast<char *>(program->binary_),
3474  program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr));
3475  }
3476  void *cubin = nullptr;
3477  size_t cubinSize = 0;
3478  retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));
3479 
3480  retError =
3481  retProgram->set_binary(static_cast<const char *>(cubin), cubinSize);
3482 
3483  if (retError != PI_SUCCESS) {
3484  return retError;
3485  }
3486 
3487  retError = retProgram->build_program(options);
3488 
3489  if (retError != PI_SUCCESS) {
3490  return retError;
3491  }
3492  } catch (...) {
3493  // Upon error attempt cleanup
3494  PI_CHECK_ERROR(cuLinkDestroy(state));
3495  throw;
3496  }
3497 
3498  retError = PI_CHECK_ERROR(cuLinkDestroy(state));
3499  *ret_program = retProgram.release();
3500 
3501  } catch (pi_result err) {
3502  retError = err;
3503  }
3504  return retError;
3505 }
3506 
3511 pi_result cuda_piProgramCompile(
3512  pi_program program, pi_uint32 num_devices, const pi_device *device_list,
3513  const char *options, pi_uint32 num_input_headers,
3514  const pi_program *input_headers, const char **header_include_names,
3515  void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
3516  // Ignore unused parameters
3517  (void)header_include_names;
3518  (void)input_headers;
3519 
3520  assert(program != nullptr);
3521  assert(num_devices == 1 || num_devices == 0);
3522  assert(device_list != nullptr || num_devices == 0);
3523  assert(pfn_notify == nullptr);
3524  assert(user_data == nullptr);
3525  assert(num_input_headers == 0);
3526  pi_result retError = PI_SUCCESS;
3527 
3528  try {
3529  ScopedContext active(program->get_context());
3530 
3531  program->build_program(options);
3532 
3533  } catch (pi_result err) {
3534  retError = err;
3535  }
3536  return retError;
3537 }
3538 
3539 pi_result cuda_piProgramGetBuildInfo(pi_program program, pi_device device,
3540  pi_program_build_info param_name,
3541  size_t param_value_size, void *param_value,
3542  size_t *param_value_size_ret) {
3543  // Ignore unused parameter
3544  (void)device;
3545 
3546  assert(program != nullptr);
3547 
3548  switch (param_name) {
3549  case PI_PROGRAM_BUILD_INFO_STATUS: {
3550  return getInfo(param_value_size, param_value, param_value_size_ret,
3551  program->buildStatus_);
3552  }
3553  case PI_PROGRAM_BUILD_INFO_OPTIONS:
3554  return getInfo(param_value_size, param_value, param_value_size_ret,
3555  program->buildOptions_.c_str());
3556  case PI_PROGRAM_BUILD_INFO_LOG:
3557  return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value,
3558  param_value_size_ret, program->infoLog_);
3559  default:
3560  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3561  }
3562  sycl::detail::pi::die("Program Build info request not implemented");
3563  return {};
3564 }
3565 
3566 pi_result cuda_piProgramRetain(pi_program program) {
3567  assert(program != nullptr);
3568  assert(program->get_reference_count() > 0);
3569  program->increment_reference_count();
3570  return PI_SUCCESS;
3571 }
3572 
3576 pi_result cuda_piProgramRelease(pi_program program) {
3577  assert(program != nullptr);
3578 
3579  // Double delete, or someone is messing with the ref count;
3580  // either way, we cannot safely proceed.
3581  assert(program->get_reference_count() != 0 &&
3582  "Reference count overflow detected in cuda_piProgramRelease.");
3583 
3584  // decrement ref count. If it is 0, delete the program.
3585  if (program->decrement_reference_count() == 0) {
3586 
3587  std::unique_ptr<_pi_program> program_ptr{program};
3588 
3589  pi_result result = PI_ERROR_INVALID_PROGRAM;
3590 
3591  try {
3592  ScopedContext active(program->get_context());
3593  auto cuModule = program->get();
3594  result = PI_CHECK_ERROR(cuModuleUnload(cuModule));
3595  } catch (...) {
3596  result = PI_ERROR_OUT_OF_RESOURCES;
3597  }
3598 
3599  return result;
3600  }
3601 
3602  return PI_SUCCESS;
3603 }
3604 
3611 pi_result cuda_piextProgramGetNativeHandle(pi_program program,
3612  pi_native_handle *nativeHandle) {
3613  *nativeHandle = reinterpret_cast<pi_native_handle>(program->get());
3614  return PI_SUCCESS;
3615 }
3616 
3626 pi_result cuda_piextProgramCreateWithNativeHandle(pi_native_handle, pi_context,
3627  bool, pi_program *) {
3628  sycl::detail::pi::die(
3629  "Creation of PI program from native handle not implemented");
3630  return {};
3631 }
3632 
3633 pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name,
3634  size_t param_value_size, void *param_value,
3635  size_t *param_value_size_ret) {
3636 
3637  if (kernel != nullptr) {
3638 
3639  switch (param_name) {
3640  case PI_KERNEL_INFO_FUNCTION_NAME:
3641  return getInfo(param_value_size, param_value, param_value_size_ret,
3642  kernel->get_name());
3643  case PI_KERNEL_INFO_NUM_ARGS:
3644  return getInfo(param_value_size, param_value, param_value_size_ret,
3645  kernel->get_num_args());
3646  case PI_KERNEL_INFO_REFERENCE_COUNT:
3647  return getInfo(param_value_size, param_value, param_value_size_ret,
3648  kernel->get_reference_count());
3649  case PI_KERNEL_INFO_CONTEXT: {
3650  return getInfo(param_value_size, param_value, param_value_size_ret,
3651  kernel->get_context());
3652  }
3653  case PI_KERNEL_INFO_PROGRAM: {
3654  return getInfo(param_value_size, param_value, param_value_size_ret,
3655  kernel->get_program());
3656  }
3657  case PI_KERNEL_INFO_ATTRIBUTES: {
3658  return getInfo(param_value_size, param_value, param_value_size_ret, "");
3659  }
3660  default: {
3661  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3662  }
3663  }
3664  }
3665 
3666  return PI_ERROR_INVALID_KERNEL;
3667 }
3668 
3669 pi_result cuda_piKernelGetSubGroupInfo(
3670  pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
3671  size_t input_value_size, const void *input_value, size_t param_value_size,
3672  void *param_value, size_t *param_value_size_ret) {
3673  // Ignore unused parameters
3674  (void)input_value_size;
3675  (void)input_value;
3676 
3677  if (kernel != nullptr) {
3678  switch (param_name) {
3679  case PI_KERNEL_MAX_SUB_GROUP_SIZE: {
3680  // Sub-group size is equivalent to warp size
3681  int warpSize = 0;
3682  sycl::detail::pi::assertion(
3683  cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
3684  device->get()) == CUDA_SUCCESS);
3685  return getInfo(param_value_size, param_value, param_value_size_ret,
3686  static_cast<uint32_t>(warpSize));
3687  }
3688  case PI_KERNEL_MAX_NUM_SUB_GROUPS: {
3689  // Number of sub-groups = max block size / warp size + possible remainder
3690  int max_threads = 0;
3691  sycl::detail::pi::assertion(
3692  cuFuncGetAttribute(&max_threads,
3693  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3694  kernel->get()) == CUDA_SUCCESS);
3695  int warpSize = 0;
3696  cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
3697  0, nullptr, sizeof(uint32_t), &warpSize,
3698  nullptr);
3699  int maxWarps = (max_threads + warpSize - 1) / warpSize;
3700  return getInfo(param_value_size, param_value, param_value_size_ret,
3701  static_cast<uint32_t>(maxWarps));
3702  }
3703  case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: {
3704  // Return value of 0 => not specified
3705  // TODO: Revisit if PTX is generated for compile-time work-group sizes
3706  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3707  }
3708  case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
3709  // Return value of 0 => unspecified or "auto" sub-group size
3710  // Correct for now, since warp size may be read from special register
3711  // TODO: Return warp size once default is primary sub-group size
3712  // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
3713  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3714  }
3715  default:
3716  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3717  }
3718  }
3719  return PI_ERROR_INVALID_KERNEL;
3720 }
3721 
3722 pi_result cuda_piKernelRetain(pi_kernel kernel) {
3723  assert(kernel != nullptr);
3724  assert(kernel->get_reference_count() > 0u);
3725 
3726  kernel->increment_reference_count();
3727  return PI_SUCCESS;
3728 }
3729 
3730 pi_result cuda_piKernelRelease(pi_kernel kernel) {
3731  assert(kernel != nullptr);
3732 
3733  // Double delete, or someone is messing with the ref count;
3734  // either way, we cannot safely proceed.
3735  assert(kernel->get_reference_count() != 0 &&
3736  "Reference count overflow detected in cuda_piKernelRelease.");
3737 
3738  // decrement ref count. If it is 0, delete the kernel.
3739  if (kernel->decrement_reference_count() == 0) {
3740  // no internal cuda resources to clean up. Just delete it.
3741  delete kernel;
3742  return PI_SUCCESS;
3743  }
3744 
3745  return PI_SUCCESS;
3746 }
3747 
3748 // A NOP for the CUDA backend
3749 pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t,
3750  const void *) {
3751  return PI_SUCCESS;
3752 }
3753 
3754 pi_result cuda_piextProgramSetSpecializationConstant(pi_program, pi_uint32,
3755  size_t, const void *) {
3756  // This entry point is only used for native specialization constants (SPIR-V),
3757  // and the CUDA plugin is AOT only so this entry point is not supported.
3758  sycl::detail::pi::die("Native specialization constants are not supported");
3759  return {};
3760 }
3761 
3762 pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index,
3763  size_t arg_size,
3764  const void *arg_value) {
3765  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3766  return PI_SUCCESS;
3767 }
3768 
3769 //
3770 // Events
3771 //
3772 pi_result cuda_piEventCreate(pi_context, pi_event *) {
3773  sycl::detail::pi::die("PI Event Create not implemented in CUDA backend");
3774 }
3775 
3776 pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
3777  size_t param_value_size, void *param_value,
3778  size_t *param_value_size_ret) {
3779  assert(event != nullptr);
3780 
3781  switch (param_name) {
3782  case PI_EVENT_INFO_COMMAND_QUEUE:
3783  return getInfo(param_value_size, param_value, param_value_size_ret,
3784  event->get_queue());
3785  case PI_EVENT_INFO_COMMAND_TYPE:
3786  return getInfo(param_value_size, param_value, param_value_size_ret,
3787  event->get_command_type());
3788  case PI_EVENT_INFO_REFERENCE_COUNT:
3789  return getInfo(param_value_size, param_value, param_value_size_ret,
3790  event->get_reference_count());
3791  case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
3792  return getInfo(param_value_size, param_value, param_value_size_ret,
3793  static_cast<pi_event_status>(event->get_execution_status()));
3794  }
3795  case PI_EVENT_INFO_CONTEXT:
3796  return getInfo(param_value_size, param_value, param_value_size_ret,
3797  event->get_context());
3798  default:
3799  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3800  }
3801 
3802  return PI_ERROR_INVALID_EVENT;
3803 }
3804 
3807 pi_result cuda_piEventGetProfilingInfo(pi_event event,
3808  pi_profiling_info param_name,
3809  size_t param_value_size,
3810  void *param_value,
3811  size_t *param_value_size_ret) {
3812 
3813  assert(event != nullptr);
3814 
3815  pi_queue queue = event->get_queue();
3816  if (queue == nullptr ||
3817  !(queue->properties_ & PI_QUEUE_PROFILING_ENABLE)) {
3818  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3819  }
3820 
3821  switch (param_name) {
3822  case PI_PROFILING_INFO_COMMAND_QUEUED:
3823  case PI_PROFILING_INFO_COMMAND_SUBMIT:
3824  // Note: No user for this case
3825  return getInfo<pi_uint64>(param_value_size, param_value,
3826  param_value_size_ret, event->get_queued_time());
3827  case PI_PROFILING_INFO_COMMAND_START:
3828  return getInfo<pi_uint64>(param_value_size, param_value,
3829  param_value_size_ret, event->get_start_time());
3830  case PI_PROFILING_INFO_COMMAND_END:
3831  return getInfo<pi_uint64>(param_value_size, param_value,
3832  param_value_size_ret, event->get_end_time());
3833  default:
3834  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3835  }
3836  sycl::detail::pi::die("Event Profiling info request not implemented");
3837  return {};
3838 }
3839 
3841  sycl::detail::pi::die("Event Callback not implemented in CUDA backend");
3842  return PI_SUCCESS;
3843 }
3844 
3845 pi_result cuda_piEventSetStatus(pi_event, pi_int32) {
3846  sycl::detail::pi::die("Event Set Status not implemented in CUDA backend");
3847  return PI_ERROR_INVALID_VALUE;
3848 }
3849 
3850 pi_result cuda_piEventRetain(pi_event event) {
3851  assert(event != nullptr);
3852 
3853  const auto refCount = event->increment_reference_count();
3854 
3855  sycl::detail::pi::assertion(
3856  refCount != 0,
3857  "Reference count overflow detected in cuda_piEventRetain.");
3858 
3859  return PI_SUCCESS;
3860 }
3861 
3862 pi_result cuda_piEventRelease(pi_event event) {
3863  assert(event != nullptr);
3864 
3865  // Double delete, or someone is messing with the ref count;
3866  // either way, we cannot safely proceed.
3867  sycl::detail::pi::assertion(
3868  event->get_reference_count() != 0,
3869  "Reference count overflow detected in cuda_piEventRelease.");
3870 
3871  // decrement ref count. If it is 0, delete the event.
3872  if (event->decrement_reference_count() == 0) {
3873  std::unique_ptr<_pi_event> event_ptr{event};
3874  pi_result result = PI_ERROR_INVALID_EVENT;
3875  try {
3876  ScopedContext active(event->get_context());
3877  result = event->release();
3878  } catch (...) {
3879  result = PI_ERROR_OUT_OF_RESOURCES;
3880  }
3881  return result;
3882  }
3883 
3884  return PI_SUCCESS;
3885 }
3886 
3892 pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
3893  pi_uint32 num_events_in_wait_list,
3894  const pi_event *event_wait_list,
3895  pi_event *event) {
3896  return cuda_piEnqueueEventsWaitWithBarrier(
3897  command_queue, num_events_in_wait_list, event_wait_list, event);
3898 }
3899 
3912 pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
3913  pi_uint32 num_events_in_wait_list,
3914  const pi_event *event_wait_list,
3915  pi_event *event) {
3916  // This function makes one stream wait for the previous work (or the work
3917  // represented by the input events); all future work then waits on that stream.
3918  if (!command_queue) {
3919  return PI_ERROR_INVALID_QUEUE;
3920  }
3921 
3922  pi_result result;
3923 
3924  try {
3925  ScopedContext active(command_queue->get_context());
3926  pi_uint32 stream_token;
3927  _pi_stream_guard guard;
3928  CUstream cuStream = command_queue->get_next_compute_stream(
3929  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3930  {
3931  std::lock_guard<std::mutex> guard(command_queue->barrier_mutex_);
3932  if (command_queue->barrier_event_ == nullptr) {
3933  PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_event_,
3934  CU_EVENT_DISABLE_TIMING));
3935  }
3936  if (num_events_in_wait_list == 0) { // wait on all work
3937  if (command_queue->barrier_tmp_event_ == nullptr) {
3938  PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_tmp_event_,
3939  CU_EVENT_DISABLE_TIMING));
3940  }
3941  command_queue->sync_streams(
3942  [cuStream,
3943  tmp_event = command_queue->barrier_tmp_event_](CUstream s) {
3944  if (cuStream != s) {
3945  // record a new CUDA event on every stream and make one stream
3946  // wait for these events
3947  PI_CHECK_ERROR(cuEventRecord(tmp_event, s));
3948  PI_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0));
3949  }
3950  });
3951  } else { // wait just on given events
3952  forLatestEvents(event_wait_list, num_events_in_wait_list,
3953  [cuStream](pi_event event) -> pi_result {
3954  if (event->get_queue()->has_been_synchronized(
3955  event->get_compute_stream_token())) {
3956  return PI_SUCCESS;
3957  } else {
3958  return PI_CHECK_ERROR(
3959  cuStreamWaitEvent(cuStream, event->get(), 0));
3960  }
3961  });
3962  }
3963 
3964  result = PI_CHECK_ERROR(
3965  cuEventRecord(command_queue->barrier_event_, cuStream));
3966  for (unsigned int i = 0;
3967  i < command_queue->compute_applied_barrier_.size(); i++) {
3968  command_queue->compute_applied_barrier_[i] = false;
3969  }
3970  for (unsigned int i = 0;
3971  i < command_queue->transfer_applied_barrier_.size(); i++) {
3972  command_queue->transfer_applied_barrier_[i] = false;
3973  }
3974  }
3975  if (result != PI_SUCCESS) {
3976  return result;
3977  }
3978 
3979  if (event) {
3980  *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue,
3981  cuStream, stream_token);
3982  (*event)->start();
3983  (*event)->record();
3984  }
3985 
3986  return PI_SUCCESS;
3987  } catch (pi_result err) {
3988  return err;
3989  } catch (...) {
3990  return PI_ERROR_UNKNOWN;
3991  }
3992 }
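
// Editorial sketch (not part of pi_cuda.cpp): the barrier idiom used above,
// reduced to plain driver-API calls. Reusing one temporary event is safe
// because cuStreamWaitEvent captures the event's state at call time.
static CUresult exampleBarrierAcrossStreams(CUstream barrierStream,
                                            const std::vector<CUstream> &streams,
                                            CUevent tmpEvent) {
  for (CUstream s : streams) {
    if (s == barrierStream)
      continue;
    // Record the latest work on s, then make the barrier stream wait for it.
    CUresult err = cuEventRecord(tmpEvent, s);
    if (err != CUDA_SUCCESS)
      return err;
    err = cuStreamWaitEvent(barrierStream, tmpEvent, 0);
    if (err != CUDA_SUCCESS)
      return err;
  }
  return CUDA_SUCCESS;
}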
3993 
4000 pi_result cuda_piextEventGetNativeHandle(pi_event event,
4001  pi_native_handle *nativeHandle) {
4002  *nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
4003  return PI_SUCCESS;
4004 }
4005 
4014 pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle,
4015  pi_context context,
4016  bool ownNativeHandle,
4017  pi_event *event) {
4018  (void)ownNativeHandle;
4019  assert(!ownNativeHandle);
4020 
4021  std::unique_ptr<_pi_event> event_ptr{nullptr};
4022 
4023  *event = _pi_event::make_with_native(context,
4024  reinterpret_cast<CUevent>(nativeHandle));
4025 
4026  return PI_SUCCESS;
4027 }
4028 
4038 pi_result cuda_piSamplerCreate(pi_context context,
4039  const pi_sampler_properties *sampler_properties,
4040  pi_sampler *result_sampler) {
4041  std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)};
4042 
4043  bool propSeen[3] = {false, false, false};
4044  for (size_t i = 0; sampler_properties[i] != 0; i += 2) {
4045  switch (sampler_properties[i]) {
4046  case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS:
4047  if (propSeen[0]) {
4048  return PI_ERROR_INVALID_VALUE;
4049  }
4050  propSeen[0] = true;
4051  retImplSampl->props_ |= sampler_properties[i + 1];
4052  break;
4053  case PI_SAMPLER_PROPERTIES_FILTER_MODE:
4054  if (propSeen[1]) {
4055  return PI_ERROR_INVALID_VALUE;
4056  }
4057  propSeen[1] = true;
4058  retImplSampl->props_ |=
4059  (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
4060  break;
4061  case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE:
4062  if (propSeen[2]) {
4063  return PI_ERROR_INVALID_VALUE;
4064  }
4065  propSeen[2] = true;
4066  retImplSampl->props_ |=
4067  (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
4068  break;
4069  default:
4070  return PI_ERROR_INVALID_VALUE;
4071  }
4072  }
4073 
4074  if (!propSeen[0]) {
4075  retImplSampl->props_ |= PI_TRUE;
4076  }
4077  // Default addressing mode to PI_SAMPLER_ADDRESSING_MODE_CLAMP
4078  if (!propSeen[2]) {
4079  retImplSampl->props_ |=
4080  (PI_SAMPLER_ADDRESSING_MODE_CLAMP - PI_SAMPLER_ADDRESSING_MODE_NONE)
4081  << 2;
4082  }
4083 
4084  *result_sampler = retImplSampl.release();
4085  return PI_SUCCESS;
4086 }
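
// Editorial sketch (not part of pi_cuda.cpp): how props_ packs the three
// sampler properties, matching the shifts used above and the decoding in
// cuda_piSamplerGetInfo below: bit 0 holds normalized coords, bit 1 the
// filter mode, bits 2 and up the addressing mode.
inline pi_uint32 examplePackSamplerProps(pi_bool normalized,
                                         pi_sampler_filter_mode filter,
                                         pi_sampler_addressing_mode addressing) {
  pi_uint32 props = 0;
  props |= static_cast<pi_uint32>(normalized);                  // bit 0
  props |= (filter - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;      // bit 1
  props |= (addressing - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; // bits 2+
  return props;
}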
4087 
4097 pi_result cuda_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name,
4098  size_t param_value_size, void *param_value,
4099  size_t *param_value_size_ret) {
4100  assert(sampler != nullptr);
4101 
4102  switch (param_name) {
4103  case PI_SAMPLER_INFO_REFERENCE_COUNT:
4104  return getInfo(param_value_size, param_value, param_value_size_ret,
4105  sampler->get_reference_count());
4106  case PI_SAMPLER_INFO_CONTEXT:
4107  return getInfo(param_value_size, param_value, param_value_size_ret,
4108  sampler->context_);
4109  case PI_SAMPLER_INFO_NORMALIZED_COORDS: {
4110  pi_bool norm_coords_prop = static_cast<pi_bool>(sampler->props_ & 0x1);
4111  return getInfo(param_value_size, param_value, param_value_size_ret,
4112  norm_coords_prop);
4113  }
4114  case PI_SAMPLER_INFO_FILTER_MODE: {
4115  pi_sampler_filter_mode filter_prop = static_cast<pi_sampler_filter_mode>(
4116  ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST);
4117  return getInfo(param_value_size, param_value, param_value_size_ret,
4118  filter_prop);
4119  }
4120  case PI_SAMPLER_INFO_ADDRESSING_MODE: {
4121  pi_sampler_addressing_mode addressing_prop =
4122  static_cast<pi_sampler_addressing_mode>(
4123  (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE);
4124  return getInfo(param_value_size, param_value, param_value_size_ret,
4125  addressing_prop);
4126  }
4127  default:
4128  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
4129  }
4130  return {};
4131 }
4132 
4138 pi_result cuda_piSamplerRetain(pi_sampler sampler) {
4139  assert(sampler != nullptr);
4140  sampler->increment_reference_count();
4141  return PI_SUCCESS;
4142 }
4143 
4150 pi_result cuda_piSamplerRelease(pi_sampler sampler) {
4151  assert(sampler != nullptr);
4152 
4153  // double delete or someone is messing with the ref count.
4154  // either way, cannot safely proceed.
4155  sycl::detail::pi::assertion(
4156  sampler->get_reference_count() != 0,
4157  "Reference count overflow detected in cuda_piSamplerRelease.");
4158 
4159  // decrement ref count. If it is 0, delete the sampler.
4160  if (sampler->decrement_reference_count() == 0) {
4161  delete sampler;
4162  }
4163 
4164  return PI_SUCCESS;
4165 }
4166 
4172 static pi_result commonEnqueueMemBufferCopyRect(
4173  CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr,
4174  const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset,
4175  size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
4176  const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
4177  size_t dst_row_pitch, size_t dst_slice_pitch) {
4178 
4179  assert(region != nullptr);
4180  assert(src_offset != nullptr);
4181  assert(dst_offset != nullptr);
4182 
4183  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
4184  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
4185 
4186  src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes
4187  : src_row_pitch;
4188  src_slice_pitch =
4189  (!src_slice_pitch)
4190  ? ((region->height_scalar + src_offset->y_scalar) * src_row_pitch)
4191  : src_slice_pitch;
4192  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes + dst_offset->x_bytes
4193  : dst_row_pitch;
4194  dst_slice_pitch =
4195  (!dst_slice_pitch)
4196  ? ((region->height_scalar + dst_offset->y_scalar) * dst_row_pitch)
4197  : dst_slice_pitch;
4198 
4199  CUDA_MEMCPY3D params = {};
4200 
4201  params.WidthInBytes = region->width_bytes;
4202  params.Height = region->height_scalar;
4203  params.Depth = region->depth_scalar;
4204 
4205  params.srcMemoryType = src_type;
4206  params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
4207  ? *static_cast<const CUdeviceptr *>(src_ptr)
4208  : 0;
4209  params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
4210  params.srcXInBytes = src_offset->x_bytes;
4211  params.srcY = src_offset->y_scalar;
4212  params.srcZ = src_offset->z_scalar;
4213  params.srcPitch = src_row_pitch;
4214  params.srcHeight = src_slice_pitch / src_row_pitch;
4215 
4216  params.dstMemoryType = dst_type;
4217  params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
4218  ? *static_cast<CUdeviceptr *>(dst_ptr)
4219  : 0;
4220  params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
4221  params.dstXInBytes = dst_offset->x_bytes;
4222  params.dstY = dst_offset->y_scalar;
4223  params.dstZ = dst_offset->z_scalar;
4224  params.dstPitch = dst_row_pitch;
4225  params.dstHeight = dst_slice_pitch / dst_row_pitch;
4226 
4227  return PI_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
4228 }
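
// Editorial note (not part of pi_cuda.cpp): a worked example of the pitch
// defaults above. For a region of width_bytes=16, height_scalar=4,
// depth_scalar=2 with zero offsets and no pitches supplied, src_row_pitch
// defaults to 16 bytes, src_slice_pitch to 4 * 16 = 64 bytes, and
// params.srcHeight becomes 64 / 16 = 4 rows per slice.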
4229 
4230 pi_result cuda_piEnqueueMemBufferReadRect(
4231  pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
4232  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4233  pi_buff_rect_region region, size_t buffer_row_pitch,
4234  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4235  void *ptr, pi_uint32 num_events_in_wait_list,
4236  const pi_event *event_wait_list, pi_event *event) {
4237 
4238  assert(buffer != nullptr);
4239  assert(command_queue != nullptr);
4240 
4241  pi_result retErr = PI_SUCCESS;
4242  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
4243  std::unique_ptr<_pi_event> retImplEv{nullptr};
4244 
4245  try {
4246  ScopedContext active(command_queue->get_context());
4247  CUstream cuStream = command_queue->get_next_transfer_stream();
4248 
4249  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4250  event_wait_list);
4251 
4252  if (event) {
4253  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4254  PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream));
4255  retImplEv->start();
4256  }
4257 
4258  retErr = commonEnqueueMemBufferCopyRect(
4259  cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4260  buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST,
4261  host_offset, host_row_pitch, host_slice_pitch);
4262 
4263  if (event) {
4264  retErr = retImplEv->record();
4265  }
4266 
4267  if (blocking_read) {
4268  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4269  }
4270 
4271  if (event) {
4272  *event = retImplEv.release();
4273  }
4274 
4275  } catch (pi_result err) {
4276  retErr = err;
4277  }
4278  return retErr;
4279 }
4280 
4281 pi_result cuda_piEnqueueMemBufferWriteRect(
4282  pi_queue command_queue, pi_mem buffer, pi_bool blocking_write,
4283  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4284  pi_buff_rect_region region, size_t buffer_row_pitch,
4285  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4286  const void *ptr, pi_uint32 num_events_in_wait_list,
4287  const pi_event *event_wait_list, pi_event *event) {
4288 
4289  assert(buffer != nullptr);
4290  assert(command_queue != nullptr);
4291 
4292  pi_result retErr = PI_SUCCESS;
4293  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
4294  std::unique_ptr<_pi_event> retImplEv{nullptr};
4295 
4296  try {
4297  ScopedContext active(command_queue->get_context());
4298  CUstream cuStream = command_queue->get_next_transfer_stream();
4299  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4300  event_wait_list);
4301 
4302  if (event) {
4303  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4304  PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream));
4305  retImplEv->start();
4306  }
4307 
4308  retErr = commonEnqueueMemBufferCopyRect(
4309  cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch,
4310  host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4311  buffer_row_pitch, buffer_slice_pitch);
4312 
4313  if (event) {
4314  retErr = retImplEv->record();
4315  }
4316 
4317  if (blocking_write) {
4318  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4319  }
4320 
4321  if (event) {
4322  *event = retImplEv.release();
4323  }
4324 
4325  } catch (pi_result err) {
4326  retErr = err;
4327  }
4328  return retErr;
4329 }
4330 
4331 pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
4332  pi_mem dst_buffer, size_t src_offset,
4333  size_t dst_offset, size_t size,
4334  pi_uint32 num_events_in_wait_list,
4335  const pi_event *event_wait_list,
4336  pi_event *event) {
4337  if (!command_queue) {
4338  return PI_ERROR_INVALID_QUEUE;
4339  }
4340 
4341  std::unique_ptr<_pi_event> retImplEv{nullptr};
4342 
4343  try {
4344  ScopedContext active(command_queue->get_context());
4345  pi_result result;
4346 
4347  auto stream = command_queue->get_next_transfer_stream();
4348  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4349  event_wait_list);
4350 
4351  if (event) {
4352  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4353  PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream));
4354  result = retImplEv->start();
4355  }
4356 
4357  auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
4358  auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;
4359 
4360  result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
4361 
4362  if (event) {
4363  result = retImplEv->record();
4364  *event = retImplEv.release();
4365  }
4366 
4367  return result;
4368  } catch (pi_result err) {
4369  return err;
4370  } catch (...) {
4371  return PI_ERROR_UNKNOWN;
4372  }
4373 }
4374 
4375 pi_result cuda_piEnqueueMemBufferCopyRect(
4376  pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer,
4377  pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
4378  pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
4379  size_t dst_row_pitch, size_t dst_slice_pitch,
4380  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
4381  pi_event *event) {
4382 
4383  assert(src_buffer != nullptr);
4384  assert(dst_buffer != nullptr);
4385  assert(command_queue != nullptr);
4386 
4387  pi_result retErr = PI_SUCCESS;
4388  CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get();
4389  CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get();
4390  std::unique_ptr<_pi_event> retImplEv{nullptr};
4391 
4392  try {
4393  ScopedContext active(command_queue->get_context());
4394  CUstream cuStream = command_queue->get_next_transfer_stream();
4395  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4396  event_wait_list);
4397 
4398  if (event) {
4399  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4400  PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream));
4401  retImplEv->start();
4402  }
4403 
4404  retErr = commonEnqueueMemBufferCopyRect(
4405  cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
4406  src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
4407  dst_origin, dst_row_pitch, dst_slice_pitch);
4408 
4409  if (event) {
4410  retImplEv->record();
4411  *event = retImplEv.release();
4412  }
4413 
4414  } catch (pi_result err) {
4415  retErr = err;
4416  }
4417  return retErr;
4418 }
4419 
4420 pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
4421  const void *pattern, size_t pattern_size,
4422  size_t offset, size_t size,
4423  pi_uint32 num_events_in_wait_list,
4424  const pi_event *event_wait_list,
4425  pi_event *event) {
4426  assert(command_queue != nullptr);
4427 
4428  auto args_are_multiples_of_pattern_size =
4429  (offset % pattern_size == 0) && (size % pattern_size == 0);
4430 
4431  auto pattern_is_valid = (pattern != nullptr);
4432 
4433  auto pattern_size_is_valid =
4434  ((pattern_size & (pattern_size - 1)) == 0) && // is power of two
4435  (pattern_size > 0) && (pattern_size <= 128); // falls within valid range
4436 
4437  assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4438  pattern_size_is_valid);
4439  (void)args_are_multiples_of_pattern_size;
4440  (void)pattern_is_valid;
4441  (void)pattern_size_is_valid;
4442 
4443  std::unique_ptr<_pi_event> retImplEv{nullptr};
4444 
4445  try {
4446  ScopedContext active(command_queue->get_context());
4447 
4448  auto stream = command_queue->get_next_transfer_stream();
4449  pi_result result;
4450  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4451  event_wait_list);
4452 
4453  if (event) {
4454  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4455  PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream));
4456  result = retImplEv->start();
4457  }
4458 
4459  auto dstDevice = buffer->mem_.buffer_mem_.get() + offset;
4460  auto N = size / pattern_size;
4461 
4462  // pattern size in bytes
4463  switch (pattern_size) {
4464  case 1: {
4465  auto value = *static_cast<const uint8_t *>(pattern);
4466  result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream));
4467  break;
4468  }
4469  case 2: {
4470  auto value = *static_cast<const uint16_t *>(pattern);
4471  result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream));
4472  break;
4473  }
4474  case 4: {
4475  auto value = *static_cast<const uint32_t *>(pattern);
4476  result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream));
4477  break;
4478  }
4479  default: {
4480  // CUDA has no memset functions that allow setting values larger than 4
4481  // bytes. PI API lets you pass an arbitrary "pattern" to the buffer
4482  // fill, which can be more than 4 bytes. We must break up the pattern
4483  // into 4 byte values, and set the buffer using multiple strided calls.
4484  // This means that one cuMemsetD2D32Async call is made for every 4 bytes
4485  // in the pattern.
4486 
4487  auto number_of_steps = pattern_size / sizeof(uint32_t);
4488 
4489  // we walk up the pattern in 4-byte steps, and call cuMemset for each
4490  // 4-byte chunk of the pattern.
4491  for (auto step = 0u; step < number_of_steps; ++step) {
4492  // take 4 bytes of the pattern
4493  auto value = *(static_cast<const uint32_t *>(pattern) + step);
4494 
4495  // offset the pointer to the part of the buffer we want to write to
4496  auto offset_ptr = dstDevice + (step * sizeof(uint32_t));
4497 
4498  // set all of the pattern chunks
4499  result = PI_CHECK_ERROR(
4500  cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream));
4501  }
4502 
4503  break;
4504  }
4505  }
4506 
4507  if (event) {
4508  result = retImplEv->record();
4509  *event = retImplEv.release();
4510  }
4511 
4512  return result;
4513  } catch (pi_result err) {
4514  return err;
4515  } catch (...) {
4516  return PI_ERROR_UNKNOWN;
4517  }
4518 }
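
// Editorial sketch (not part of pi_cuda.cpp): the default branch above,
// specialised for an 8-byte pattern. Each 4-byte word of the pattern is
// written to every element with one strided cuMemsetD2D32Async call.
static CUresult exampleFill8BytePattern(CUdeviceptr dst, uint64_t pattern,
                                        size_t count, CUstream stream) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(&pattern);
  for (size_t step = 0; step < 2; ++step) {
    // Word `step` goes at byte offset step*4 of every 8-byte element.
    CUresult err = cuMemsetD2D32Async(dst + step * sizeof(uint32_t),
                                      /*dstPitch=*/sizeof(uint64_t),
                                      words[step], /*Width=*/1,
                                      /*Height=*/count, stream);
    if (err != CUDA_SUCCESS)
      return err;
  }
  return CUDA_SUCCESS;
}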
4519 
4520 static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) {
4521  switch (array_desc.Format) {
4522  case CU_AD_FORMAT_UNSIGNED_INT8:
4523  case CU_AD_FORMAT_SIGNED_INT8:
4524  return 1;
4525  case CU_AD_FORMAT_UNSIGNED_INT16:
4526  case CU_AD_FORMAT_SIGNED_INT16:
4527  case CU_AD_FORMAT_HALF:
4528  return 2;
4529  case CU_AD_FORMAT_UNSIGNED_INT32:
4530  case CU_AD_FORMAT_SIGNED_INT32:
4531  case CU_AD_FORMAT_FLOAT:
4532  return 4;
4533  default:
4534  sycl::detail::pi::die("Invalid image format.");
4535  return 0;
4536  }
4537 }
4538 
4544 static pi_result commonEnqueueMemImageNDCopy(
4545  CUstream cu_stream, pi_mem_type img_type, const size_t *region,
4546  const void *src_ptr, const CUmemorytype_enum src_type,
4547  const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type,
4548  const size_t *dst_offset) {
4549  assert(region != nullptr);
4550 
4551  assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
4552  assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
4553 
4554  if (img_type == PI_MEM_TYPE_IMAGE2D) {
4555  CUDA_MEMCPY2D cpyDesc;
4556  memset(&cpyDesc, 0, sizeof(cpyDesc));
4557  cpyDesc.srcMemoryType = src_type;
4558  if (src_type == CU_MEMORYTYPE_ARRAY) {
4559  cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
4560  cpyDesc.srcXInBytes = src_offset[0];
4561  cpyDesc.srcY = src_offset[1];
4562  } else {
4563  cpyDesc.srcHost = src_ptr;
4564  }
4565  cpyDesc.dstMemoryType = dst_type;
4566  if (dst_type == CU_MEMORYTYPE_ARRAY) {
4567  cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
4568  cpyDesc.dstXInBytes = dst_offset[0];
4569  cpyDesc.dstY = dst_offset[1];
4570  } else {
4571  cpyDesc.dstHost = dst_ptr;
4572  }
4573  cpyDesc.WidthInBytes = region[0];
4574  cpyDesc.Height = region[1];
4575  return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
4576  }
4577  if (img_type == PI_MEM_TYPE_IMAGE3D) {
4578  CUDA_MEMCPY3D cpyDesc;
4579  memset(&cpyDesc, 0, sizeof(cpyDesc));
4580  cpyDesc.srcMemoryType = src_type;
4581  if (src_type == CU_MEMORYTYPE_ARRAY) {
4582  cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
4583  cpyDesc.srcXInBytes = src_offset[0];
4584  cpyDesc.srcY = src_offset[1];
4585  cpyDesc.srcZ = src_offset[2];
4586  } else {
4587  cpyDesc.srcHost = src_ptr;
4588  }
4589  cpyDesc.dstMemoryType = dst_type;
4590  if (dst_type == CU_MEMORYTYPE_ARRAY) {
4591  cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
4592  cpyDesc.dstXInBytes = dst_offset[0];
4593  cpyDesc.dstY = dst_offset[1];
4594  cpyDesc.dstZ = dst_offset[2];
4595  } else {
4596  cpyDesc.dstHost = dst_ptr;
4597  }
4598  cpyDesc.WidthInBytes = region[0];
4599  cpyDesc.Height = region[1];
4600  cpyDesc.Depth = region[2];
4601  return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
4602  }
4603  return PI_ERROR_INVALID_VALUE;
4604 }
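
// Editorial sketch (not part of pi_cuda.cpp): a minimal host-to-array 2D copy
// showing the descriptor fields the helper above fills in for images.
static CUresult exampleCopyHostToArray2D(CUarray dst, const void *src,
                                         size_t widthBytes, size_t height,
                                         CUstream stream) {
  CUDA_MEMCPY2D desc = {};
  desc.srcMemoryType = CU_MEMORYTYPE_HOST;
  desc.srcHost = src;
  desc.srcPitch = widthBytes; // tightly packed host rows
  desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
  desc.dstArray = dst;
  desc.WidthInBytes = widthBytes;
  desc.Height = height;
  return cuMemcpy2DAsync(&desc, stream);
}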
4605 
4606 pi_result cuda_piEnqueueMemImageRead(
4607  pi_queue command_queue, pi_mem image, pi_bool blocking_read,
4608  const size_t *origin, const size_t *region, size_t row_pitch,
4609  size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list,
4610  const pi_event *event_wait_list, pi_event *event) {
4611  // Ignore unused parameters
4612  (void)row_pitch;
4613  (void)slice_pitch;
4614 
4615  assert(command_queue != nullptr);
4616  assert(image != nullptr);
4617  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4618 
4619  pi_result retErr = PI_SUCCESS;
4620 
4621  try {
4622  ScopedContext active(command_queue->get_context());
4623  CUstream cuStream = command_queue->get_next_transfer_stream();
4624  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4625  event_wait_list);
4626 
4627  CUarray array = image->mem_.surface_mem_.get_array();
4628 
4629  CUDA_ARRAY_DESCRIPTOR arrayDesc;
4630  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4631 
4632  int elementByteSize = imageElementByteSize(arrayDesc);
4633 
4634  size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4635  size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4636 
4637  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4638  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4639  retErr = PI_CHECK_ERROR(
4640  cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream));
4641  } else {
4642  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4643  size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4644 
4645  retErr = commonEnqueueMemImageNDCopy(
4646  cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY,
4647  srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr);
4648 
4649  if (retErr != PI_SUCCESS) {
4650  return retErr;
4651  }
4652  }
4653 
4654  if (event) {
4655  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ,
4656  command_queue, cuStream);
4657  new_event->record();
4658  *event = new_event;
4659  }
4660 
4661  if (blocking_read) {
4662  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4663  }
4664  } catch (pi_result err) {
4665  return err;
4666  } catch (...) {
4667  return PI_ERROR_UNKNOWN;
4668  }
4669 
4670  return retErr;
4671 }
4672 
4673 pi_result
4674 cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image,
4675  pi_bool blocking_write, const size_t *origin,
4676  const size_t *region, size_t input_row_pitch,
4677  size_t input_slice_pitch, const void *ptr,
4678  pi_uint32 num_events_in_wait_list,
4679  const pi_event *event_wait_list, pi_event *event) {
4680  // Ignore unused parameters
4681  (void)blocking_write;
4682  (void)input_row_pitch;
4683  (void)input_slice_pitch;
4684 
4685  assert(command_queue != nullptr);
4686  assert(image != nullptr);
4687  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4688 
4689  pi_result retErr = PI_SUCCESS;
4690 
4691  try {
4692  ScopedContext active(command_queue->get_context());
4693  CUstream cuStream = command_queue->get_next_transfer_stream();
4694  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4695  event_wait_list);
4696 
4697  CUarray array = image->mem_.surface_mem_.get_array();
4698 
4699  CUDA_ARRAY_DESCRIPTOR arrayDesc;
4700  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4701 
4702  int elementByteSize = imageElementByteSize(arrayDesc);
4703 
4704  size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4705  size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4706 
4707  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4708  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4709  retErr = PI_CHECK_ERROR(
4710  cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream));
4711  } else {
4712  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4713  size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4714 
4715  retErr = commonEnqueueMemImageNDCopy(
4716  cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr,
4717  &array, CU_MEMORYTYPE_ARRAY, dstOffset);
4718 
4719  if (retErr != PI_SUCCESS) {
4720  return retErr;
4721  }
4722  }
4723 
4724  if (event) {
4725  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE,
4726  command_queue, cuStream);
4727  new_event->record();
4728  *event = new_event;
4729  }
4730  } catch (pi_result err) {
4731  return err;
4732  } catch (...) {
4733  return PI_ERROR_UNKNOWN;
4734  }
4735 
4736  return retErr;
4737 }
4738 
4739 pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
4740  pi_mem dst_image, const size_t *src_origin,
4741  const size_t *dst_origin,
4742  const size_t *region,
4743  pi_uint32 num_events_in_wait_list,
4744  const pi_event *event_wait_list,
4745  pi_event *event) {
4746  assert(src_image->mem_type_ == _pi_mem::mem_type::surface);
4747  assert(dst_image->mem_type_ == _pi_mem::mem_type::surface);
4748  assert(src_image->mem_.surface_mem_.get_image_type() ==
4749  dst_image->mem_.surface_mem_.get_image_type());
4750 
4751  pi_result retErr = PI_SUCCESS;
4752 
4753  try {
4754  ScopedContext active(command_queue->get_context());
4755  CUstream cuStream = command_queue->get_next_transfer_stream();
4756  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4757  event_wait_list);
4758 
4759  CUarray srcArray = src_image->mem_.surface_mem_.get_array();
4760  CUarray dstArray = dst_image->mem_.surface_mem_.get_array();
4761 
4762  CUDA_ARRAY_DESCRIPTOR srcArrayDesc;
4763  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray));
4764  CUDA_ARRAY_DESCRIPTOR dstArrayDesc;
4765  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray));
4766 
4767  assert(srcArrayDesc.Format == dstArrayDesc.Format);
4768  assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels);
4769 
4770  int elementByteSize = imageElementByteSize(srcArrayDesc);
4771 
4772  size_t dstByteOffsetX =
4773  dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels;
4774  size_t srcByteOffsetX =
4775  src_origin[0] * elementByteSize * dstArrayDesc.NumChannels;
4776  size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
4777 
4778  pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
4779  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4780  retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
4781  srcByteOffsetX, bytesToCopy));
4782  } else {
4783  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4784  size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4785  size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4786 
4787  retErr = commonEnqueueMemImageNDCopy(
4788  cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
4789  srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
4790 
4791  if (retErr != PI_SUCCESS) {
4792  return retErr;
4793  }
4794  }
4795 
4796  if (event) {
4797  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY,
4798  command_queue, cuStream);
4799  new_event->record();
4800  *event = new_event;
4801  }
4802  } catch (pi_result err) {
4803  return err;
4804  } catch (...) {
4805  return PI_ERROR_UNKNOWN;
4806  }
4807 
4808  return retErr;
4809 }
4810 
4812 pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *,
4813  const size_t *, const size_t *, pi_uint32,
4814  const pi_event *, pi_event *) {
4815  sycl::detail::pi::die("cuda_piEnqueueMemImageFill not implemented");
4816  return {};
4817 }
4818 
4824 pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
4825  pi_bool blocking_map,
4826  pi_map_flags map_flags, size_t offset,
4827  size_t size,
4828  pi_uint32 num_events_in_wait_list,
4829  const pi_event *event_wait_list,
4830  pi_event *event, void **ret_map) {
4831  assert(ret_map != nullptr);
4832  assert(command_queue != nullptr);
4833  assert(buffer != nullptr);
4834  assert(buffer->mem_type_ == _pi_mem::mem_type::buffer);
4835 
4836  pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4837  const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ ==
4838  _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4839 
4840  // Currently no support for overlapping regions
4841  if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) {
4842  return ret_err;
4843  }
4844 
4845  // Allocate a host-side pointer that will hold the mapped data
4846  auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags);
4847  *ret_map = buffer->mem_.buffer_mem_.get_map_ptr();
4848  if (hostPtr) {
4849  ret_err = PI_SUCCESS;
4850  }
4851 
4852  if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) {
4853  // Pinned host memory is already on host so it doesn't need to be read.
4854  ret_err = cuda_piEnqueueMemBufferRead(
4855  command_queue, buffer, blocking_map, offset, size, hostPtr,
4856  num_events_in_wait_list, event_wait_list, event);
4857  } else {
4858  ScopedContext active(command_queue->get_context());
4859 
4860  if (is_pinned) {
4861  ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4862  event_wait_list, nullptr);
4863  }
4864 
4865  if (event) {
4866  try {
4867  *event = _pi_event::make_native(
4868  PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue,
4869  command_queue->get_next_transfer_stream());
4870  (*event)->start();
4871  (*event)->record();
4872  } catch (pi_result error) {
4873  ret_err = error;
4874  }
4875  }
4876  }
4877 
4878  return ret_err;
4879 }
4880 
4885 pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
4886  void *mapped_ptr,
4887  pi_uint32 num_events_in_wait_list,
4888  const pi_event *event_wait_list,
4889  pi_event *event) {
4890  pi_result ret_err = PI_SUCCESS;
4891 
4892  assert(command_queue != nullptr);
4893  assert(mapped_ptr != nullptr);
4894  assert(memobj != nullptr);
4895  assert(memobj->mem_type_ == _pi_mem::mem_type::buffer);
4896  assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr);
4897  assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr);
4898 
4899  const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ ==
4900  _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4901 
4902  if (!is_pinned &&
4903  ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) ||
4904  (memobj->mem_.buffer_mem_.get_map_flags() &
4905  PI_MAP_WRITE_INVALIDATE_REGION))) {
4906  // Pinned host memory is only on host so it doesn't need to be written to.
4907  ret_err = cuda_piEnqueueMemBufferWrite(
4908  command_queue, memobj, true,
4909  memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
4910  memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
4911  num_events_in_wait_list, event_wait_list, event);
4912  } else {
4913  ScopedContext active(command_queue->get_context());
4914 
4915  if (is_pinned) {
4916  ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4917  event_wait_list, nullptr);
4918  }
4919 
4920  if (event) {
4921  try {
4922  *event = _pi_event::make_native(
4923  PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue,
4924  command_queue->get_next_transfer_stream());
4925  (*event)->start();
4926  (*event)->record();
4927  } catch (pi_result error) {
4928  ret_err = error;
4929  }
4930  }
4931  }
4932 
4933  memobj->mem_.buffer_mem_.unmap(mapped_ptr);
4934  return ret_err;
4935 }
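
// Editorial usage sketch (not part of pi_cuda.cpp): a map/modify/unmap round
// trip through the two entry points above; queue and buffer are assumed to
// have been created through the usual PI entry points.
static pi_result exampleMapModifyUnmap(pi_queue queue, pi_mem buffer,
                                       size_t size) {
  void *mapped = nullptr;
  pi_result res =
      cuda_piEnqueueMemBufferMap(queue, buffer, /*blocking_map=*/PI_TRUE,
                                 PI_MAP_WRITE, /*offset=*/0, size,
                                 0, nullptr, nullptr, &mapped);
  if (res != PI_SUCCESS)
    return res;
  std::memset(mapped, 0, size); // host-side update of the mapped region
  return cuda_piEnqueueMemUnmap(queue, buffer, mapped, 0, nullptr, nullptr);
}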
4936 
4937 /// USM: Implements USM Host allocations using CUDA Pinned Memory
4938 ///
4939 pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
4940  pi_usm_mem_properties *properties, size_t size,
4941  pi_uint32 alignment) {
4942  assert(result_ptr != nullptr);
4943  assert(context != nullptr);
4944  assert(properties == nullptr || *properties == 0);
4945  pi_result result = PI_SUCCESS;
4946  try {
4947  ScopedContext active(context);
4948  result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size));
4949  } catch (pi_result error) {
4950  result = error;
4951  }
4952 
4953  assert(alignment == 0 ||
4954  (result == PI_SUCCESS &&
4955  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4956  return result;
4957 }
4958 
4959 /// USM: Implements USM device allocations using a normal CUDA device pointer
4960 ///
4961 pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
4962  pi_device device,
4963  pi_usm_mem_properties *properties,
4964  size_t size, pi_uint32 alignment) {
4965  assert(result_ptr != nullptr);
4966  assert(context != nullptr);
4967  assert(device != nullptr);
4968  assert(properties == nullptr || *properties == 0);
4969  pi_result result = PI_SUCCESS;
4970  try {
4971  ScopedContext active(context);
4972  result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size));
4973  } catch (pi_result error) {
4974  result = error;
4975  }
4976 
4977  assert(alignment == 0 ||
4978  (result == PI_SUCCESS &&
4979  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4980  return result;
4981 }
4982 
4983 /// USM: Implements USM Shared allocations using CUDA Managed Memory
4984 ///
4985 pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
4986  pi_device device,
4987  pi_usm_mem_properties *properties,
4988  size_t size, pi_uint32 alignment) {
4989  assert(result_ptr != nullptr);
4990  assert(context != nullptr);
4991  assert(device != nullptr);
4992  assert(properties == nullptr || *properties == 0);
4993  pi_result result = PI_SUCCESS;
4994  try {
4995  ScopedContext active(context);
4996  result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size,
4997  CU_MEM_ATTACH_GLOBAL));
4998  } catch (pi_result error) {
4999  result = error;
5000  }
5001 
5002  assert(alignment == 0 ||
5003  (result == PI_SUCCESS &&
5004  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
5005  return result;
5006 }
5007 
5010 pi_result cuda_piextUSMFree(pi_context context, void *ptr) {
5011  assert(context != nullptr);
5012  pi_result result = PI_SUCCESS;
5013  try {
5014  ScopedContext active(context);
5015  bool is_managed;
5016  unsigned int type;
5017  void *attribute_values[2] = {&is_managed, &type};
5018  CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
5019  CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
5020  result = PI_CHECK_ERROR(cuPointerGetAttributes(
5021  2, attributes, attribute_values, (CUdeviceptr)ptr));
5022  assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST);
5023  if (is_managed || type == CU_MEMORYTYPE_DEVICE) {
5024  // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
5025  // with cuMemFree
5026  result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr));
5027  } else {
5028  // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
5029  result = PI_CHECK_ERROR(cuMemFreeHost(ptr));
5030  }
5031  } catch (pi_result error) {
5032  result = error;
5033  }
5034  return result;
5035 }
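
// Editorial sketch (not part of pi_cuda.cpp): the three driver-API allocation
// flavours the USM entry points above rely on, paired with the free call each
// one requires, matching the dispatch in cuda_piextUSMFree.
static CUresult exampleUsmAllocFlavours(size_t size) {
  void *host = nullptr;
  CUdeviceptr device = 0, shared = 0;
  CUresult err = cuMemAllocHost(&host, size); // pinned host memory
  if (err != CUDA_SUCCESS)
    return err;
  err = cuMemAlloc(&device, size); // device-only memory
  if (err != CUDA_SUCCESS)
    return err;
  err = cuMemAllocManaged(&shared, size, CU_MEM_ATTACH_GLOBAL); // managed
  if (err != CUDA_SUCCESS)
    return err;
  cuMemFreeHost(host); // host allocations use cuMemFreeHost
  cuMemFree(device);   // device and managed allocations use cuMemFree
  cuMemFree(shared);
  return CUDA_SUCCESS;
}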
5036 
5037 pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
5038  size_t count,
5039  pi_uint32 num_events_in_waitlist,
5040  const pi_event *events_waitlist,
5041  pi_event *event) {
5042  assert(queue != nullptr);
5043  assert(ptr != nullptr);
5044  pi_result result = PI_SUCCESS;
5045  std::unique_ptr<_pi_event> event_ptr{nullptr};
5046 
5047  try {
5048  ScopedContext active(queue->get_context());
5049  pi_uint32 stream_token;
5050  _pi_stream_guard guard;
5051  CUstream cuStream = queue->get_next_compute_stream(
5052  num_events_in_waitlist, events_waitlist, guard, &stream_token);
5053  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5054  events_waitlist);
5055  if (event) {
5056  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5057  PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream, stream_token));
5058  event_ptr->start();
5059  }
5060  result = PI_CHECK_ERROR(cuMemsetD8Async(
5061  (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream));
5062  if (event) {
5063  result = event_ptr->record();
5064  *event = event_ptr.release();
5065  }
5066  } catch (pi_result err) {
5067  result = err;
5068  }
5069  return result;
5070 }
5071 
5072 pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking,
5073  void *dst_ptr, const void *src_ptr,
5074  size_t size,
5075  pi_uint32 num_events_in_waitlist,
5076  const pi_event *events_waitlist,
5077  pi_event *event) {
5078  assert(queue != nullptr);
5079  assert(dst_ptr != nullptr);
5080  assert(src_ptr != nullptr);
5081  pi_result result = PI_SUCCESS;
5082 
5083  std::unique_ptr<_pi_event> event_ptr{nullptr};
5084 
5085  try {
5086  ScopedContext active(queue->get_context());
5087  CUstream cuStream = queue->get_next_transfer_stream();
5088  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5089  events_waitlist);
5090  if (event) {
5091  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5092  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
5093  event_ptr->start();
5094  }
5095  result = PI_CHECK_ERROR(cuMemcpyAsync(
5096  (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream));
5097  if (event) {
5098  result = event_ptr->record();
5099  }
5100  if (blocking) {
5101  result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
5102  }
5103  if (event) {
5104  *event = event_ptr.release();
5105  }
5106  } catch (pi_result err) {
5107  result = err;
5108  }
5109  return result;
5110 }
5111 
5112 pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
5113  size_t size,
5114  pi_usm_migration_flags flags,
5115  pi_uint32 num_events_in_waitlist,
5116  const pi_event *events_waitlist,
5117  pi_event *event) {
5118  pi_device device = queue->get_context()->get_device();
5119 
5120  // Certain CUDA devices and Windows do not have support for some Unified
5121  // Memory features. cuMemPrefetchAsync requires concurrent memory access
5122  // for managed memory. Therefore, ignore the prefetch hint if concurrent
5123  // managed memory access is not available.
5124  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5125  setErrorMessage("Prefetch hint ignored as device does not support "
5126  "concurrent managed access",
5127  PI_SUCCESS);
5128  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5129  }
5130 
5131  unsigned int is_managed;
5132  PI_CHECK_ERROR(cuPointerGetAttribute(
5133  &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr));
5134  if (!is_managed) {
5135  setErrorMessage("Prefetch hint ignored as prefetch only works with USM",
5136  PI_SUCCESS);
5137  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5138  }
5139 
5140  // flags is currently unused so fail if set
5141  if (flags != 0)
5142  return PI_ERROR_INVALID_VALUE;
5143  assert(queue != nullptr);
5144  assert(ptr != nullptr);
5145  pi_result result = PI_SUCCESS;
5146  std::unique_ptr<_pi_event> event_ptr{nullptr};
5147 
5148  try {
5149  ScopedContext active(queue->get_context());
5150  CUstream cuStream = queue->get_next_transfer_stream();
5151  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5152  events_waitlist);
5153  if (event) {
5154  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5155  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
5156  event_ptr->start();
5157  }
5158  result = PI_CHECK_ERROR(
5159  cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream));
5160  if (event) {
5161  result = event_ptr->record();
5162  *event = event_ptr.release();
5163  }
5164  } catch (pi_result err) {
5165  result = err;
5166  }
5167  return result;
5168 }
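
// Editorial sketch (not part of pi_cuda.cpp): guarding a prefetch on the
// concurrent-managed-access attribute, mirroring the checks above.
static CUresult examplePrefetchIfSupported(CUdeviceptr ptr, size_t size,
                                           CUdevice dev, CUstream stream) {
  int concurrent = 0;
  CUresult err = cuDeviceGetAttribute(
      &concurrent, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, dev);
  if (err != CUDA_SUCCESS || !concurrent)
    return err; // skip the hint when the device cannot honour it
  return cuMemPrefetchAsync(ptr, size, dev, stream);
}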
5169 
5171 pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
5172  size_t length, pi_mem_advice advice,
5173  pi_event *event) {
5174  assert(queue != nullptr);
5175  assert(ptr != nullptr);
5176 
5177  // Certain CUDA devices and Windows do not have support for some Unified
5178  // Memory features. Passing CU_MEM_ADVISE_[UN]SET_PREFERRED_LOCATION and
5179  // CU_MEM_ADVISE_[UN]SET_ACCESSED_BY to cuMemAdvise on a GPU device requires
5180  // the GPU device to report a non-zero value for
5181  // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore the
5182  // memory advice if concurrent managed memory access is not available.
5183  if (advice == PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION ||
5184  advice == PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION ||
5185  advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY ||
5186  advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY ||
5187  advice == PI_MEM_ADVICE_RESET) {
5188  pi_device device = queue->get_context()->get_device();
5189  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5190  setErrorMessage("Mem advise ignored as device does not support "
5191  "concurrent managed access",
5192  PI_SUCCESS);
5193  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5194  }
5195 
5196  // TODO: If ptr points to valid system-allocated pageable memory we should
5197  // check that the device also has the
5198  // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
5199  }
5200 
5201  unsigned int is_managed;
5202  PI_CHECK_ERROR(cuPointerGetAttribute(
5203  &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr));
5204  if (!is_managed) {
5205  setErrorMessage(
5206  "Memory advice ignored as memory advice only works with USM",
5207  PI_SUCCESS);
5208  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5209  }
5210 
5211  pi_result result = PI_SUCCESS;
5212  std::unique_ptr<_pi_event> event_ptr{nullptr};
5213 
5214  try {
5215  ScopedContext active(queue->get_context());
5216 
5217  if (event) {
5218  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5219  PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream()));
5220  event_ptr->start();
5221  }
5222 
5223  switch (advice) {
5224  case PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY:
5225  case PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY:
5226  case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION:
5227  case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION:
5228  case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY:
5229  case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY:
5230  result = PI_CHECK_ERROR(cuMemAdvise(
5231  (CUdeviceptr)ptr, length,
5232  (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1),
5233  queue->get_context()->get_device()->get()));
5234  break;
5235  case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST:
5236  case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST:
5237  case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST:
5238  case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST:
5239  result = PI_CHECK_ERROR(cuMemAdvise(
5240  (CUdeviceptr)ptr, length,
5241  (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1 -
5242  (PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST -
5243  PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION)),
5244  CU_DEVICE_CPU));
5245  break;
5246  case PI_MEM_ADVICE_RESET:
5247  PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length,
5248  CU_MEM_ADVISE_UNSET_READ_MOSTLY,
5249  queue->get_context()->get_device()->get()));
5250  PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length,
5251  CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
5252  queue->get_context()->get_device()->get()));
5253  PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length,
5254  CU_MEM_ADVISE_UNSET_ACCESSED_BY,
5255  queue->get_context()->get_device()->get()));
5256  break;
5257  default:
5258  sycl::detail::pi::die("Unknown advice");
5259  }
5260  if (event) {
5261  result = event_ptr->record();
5262  *event = event_ptr.release();
5263  }
5264  } catch (pi_result err) {
5265  result = err;
5266  } catch (...) {
5267  result = PI_ERROR_UNKNOWN;
5268  }
5269  return result;
5270 }
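
// Editorial note (not part of pi_cuda.cpp): a worked example of the advice
// mapping above. The PI_MEM_ADVICE_CUDA_* values mirror CUmem_advise, whose
// first value CU_MEM_ADVISE_SET_READ_MOSTLY is 1, so
//   advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1
// maps PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY to CU_MEM_ADVISE_SET_READ_MOSTLY;
// the *_HOST variants subtract their own offset again and pass CU_DEVICE_CPU
// as the target device instead of the queue's GPU.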
5271 
5272 // TODO: Implement this. Remember to return true for
5273 // PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented.
5274 pi_result cuda_piextUSMEnqueueFill2D(pi_queue, void *, size_t,
5275  const void *, size_t, size_t, pi_uint32,
5276  const pi_event *, pi_event *) {
5277  sycl::detail::pi::die("piextUSMEnqueueFill2D: not implemented");
5278  return {};
5279 }
5280 
5281 // TODO: Implement this. Remember to return true for
5282 // PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT when it is implemented.
5283 pi_result cuda_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t,
5284  size_t, pi_uint32, const pi_event *,
5285  pi_event *) {
5286  sycl::detail::pi::die("cuda_piextUSMEnqueueMemset2D: not implemented");
5287  return {};
5288 }
5289 
5304 pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking,
5305  void *dst_ptr, size_t dst_pitch,
5306  const void *src_ptr, size_t src_pitch,
5307  size_t width, size_t height,
5308  pi_uint32 num_events_in_wait_list,
5309  const pi_event *event_wait_list,
5310  pi_event *event) {
5311 
5312  assert(queue != nullptr);
5313 
5314  pi_result result = PI_SUCCESS;
5315 
5316  try {
5317  ScopedContext active(queue->get_context());
5318  CUstream cuStream = queue->get_next_transfer_stream();
5319  result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list,
5320  event_wait_list);
5321  if (event) {
5322  *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT,
5323  queue, cuStream);
5324  (*event)->start();
5325  }
5326 
5327  // Determine the direction of copy using cuPointerGetAttribute
5328  // for both the src_ptr and dst_ptr
5329  CUDA_MEMCPY2D cpyDesc = {0};
5330 
5331  getUSMHostOrDevicePtr(src_ptr, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice,
5332  &cpyDesc.srcHost);
5333  getUSMHostOrDevicePtr(dst_ptr, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice,
5334  &cpyDesc.dstHost);
5335 
5336  cpyDesc.dstPitch = dst_pitch;
5337  cpyDesc.srcPitch = src_pitch;
5338  cpyDesc.WidthInBytes = width;
5339  cpyDesc.Height = height;
5340 
5341  result = PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream));
5342 
5343  if (event) {
5344  (*event)->record();
5345  }
5346  if (blocking) {
5347  result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
5348  }
5349  } catch (pi_result err) {
5350  result = err;
5351  }
5352  return result;
5353 }
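
// Editorial sketch (not part of pi_cuda.cpp): a simplified stand-in for the
// getUSMHostOrDevicePtr helper used above, which is defined elsewhere in the
// plugin; it classifies a USM pointer so the copy descriptor can be filled.
static void exampleClassifyUsmPtr(const void *ptr, CUmemorytype *outType,
                                  CUdeviceptr *outDevice, void **outHost) {
  unsigned int type = 0;
  CUresult err = cuPointerGetAttribute(
      &type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr);
  if (err == CUDA_SUCCESS && type == CU_MEMORYTYPE_DEVICE) {
    *outType = CU_MEMORYTYPE_DEVICE;
    *outDevice = (CUdeviceptr)ptr;
    *outHost = nullptr;
  } else {
    // Unregistered pointers report CUDA_ERROR_INVALID_VALUE; treat them, and
    // anything host-resident, as host memory.
    *outType = CU_MEMORYTYPE_HOST;
    *outDevice = 0;
    *outHost = const_cast<void *>(ptr);
  }
}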
5354 
5371 pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
5372  pi_mem_alloc_info param_name,
5373  size_t param_value_size,
5374  void *param_value,
5375  size_t *param_value_size_ret) {
5376  assert(context != nullptr);
5377  assert(ptr != nullptr);
5378  pi_result result = PI_SUCCESS;
5379 
5380  try {
5381  ScopedContext active(context);
5382  switch (param_name) {
5383  case PI_MEM_ALLOC_TYPE: {
5384  unsigned int value;
5385  // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE
5386  CUresult ret = cuPointerGetAttribute(
5387  &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
5388  if (ret == CUDA_ERROR_INVALID_VALUE) {
5389  // pointer not known to the CUDA subsystem
5390  return getInfo(param_value_size, param_value, param_value_size_ret,
5391  PI_MEM_TYPE_UNKNOWN);
5392  }
5393  result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5394  if (value) {
5395  // pointer to managed memory
5396  return getInfo(param_value_size, param_value, param_value_size_ret,
5397  PI_MEM_TYPE_SHARED);
5398  }
5399  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5400  &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr));
5401  assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST);
5402  if (value == CU_MEMORYTYPE_DEVICE) {
5403  // pointer to device memory
5404  return getInfo(param_value_size, param_value, param_value_size_ret,
5405  PI_MEM_TYPE_DEVICE);
5406  }
5407  if (value == CU_MEMORYTYPE_HOST) {
5408  // pointer to host memory
5409  return getInfo(param_value_size, param_value, param_value_size_ret,
5410  PI_MEM_TYPE_HOST);
5411  }
5412  // should never get here
5413 #ifdef _MSC_VER
5414  __assume(0);
5415 #else
5416  __builtin_unreachable();
5417 #endif
5418  return getInfo(param_value_size, param_value, param_value_size_ret,
5419  PI_MEM_TYPE_UNKNOWN);
5420  }
5421  case PI_MEM_ALLOC_BASE_PTR: {
5422 #if CUDA_VERSION >= 10020
5423  // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2
5424  void *value; // RANGE_START_ADDR yields a pointer-sized value
5425  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5426  &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr));
5427  return getInfo(param_value_size, param_value, param_value_size_ret,
5428  value);
5429 #else
5430  return PI_ERROR_INVALID_VALUE;
5431 #endif
5432  }
5433  case PI_MEM_ALLOC_SIZE: {
5434 #if CUDA_VERSION >= 10020
5435  // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2
5436  size_t value; // RANGE_SIZE yields a size_t
5437  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5438  &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr));
5439  return getInfo(param_value_size, param_value, param_value_size_ret,
5440  value);
5441 #else
5442  return PI_ERROR_INVALID_VALUE;
5443 #endif
5444  }
5445  case PI_MEM_ALLOC_DEVICE: {
5446  // get device index associated with this pointer
5447  unsigned int device_idx;
5448  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5449  &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr));
5450 
5451  // currently each device is in its own platform, so find the platform at
5452  // the same index
5453  std::vector<pi_platform> platforms;
5454  platforms.resize(device_idx + 1);
5455  result = cuda_piPlatformsGet(device_idx + 1, platforms.data(), nullptr);
5456 
5457  // get the device from the platform
5458  pi_device device = platforms[device_idx]->devices_[0].get();
5459  return getInfo(param_value_size, param_value, param_value_size_ret,
5460  device);
5461  }
5462  }
5463  } catch (pi_result error) {
5464  result = error;
5465  }
5466  return result;
5467 }
5468 
5469 pi_result cuda_piextEnqueueDeviceGlobalVariableWrite(
5470  pi_queue queue, pi_program program, const char *name,
5471  pi_bool blocking_write, size_t count, size_t offset, const void *src,
5472  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
5473  pi_event *event) {
5474  assert(queue != nullptr);
5475  assert(program != nullptr);
5476 
5477  if (name == nullptr || src == nullptr)
5478  return PI_ERROR_INVALID_VALUE;
5479 
5480  // Since CUDA requires the global variable to be referenced by name, we use
5481  // metadata to find the correct name to access it by.
5482  auto device_global_name_it = program->globalIDMD_.find(name);
5483  if (device_global_name_it == program->globalIDMD_.end())
5484  return PI_ERROR_INVALID_VALUE;
5485  std::string device_global_name = device_global_name_it->second;
5486 
5487  pi_result result = PI_SUCCESS;
5488  try {
5489  CUdeviceptr device_global = 0;
5490  size_t device_global_size = 0;
5491  result = PI_CHECK_ERROR(
5492  cuModuleGetGlobal(&device_global, &device_global_size, program->get(),
5493  device_global_name.c_str()));
5494 
5495  if (offset + count > device_global_size)
5496  return PI_ERROR_INVALID_VALUE;
5497 
5498  result = cuda_piextUSMEnqueueMemcpy(
5499  queue, blocking_write, reinterpret_cast<void *>(device_global + offset),
5500  src, count, num_events_in_wait_list, event_wait_list, event);
5501  } catch (pi_result error) {
5502  result = error;
5503  }
5504  return result;
5505 }
5506 
5507 pi_result cuda_piextEnqueueDeviceGlobalVariableRead(
5508  pi_queue queue, pi_program program, const char *name, pi_bool blocking_read,
5509  size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list,
5510  const pi_event *event_wait_list, pi_event *event) {
5511  assert(queue != nullptr);
5512  assert(program != nullptr);
5513 
5514  if (name == nullptr || dst == nullptr)
5515  return PI_ERROR_INVALID_VALUE;
5516 
5517  // Since CUDA requires the global variable to be referenced by name, we use
5518  // metadata to find the correct name to access it by.
5519  auto device_global_name_it = program->globalIDMD_.find(name);
5520  if (device_global_name_it == program->globalIDMD_.end())
5521  return PI_ERROR_INVALID_VALUE;
5522  std::string device_global_name = device_global_name_it->second;
5523 
5524  pi_result result = PI_SUCCESS;
5525  try {
5526  CUdeviceptr device_global = 0;
5527  size_t device_global_size = 0;
5528  result = PI_CHECK_ERROR(
5529  cuModuleGetGlobal(&device_global, &device_global_size, program->get(),
5530  device_global_name.c_str()));
5531 
5532  if (offset + count > device_global_size)
5533  return PI_ERROR_INVALID_VALUE;
5534 
5535  result = cuda_piextUSMEnqueueMemcpy(
5536  queue, blocking_read, dst,
5537  reinterpret_cast<const void *>(device_global + offset), count,
5538  num_events_in_wait_list, event_wait_list, event);
5539  } catch (pi_result error) {
5540  result = error;
5541  }
5542  return result;
5543 }
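
// Editorial sketch (not part of pi_cuda.cpp): reading a module-scope variable
// directly with the driver API, as both entry points above do before
// forwarding to the USM memcpy path; module is a loaded CUmodule and
// mangledName the name recorded in the program metadata.
static CUresult exampleReadDeviceGlobal(CUmodule module, const char *mangledName,
                                        void *dst, size_t count, size_t offset) {
  CUdeviceptr global = 0;
  size_t globalSize = 0;
  CUresult err = cuModuleGetGlobal(&global, &globalSize, module, mangledName);
  if (err != CUDA_SUCCESS)
    return err;
  if (offset + count > globalSize)
    return CUDA_ERROR_INVALID_VALUE;
  return cuMemcpyDtoH(dst, global + offset, count);
}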
5544 
5545 // This API is called by Sycl RT to notify the end of the plugin lifetime.
5546 // Windows: dynamically loaded plugins might have been unloaded already
5547 // when this is called. Sycl RT holds onto the PI plugin so it can be
5548 // called safely. But this is not transitive. If the PI plugin in turn
5549 // dynamically loaded a different DLL, that may have been unloaded.
5550 // TODO: add a global variable lifetime management code here (see
5551 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
5552 pi_result cuda_piTearDown(void *) {
5553  disableCUDATracing();
5554  return PI_SUCCESS;
5555 }
5556 
5557 pi_result cuda_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
5558  uint64_t *HostTime) {
5559  _pi_event::native_type event;
5560  ScopedContext active(Device->get_context());
5561 
5562  if (DeviceTime) {
5563  PI_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT));
5564  PI_CHECK_ERROR(cuEventRecord(event, 0));
5565  }
5566  if (HostTime) {
5567 
5568  using namespace std::chrono;
5569  *HostTime =
5570  duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
5571  .count();
5572  }
5573 
5574  if (DeviceTime) {
5575  PI_CHECK_ERROR(cuEventSynchronize(event));
5576  *DeviceTime = Device->get_elapsed_time(event);
5577  }
5578 
5579  return PI_SUCCESS;
5580 }
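
// Editorial sketch (not part of pi_cuda.cpp): deriving a device timestamp
// from CUDA events, roughly what get_elapsed_time is assumed to wrap; the
// base event would have been recorded once when the context was created.
static CUresult exampleDeviceTimeNs(CUevent baseEvent, uint64_t *outNs) {
  CUevent now;
  CUresult err = cuEventCreate(&now, CU_EVENT_DEFAULT);
  if (err != CUDA_SUCCESS)
    return err;
  cuEventRecord(now, /*hStream=*/0);
  cuEventSynchronize(now);
  float ms = 0.0f; // cuEventElapsedTime has millisecond resolution
  err = cuEventElapsedTime(&ms, baseEvent, now);
  *outNs = static_cast<uint64_t>(ms * 1e6);
  cuEventDestroy(now);
  return err;
}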
5581 
5582 const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING;
5583 
5584 pi_result piPluginInit(pi_plugin *PluginInit) {
5585  // Check that the major version matches in PiVersion and SupportedVersion
5586  _PI_PLUGIN_VERSION_CHECK(PluginInit->PiVersion, SupportedVersion);
5587 
5588  // PI interface supports higher version or the same version.
5589  size_t PluginVersionSize = sizeof(PluginInit->PluginVersion);
5590  if (strlen(SupportedVersion) >= PluginVersionSize)
5591  return PI_ERROR_INVALID_VALUE;
5592  strncpy(PluginInit->PluginVersion, SupportedVersion, PluginVersionSize);
5593 
5594  // Set whole function table to zero to make it easier to detect if
5595  // functions are not set up below.
5596  std::memset(&(PluginInit->PiFunctionTable), 0,
5597  sizeof(PluginInit->PiFunctionTable));
5598 
5598 
5599  enableCUDATracing();
5600 
5601 // Forward calls to CUDA RT.
5602 #define _PI_CL(pi_api, cuda_api) \
5603  (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api);
5604 
5605  // Platform
5608  // Device
5619  // Context
5628  // Queue
5639  // Memory
5649  // Program
5663  // Kernel
5677  // Event
5689  // Sampler
5694  // Queue commands
5712  // USM
5725  // Device global variable
5730 
5733  _PI_CL(piPluginGetLastError, cuda_piPluginGetLastError)
5736 
5737 #undef _PI_CL
5738 
5739  return PI_SUCCESS;
5740 }
5741 
5742 #ifdef _WIN32
5743 #define __SYCL_PLUGIN_DLL_NAME "pi_cuda.dll"
5744 #include "../common_win_pi_trace/common_win_pi_trace.hpp"
5745 #undef __SYCL_PLUGIN_DLL_NAME
5746 #endif
5747 
5748 } // extern "C"
cuda_piEventRetain
pi_result cuda_piEventRetain(pi_event event)
Definition: pi_cuda.cpp:3850
_pi_sampler::context_
pi_context context_
Definition: pi_cuda.hpp:990
PI_COMMAND_TYPE_USER
@ PI_COMMAND_TYPE_USER
Definition: pi.h:431
setErrorMessage
static void setErrorMessage(const char *message, pi_result error_code)
Definition: pi_esimd_emulator.cpp:157
cuda_piProgramBuild
pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
Loads the images from a PI program into a CUmodule that can be used later on to extract functions (ke...
Definition: pi_cuda.cpp:3333
PI_PROFILING_INFO_COMMAND_START
@ PI_PROFILING_INFO_COMMAND_START
Definition: pi.h:577
piEventGetProfilingInfo
pi_result piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1440
PI_DEVICE_INFO_HOST_UNIFIED_MEMORY
@ PI_DEVICE_INFO_HOST_UNIFIED_MEMORY
Definition: pi.h:259
PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT
#define PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT
Extension to denote native support of assert feature by an arbitrary device piDeviceGetInfo call shou...
Definition: pi.h:801
cuda_piMemBufferCreate
pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, void *host_ptr, pi_mem *ret_mem, const pi_mem_properties *properties)
Creates a PI Memory object using a CUDA memory allocation.
Definition: pi_cuda.cpp:2191
piKernelCreate
pi_result piKernelCreate(pi_program program, const char *kernel_name, pi_kernel *ret_kernel)
Definition: pi_esimd_emulator.cpp:1366
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR
Definition: pi.h:220
piextEnqueueDeviceGlobalVariableRead
pi_result piextEnqueueDeviceGlobalVariableRead(pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API reading data from a device global variable to host.
Definition: pi_esimd_emulator.cpp:2043
getKernelNames
std::string getKernelNames(pi_program)
Finds kernel names by searching for entry points in the PTX source, as the CUDA driver API doesn't ex...
Definition: pi_cuda.cpp:796
PI_DEVICE_INFO_OPENCL_C_VERSION
@ PI_DEVICE_INFO_OPENCL_C_VERSION
Definition: pi.h:277
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:224
CUevent
struct CUevent_st * CUevent
Definition: backend_traits_cuda.hpp:28
pi_buff_rect_region_struct::depth_scalar
size_t depth_scalar
Definition: pi.h:911
_pi_program::globalIDMD_
std::unordered_map< std::string, std::string > globalIDMD_
Definition: pi_cuda.hpp:771
piEventRelease
pi_result piEventRelease(pi_event event)
Definition: pi_esimd_emulator.cpp:1488
PI_MAP_WRITE
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:597
PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY
@ PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY
Definition: pi.h:308
PI_USM_ACCESS
@ PI_USM_ACCESS
Definition: pi.h:1689
PI_DEVICE_INFO_PROFILE
@ PI_DEVICE_INFO_PROFILE
Definition: pi.h:275
piEnqueueKernelLaunch
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1813
piMemGetInfo
pi_result piMemGetInfo(pi_mem mem, pi_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1096
_pi_mem_type
_pi_mem_type
Definition: pi.h:455
PI_KERNEL_INFO_REFERENCE_COUNT
@ PI_KERNEL_INFO_REFERENCE_COUNT
Definition: pi.h:379
PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH
@ PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH
Definition: pi.h:242
PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR
Definition: pi.h:227
_pi_mem::mem_::surface_mem_::get_surface
CUsurfObject get_surface() const noexcept
Definition: pi_cuda.hpp:327
PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES
@ PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES
Definition: pi.h:354
cuda_piEnqueueMemBufferFill
pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:4420
cuda_piextUSMEnqueueMemcpy
pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Definition: pi_cuda.cpp:5072
CUstream
struct CUstream_st * CUstream
Definition: backend_traits_cuda.hpp:27
PI_DEVICE_INFO_DOUBLE_FP_CONFIG
@ PI_DEVICE_INFO_DOUBLE_FP_CONFIG
Definition: pi.h:218
_pi_context_info
_pi_context_info
Definition: pi.h:347
PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH
Definition: pi.h:314
pi_buff_rect_region_struct::height_scalar
size_t height_scalar
Definition: pi.h:910
PI_MEM_TYPE_IMAGE1D
@ PI_MEM_TYPE_IMAGE1D
Definition: pi.h:460
pi_buff_rect_offset_struct
Definition: pi.h:899
_pi_context::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:211
_pi_program::context_
_pi_context * context_
Definition: pi_cuda.hpp:766
piextUSMFree
pi_result piextUSMFree(pi_context context, void *ptr)
Indicates that the allocated USM memory is no longer needed on the runtime side.
Definition: pi_esimd_emulator.cpp:1943
piKernelSetArg
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_esimd_emulator.cpp:1370
_pi_event::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:672
PI_KERNEL_INFO_ATTRIBUTES
@ PI_KERNEL_INFO_ATTRIBUTES
Definition: pi.h:382
cuda_piEnqueueMemBufferRead
pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, size_t offset, size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:2715
_pi_mem::mem_::buffer_mem_::hostPtr_
void * hostPtr_
Pointer associated with this device on the host.
Definition: pi_cuda.hpp:253
_pi_context::deviceId_
_pi_device * deviceId_
Definition: pi_cuda.hpp:182
PI_DEVICE_INFO_DRIVER_VERSION
@ PI_DEVICE_INFO_DRIVER_VERSION
Definition: pi.h:274
cuda_piEnqueueMemImageFill
pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *, const size_t *, const size_t *, pi_uint32, const pi_event *, pi_event *)
TODO: Not implemented in CUDA.
Definition: pi_cuda.cpp:4812
PI_DEVICE_INFO_GPU_EU_COUNT
@ PI_DEVICE_INFO_GPU_EU_COUNT
Definition: pi.h:299
_pi_device_binary_property_struct::Name
char * Name
Definition: pi.h:742
pi_bool
pi_uint32 pi_bool
Definition: pi.h:131
_pi_program::kernelReqdWorkGroupSizeMD_
std::unordered_map< std::string, std::tuple< uint32_t, uint32_t, uint32_t > > kernelReqdWorkGroupSizeMD_
Definition: pi_cuda.hpp:770
_pi_queue::context_
_pi_context * context_
Definition: pi_cuda.hpp:410
PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT
@ PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT
Definition: pi.h:513
PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC
@ PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC
Definition: pi.h:280
CUdeviceptr
unsigned int CUdeviceptr
Definition: backend_traits_cuda.hpp:35
_pi_event::get_compute_stream_token
pi_uint32 get_compute_stream_token() const noexcept
Definition: pi_cuda.hpp:648
_pi_image_format::image_channel_data_type
pi_image_channel_type image_channel_data_type
Definition: pi.h:977
piPluginGetLastError
pi_result piPluginGetLastError(char **message)
API to get plugin-specific warning and error messages.
Definition: pi_esimd_emulator.cpp:165
ErrorMessage
thread_local char ErrorMessage[MaxMessageSize]
Definition: pi_esimd_emulator.cpp:154
PI_QUEUE_INFO_CONTEXT
@ PI_QUEUE_INFO_CONTEXT
Definition: pi.h:365
_pi_queue::barrier_tmp_event_
CUevent barrier_tmp_event_
Definition: pi_cuda.hpp:414
_pi_event::is_started
bool is_started() const noexcept
Definition: pi_cuda.hpp:656
_pi_sampler::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:995
PI_DEVICE_INFO_NAME
@ PI_DEVICE_INFO_NAME
Definition: pi.h:272
cuda_piextEnqueueDeviceGlobalVariableRead
pi_result cuda_piextEnqueueDeviceGlobalVariableRead(pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:5507
_pi_queue::get_next_compute_stream
native_type get_next_compute_stream(pi_uint32 *stream_token=nullptr)
Definition: pi_cuda.cpp:442
piProgramLink
pi_result piProgramLink(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_programs, const pi_program *input_programs, void(*pfn_notify)(pi_program program, void *user_data), void *user_data, pi_program *ret_program)
Definition: pi_opencl.cpp:1230
PI_IMAGE_CHANNEL_TYPE_UNORM_INT8
@ PI_IMAGE_CHANNEL_TYPE_UNORM_INT8
Definition: pi.h:502
PI_PROFILING_INFO_COMMAND_SUBMIT
@ PI_PROFILING_INFO_COMMAND_SUBMIT
Definition: pi.h:576
PI_MEMORY_ORDER_ACQUIRE
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_ACQUIRE
Definition: pi.h:562
_pi_sampler::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:997
PI_PROGRAM_INFO_BINARY_SIZES
@ PI_PROGRAM_INFO_BINARY_SIZES
Definition: pi.h:341
cuda_piProgramLink
pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_programs, const pi_program *input_programs, void(*pfn_notify)(pi_program program, void *user_data), void *user_data, pi_program *ret_program)
Creates a new PI program object that is the outcome of linking all input programs.
Definition: pi_cuda.cpp:3447
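The cuLink family of driver calls is the natural mechanism for the linking behavior described above. A hedged sketch, assuming every input program carries a PTX image and trimming the error-log plumbing:

#include <cuda.h>
#include <vector>

CUresult linkPrograms(const std::vector<const char *> &ptxImages,
                      const std::vector<size_t> &ptxSizes,
                      CUmodule *moduleOut) {
  CUlinkState state;
  CUresult err = cuLinkCreate(0, nullptr, nullptr, &state);
  if (err != CUDA_SUCCESS)
    return err;
  for (size_t i = 0; i < ptxImages.size(); ++i) {
    err = cuLinkAddData(state, CU_JIT_INPUT_PTX, (void *)ptxImages[i],
                        ptxSizes[i], nullptr, 0, nullptr, nullptr);
    if (err != CUDA_SUCCESS) {
      cuLinkDestroy(state);
      return err;
    }
  }
  void *cubin = nullptr;
  size_t cubinSize = 0;
  err = cuLinkComplete(state, &cubin, &cubinSize);
  if (err == CUDA_SUCCESS)
    err = cuModuleLoadData(moduleOut, cubin); // cubin is owned by 'state',
  cuLinkDestroy(state);                       // so load before destroying it.
  return err;
}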
cuda_piextKernelSetArgSampler
pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
Definition: pi_cuda.cpp:2888
PI_MEM_FLAGS_HOST_PTR_COPY
constexpr pi_mem_flags PI_MEM_FLAGS_HOST_PTR_COPY
Definition: pi.h:591
cuda_piMemBufferPartition
pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, pi_buffer_create_type buffer_create_type, void *buffer_create_info, pi_mem *memObj)
Implements a buffer partition in the CUDA backend.
Definition: pi_cuda.cpp:2331
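Because CUdeviceptr supports pointer arithmetic, a sub-buffer needs no allocation of its own; it can be modeled as the parent allocation plus a byte origin. The struct below is a conceptual illustration, not the plugin's real _pi_mem layout.

#include <cuda.h>
#include <cstddef>

struct SubBuffer {
  CUdeviceptr parent; // base allocation
  size_t origin;      // byte offset into the parent
  size_t size;        // byte length of the sub-region

  CUdeviceptr get() const { return parent + origin; }
};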
cuda_piEnqueueMemImageWrite
pi_result cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, pi_bool blocking_write, const size_t *origin, const size_t *region, size_t input_row_pitch, size_t input_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:4674
_pi_image_format::image_channel_order
pi_image_channel_order image_channel_order
Definition: pi.h:976
cuda_piDevicesGet
pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type, pi_uint32 num_entries, pi_device *devices, pi_uint32 *num_devices)
Definition: pi_cuda.cpp:959
PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT
@ PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT
Definition: pi.h:360
PI_MEM_ALLOC_SIZE
@ PI_MEM_ALLOC_SIZE
Definition: pi.h:1698
_pi_context::get
native_type get() const noexcept
Definition: pi_cuda.hpp:207
PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN
@ PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN
Definition: pi.h:284
enableCUDATracing
void enableCUDATracing()
Definition: tracing.cpp:72
piextProgramGetNativeHandle
pi_result piextProgramGetNativeHandle(pi_program program, pi_native_handle *nativeHandle)
Gets the native handle of a PI program object.
Definition: pi_esimd_emulator.cpp:1357
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
_pi_image_desc::image_type
pi_mem_type image_type
Definition: pi.h:981
piProgramRetain
pi_result piProgramRetain(pi_program program)
Definition: pi_esimd_emulator.cpp:1353
piextUSMEnqueueMemcpy2D
pi_result piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, void *dst_ptr, size_t dst_pitch, const void *src_ptr, size_t src_pitch, size_t width, size_t height, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM 2D Memcpy API.
Definition: pi_esimd_emulator.cpp:1997
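A pitched 2D copy maps directly onto a CUDA_MEMCPY2D descriptor. The sketch below uses CU_MEMORYTYPE_UNIFIED so the driver resolves whether each USM pointer is host- or device-resident; the helper name is hypothetical.

#include <cuda.h>

CUresult usmMemcpy2D(void *dst, size_t dstPitch, const void *src,
                     size_t srcPitch, size_t width, size_t height,
                     CUstream stream) {
  CUDA_MEMCPY2D copy = {};
  copy.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
  copy.srcDevice = (CUdeviceptr)src;
  copy.srcPitch = srcPitch;
  copy.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
  copy.dstDevice = (CUdeviceptr)dst;
  copy.dstPitch = dstPitch;
  copy.WidthInBytes = width;
  copy.Height = height;
  return cuMemcpy2DAsync(&copy, stream);
}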
_pi_plugin
Definition: pi.h:1992
_pi_program::get_context
pi_context get_context() const
Definition: pi_cuda.hpp:789
sycl::_V1::errc::event
@ event
PI_PROGRAM_INFO_SOURCE
@ PI_PROGRAM_INFO_SOURCE
Definition: pi.h:340
cuda_piextUSMEnqueueMemset
pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Definition: pi_cuda.cpp:5037
enqueueEventWait
pi_result enqueueEventWait(pi_queue queue, pi_event event)
Definition: pi_cuda.cpp:679
_pi_program::MAX_LOG_SIZE
constexpr static size_t MAX_LOG_SIZE
Definition: pi_cuda.hpp:773
PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE
Definition: pi.h:250
cuda_piEnqueueEventsWait
pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Enqueues a wait on the given CUstream for all events.
Definition: pi_cuda.cpp:3892
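Stream-side waiting needs no host blocking: cuStreamWaitEvent queues the dependency on the stream itself. A minimal sketch over a list of native events:

#include <cuda.h>

CUresult waitOnEvents(CUstream stream, const CUevent *events,
                      unsigned int numEvents) {
  for (unsigned int i = 0; i < numEvents; ++i) {
    // Make 'stream' wait for each event without blocking the host thread.
    CUresult err = cuStreamWaitEvent(stream, events[i], 0);
    if (err != CUDA_SUCCESS)
      return err;
  }
  return CUDA_SUCCESS;
}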
piDevicePartition
pi_result piDevicePartition(pi_device device, const pi_device_partition_property *properties, pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices)
Definition: pi_esimd_emulator.cpp:822
PI_KERNEL_COMPILE_NUM_SUB_GROUPS
@ PI_KERNEL_COMPILE_NUM_SUB_GROUPS
Definition: pi.h:409
_pi_mem_advice
_pi_mem_advice
Definition: pi.h:465
cuda_piEnqueueMemBufferCopy
pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:4331
PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
@ PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES
Definition: pi.h:214
piEnqueueMemBufferCopy
pi_result piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1621
cuda_piEventGetProfilingInfo
pi_result cuda_piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Obtain profiling information from PI CUDA events. TODO: Timings from CUDA are only elapsed time.
Definition: pi_cuda.cpp:3807
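The TODO above reflects a real driver limitation: cuEventElapsedTime only yields a float millisecond delta between two recorded events, so absolute timestamps must be synthesized against a known base. A sketch, where the base event and its host timestamp are assumed per-queue bookkeeping:

#include <cuda.h>
#include <cstdint>

// 'baseEvent' is a hypothetical event recorded at host time 'baseNanoseconds'.
CUresult getEventEndTimeNs(CUevent baseEvent, uint64_t baseNanoseconds,
                           CUevent endEvent, uint64_t *timeNsOut) {
  float ms = 0.0f;
  // Both events must have been recorded and completed for this to succeed.
  CUresult err = cuEventElapsedTime(&ms, baseEvent, endEvent);
  if (err == CUDA_SUCCESS)
    *timeNsOut = baseNanoseconds + (uint64_t)(ms * 1.0e6f);
  return err;
}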
PI_DEVICE_INFO_MAX_COMPUTE_UNITS
@ PI_DEVICE_INFO_MAX_COMPUTE_UNITS
Definition: pi.h:212
_pi_stream_guard
std::unique_lock< std::mutex > _pi_stream_guard
Definition: pi_cuda.hpp:67
splitMetadataName
std::pair< std::string, std::string > splitMetadataName(const std::string &metadataName)
Definition: pi_cuda.cpp:698
PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION
@ PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION
Definition: pi.h:471
PI_EVENT_INFO_CONTEXT
@ PI_EVENT_INFO_CONTEXT
Definition: pi.h:415
_pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr
@ alloc_host_ptr
_pi_plugin::PluginVersion
char PluginVersion[20]
Definition: pi.h:2002
PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU
@ PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU
Definition: pi.h:321
_pi_result
_pi_result
Definition: pi.h:140
PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY
@ PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY
Definition: pi.h:469
PI_PROFILING_INFO_COMMAND_QUEUED
@ PI_PROFILING_INFO_COMMAND_QUEUED
Definition: pi.h:575
sycl::_V1::detail::memcpy
void memcpy(void *Dst, const void *Src, size_t Size)
Definition: memcpy.hpp:16
piextQueueGetNativeHandle
pi_result piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle)
Gets the native handle of a PI queue object.
Definition: pi_esimd_emulator.cpp:1014
piTearDown
pi_result piTearDown(void *PluginParameter)
API to notify that the plugin should clean up its resources.
Definition: pi_esimd_emulator.cpp:2059
_pi_queue::transfer_applied_barrier_
std::vector< bool > transfer_applied_barrier_
Definition: pi_cuda.hpp:409
PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST
@ PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST
Definition: pi.h:477
cuda_piEventCreate
pi_result cuda_piEventCreate(pi_context, pi_event *)
Definition: pi_cuda.cpp:3772
ErrorMessageCode
thread_local pi_result ErrorMessageCode
Definition: pi_esimd_emulator.cpp:153
pi_context_properties
intptr_t pi_context_properties
Definition: pi.h:546
piEnqueueMemUnmap
pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, void *mapped_ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1684
_pi_queue::for_each_stream
void for_each_stream(T &&f)
Definition: pi_cuda.hpp:526
PI_SAMPLER_INFO_FILTER_MODE
@ PI_SAMPLER_INFO_FILTER_MODE
Definition: pi.h:527
PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
Definition: pi.h:387
PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8
@ PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8
Definition: pi.h:507
sycl::_V1::ext::oneapi::experimental::alignment
constexpr alignment_key::value_t< K > alignment
Definition: properties.hpp:349
_pi_queue::device_
_pi_device * device_
Definition: pi_cuda.hpp:411
PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST
@ PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST
Definition: pi.h:474
piSamplerGetInfo
pi_result piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1522
cuda_piMemImageGetInfo
pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *, size_t *)
TODO: Not implemented.
Definition: pi_cuda.cpp:3307
_pi_event::get_queued_time
pi_uint64 get_queued_time() const
Definition: pi_cuda.cpp:605
piPlatformGetInfo
pi_result piPlatformGetInfo(pi_platform platform, pi_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:444
_pi_queue::default_num_transfer_streams
static constexpr int default_num_transfer_streams
Definition: pi_cuda.hpp:398
PI_DEVICE_INFO_GPU_SLICES
@ PI_DEVICE_INFO_GPU_SLICES
Definition: pi.h:301
piDeviceRetain
pi_result piDeviceRetain(pi_device device)
Definition: pi_esimd_emulator.cpp:572
PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE
@ PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE
Definition: pi.h:236
piProgramCompile
pi_result piProgramCompile(pi_program program, pi_uint32 num_devices, const pi_device *device_list, const char *options, pi_uint32 num_input_headers, const pi_program *input_headers, const char **header_include_names, void(*pfn_notify)(pi_program program, void *user_data), void *user_data)
_pi_device_type
_pi_device_type
Definition: pi.h:187
cuda_piEnqueueMemBufferCopyRect
pi_result cuda_piEnqueueMemBufferCopyRect(pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:4375
piContextGetInfo
pi_result piContextGetInfo(pi_context context, pi_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:867
piQueueRelease
pi_result piQueueRelease(pi_queue command_queue)
Definition: pi_esimd_emulator.cpp:986
cuda_piEventSetCallback
pi_result cuda_piEventSetCallback(pi_event, pi_int32, pfn_notify, void *)
Definition: pi_cuda.cpp:3840
_pi_mem::mem_::buffer_mem_::get_map_offset
size_t get_map_offset(void *) const noexcept
Definition: pi_cuda.hpp:283
PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE
@ PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE
Definition: pi.h:311
_pi_mem::context_
pi_context context_
Definition: pi_cuda.hpp:230
PI_DEVICE_INFO_REFERENCE_COUNT
@ PI_DEVICE_INFO_REFERENCE_COUNT
Definition: pi.h:270
_pi_device::get_max_work_group_size
int get_max_work_group_size() const noexcept
Definition: pi_cuda.hpp:129
cuda_piextUSMFree
pi_result cuda_piextUSMFree(pi_context context, void *ptr)
USM: Frees the given USM pointer associated with the context.
Definition: pi_cuda.cpp:5010
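One defensive way to free a USM pointer is to ask the driver what kind of allocation it refers to, then dispatch to the matching free call; a sketch under that assumption:

#include <cuda.h>

CUresult usmFree(void *ptr) {
  CUmemorytype memType;
  CUresult err = cuPointerGetAttribute(
      &memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr);
  if (err != CUDA_SUCCESS)
    return err;
  if (memType == CU_MEMORYTYPE_HOST)
    return cuMemFreeHost(ptr);        // host (pinned) USM allocation
  return cuMemFree((CUdeviceptr)ptr); // device or shared allocation
}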
PI_MEMORY_SCOPE_WORK_ITEM
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_ITEM
Definition: pi.h:568
PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
@ PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
Definition: pi.h:292
cuda_piextDeviceCreateWithNativeHandle
pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, pi_platform platform, pi_device *piDevice)
Creates a PI device object from a CUDA device handle.
Definition: pi_cuda.cpp:2043
_pi_mem::is_buffer
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:376
PI_DEVICE_INFO_PLATFORM
@ PI_DEVICE_INFO_PLATFORM
Definition: pi.h:269
_pi_queue::backend_has_ownership
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:625
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
@ PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
Definition: pi.h:303
cuda_definitions.hpp
_pi_queue::get_next_transfer_stream
native_type get_next_transfer_stream()
Definition: pi_cuda.cpp:503
cuda_piMemGetInfo
pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *)
Definition: pi_cuda.cpp:2390
PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
@ PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
Definition: pi.h:205
PI_PROGRAM_BUILD_INFO_OPTIONS
@ PI_PROGRAM_BUILD_INFO_OPTIONS
Definition: pi.h:165
PI_PROGRAM_BUILD_STATUS_SUCCESS
@ PI_PROGRAM_BUILD_STATUS_SUCCESS
Definition: pi.h:173
PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
@ PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
Definition: pi.h:355
piextDeviceCreateWithNativeHandle
pi_result piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, pi_platform platform, pi_device *device)
Creates a PI device object from a native handle.
Definition: pi_esimd_emulator.cpp:831
_pi_queue::sync_streams
void sync_streams(T &&f)
Definition: pi_cuda.hpp:547
PI_DEVICE_INFO_DEVICE_ID
@ PI_DEVICE_INFO_DEVICE_ID
Definition: pi.h:297
sycl
Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
_pi_event::get_stream
CUstream get_stream() const noexcept
Definition: pi_cuda.hpp:646
piKernelGetSubGroupInfo
pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query sub-group information from a kernel.
Definition: pi_esimd_emulator.cpp:1392
commonEnqueueMemBufferCopyRect
static pi_result commonEnqueueMemBufferCopyRect(CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch)
General 3D memory copy operation.
Definition: pi_cuda.cpp:4172
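A general rectangular copy like the one above maps onto a single CUDA_MEMCPY3D descriptor. The device-to-host variant below is a sketch only; it assumes each slice pitch is a whole multiple of the corresponding row pitch.

#include <cuda.h>

CUresult copyRectD2H(CUdeviceptr src, size_t srcRowPitch, size_t srcSlicePitch,
                     void *dst, size_t dstRowPitch, size_t dstSlicePitch,
                     size_t widthBytes, size_t height, size_t depth,
                     CUstream stream) {
  CUDA_MEMCPY3D params = {};
  params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
  params.srcDevice = src;
  params.srcPitch = srcRowPitch;
  params.srcHeight = srcSlicePitch / srcRowPitch; // rows per slice
  params.dstMemoryType = CU_MEMORYTYPE_HOST;
  params.dstHost = dst;
  params.dstPitch = dstRowPitch;
  params.dstHeight = dstSlicePitch / dstRowPitch;
  params.WidthInBytes = widthBytes;
  params.Height = height;
  params.Depth = depth;
  return cuMemcpy3DAsync(&params, stream);
}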
PI_DEVICE_INFO_ADDRESS_BITS
@ PI_DEVICE_INFO_ADDRESS_BITS
Definition: pi.h:235
_pi_mem::mem_::surface_mem_::get_array
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:325
max
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
_pi_image_desc::image_height
size_t image_height
Definition: pi.h:983
cuda_piDeviceGetInfo