DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_cuda.cpp
//==---------- pi_cuda.cpp - CUDA Plugin ----------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/detail/pi.hpp>
#include <pi_cuda.hpp>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

namespace {
std::string getCudaVersionString() {
  int driver_version = 0;
  cuDriverGetVersion(&driver_version);
  // The version is returned as (1000 * major + 10 * minor).
  std::stringstream stream;
  stream << "CUDA " << driver_version / 1000 << "."
         << driver_version % 1000 / 10;
  return stream.str();
}

pi_result map_error(CUresult result) {
  switch (result) {
  case CUDA_SUCCESS:
    return PI_SUCCESS;
  case CUDA_ERROR_NOT_PERMITTED:
    return PI_ERROR_INVALID_OPERATION;
  case CUDA_ERROR_INVALID_CONTEXT:
    return PI_ERROR_INVALID_CONTEXT;
  case CUDA_ERROR_INVALID_DEVICE:
    return PI_ERROR_INVALID_DEVICE;
  case CUDA_ERROR_INVALID_VALUE:
    return PI_ERROR_INVALID_VALUE;
  case CUDA_ERROR_OUT_OF_MEMORY:
    return PI_ERROR_OUT_OF_HOST_MEMORY;
  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
    return PI_ERROR_OUT_OF_RESOURCES;
  default:
    return PI_ERROR_UNKNOWN;
  }
}

// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR
constexpr size_t MaxMessageSize = 256;
thread_local pi_result ErrorMessageCode = PI_SUCCESS;
thread_local char ErrorMessage[MaxMessageSize];

// Utility function for setting a message and warning
static void setErrorMessage(const char *message, pi_result error_code) {
  // The message must be strictly shorter than the buffer to leave room for
  // the null terminator that strcpy appends.
  assert(strlen(message) < MaxMessageSize);
  strcpy(ErrorMessage, message);
  ErrorMessageCode = error_code;
}

// Returns plugin specific error and warning messages
pi_result cuda_piPluginGetLastError(char **message) {
  *message = &ErrorMessage[0];
  return ErrorMessageCode;
}

// Iterates over the event wait list, returns correct pi_result error codes.
// Invokes the callback for the latest event of each queue in the wait list.
// The callback must take a single pi_event argument and return a pi_result.
template <typename Func>
pi_result forLatestEvents(const pi_event *event_wait_list,
                          std::size_t num_events_in_wait_list, Func &&f) {

  if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
    return PI_ERROR_INVALID_EVENT_WAIT_LIST;
  }

  // Fast path if we only have a single event
  if (num_events_in_wait_list == 1) {
    return f(event_wait_list[0]);
  }

  std::vector<pi_event> events{event_wait_list,
                               event_wait_list + num_events_in_wait_list};
  std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
    // Tiered sort creating sublists of streams (smallest value first) in which
    // the corresponding events are sorted into a sequence of newest first.
    return e0->get_stream() < e1->get_stream() ||
           (e0->get_stream() == e1->get_stream() &&
            e0->get_event_id() > e1->get_event_id());
  });

  bool first = true;
  CUstream lastSeenStream = 0;
  for (pi_event event : events) {
    if (!event || (!first && event->get_stream() == lastSeenStream)) {
      continue;
    }

    first = false;
    lastSeenStream = event->get_stream();

    auto result = f(event);
    if (result != PI_SUCCESS) {
      return result;
    }
  }

  return PI_SUCCESS;
}
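
// Worked example (illustrative, not part of the original file): after the
// tiered sort above, a wait list {A/id3, A/id1, B/id7} collapses to the
// callback sequence {A/id3, B/id7}. Waiting on the newest event of an
// in-order CUDA stream implicitly covers all older events on that stream.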

// Converts a CUDA driver error into a PI error code and reports it on
// stderr. If the PI_CUDA_ABORT environment variable is set, the process
// aborts instead of throwing, which is useful for debugging.
pi_result check_error(CUresult result, const char *function, int line,
                      const char *file) {
  if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) {
    return PI_SUCCESS;
  }

  const char *errorString = nullptr;
  const char *errorName = nullptr;
  cuGetErrorName(result, &errorName);
  cuGetErrorString(result, &errorString);
  std::cerr << "\nPI CUDA ERROR:"
            << "\n\tValue:           " << result
            << "\n\tName:            " << errorName
            << "\n\tDescription:     " << errorString
            << "\n\tFunction:        " << function
            << "\n\tSource Location: " << file << ":" << line << "\n"
            << std::endl;

  if (std::getenv("PI_CUDA_ABORT") != nullptr) {
    std::abort();
  }

  throw map_error(result);
}

#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)

// RAII helper that makes the CUcontext of a given PI context active on the
// calling thread. The context is only switched when it differs from the one
// currently bound, avoiding redundant cuCtxSetCurrent calls, and it is left
// active when the scope ends.
class ScopedContext {
public:
  ScopedContext(pi_context ctxt) {
    if (!ctxt) {
      throw PI_ERROR_INVALID_CONTEXT;
    }

    set_context(ctxt->get());
  }

  ScopedContext(CUcontext ctxt) { set_context(ctxt); }

  ~ScopedContext() {}

private:
  void set_context(CUcontext desired) {
    CUcontext original = nullptr;

    PI_CHECK_ERROR(cuCtxGetCurrent(&original));

    // Make sure the desired context is active on the current thread, setting
    // it if necessary
    if (original != desired) {
      PI_CHECK_ERROR(cuCtxSetCurrent(desired));
    }
  }
};
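
// Usage sketch (illustrative, not part of the original file): the PI entry
// points below activate the context for the duration of a scope, e.g.
//
//   try {
//     ScopedContext active(command_queue->get_context());
//     // ... CUDA driver API calls against the now-active context ...
//   } catch (pi_result err) {
//     return err;
//   }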

template <typename T, typename Assign>
pi_result getInfoImpl(size_t param_value_size, void *param_value,
                      size_t *param_value_size_ret, T value, size_t value_size,
                      Assign &&assign_func) {

  if (param_value != nullptr) {

    if (param_value_size < value_size) {
      return PI_ERROR_INVALID_VALUE;
    }

    assign_func(param_value, value, value_size);
  }

  if (param_value_size_ret != nullptr) {
    *param_value_size_ret = value_size;
  }

  return PI_SUCCESS;
}

template <typename T>
pi_result getInfo(size_t param_value_size, void *param_value,
                  size_t *param_value_size_ret, T value) {

  auto assignment = [](void *param_value, T value, size_t value_size) {
    // Ignore unused parameter
    (void)value_size;

    *static_cast<T *>(param_value) = value;
  };

  return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
                     sizeof(T), assignment);
}

template <typename T>
pi_result getInfoArray(size_t array_length, size_t param_value_size,
                       void *param_value, size_t *param_value_size_ret,
                       T *value) {
  return getInfoImpl(param_value_size, param_value, param_value_size_ret, value,
                     array_length * sizeof(T), memcpy);
}

template <>
pi_result getInfo<const char *>(size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret,
                                const char *value) {
  return getInfoArray(strlen(value) + 1, param_value_size, param_value,
                      param_value_size_ret, value);
}
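
// Illustrative sketch (not part of the original file): the helpers above
// implement the usual two-call query idiom; a caller first asks for the
// required size, then for the value:
//
//   size_t size = 0;
//   getInfo(0, nullptr, &size, "FULL PROFILE"); // size == strlen(...) + 1
//   std::vector<char> buf(size);
//   getInfo(size, buf.data(), nullptr, "FULL PROFILE");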

int getAttribute(pi_device device, CUdevice_attribute attribute) {
  int value;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS);
  return value;
}

// Determine local work sizes that result in uniform work groups.
// The default threadsPerBlock only requires handling the first work_dim
// dimension.
void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size,
                        const size_t maxThreadsPerBlock[3], pi_kernel kernel,
                        pi_uint32 local_size) {
  assert(threadsPerBlock != nullptr);
  assert(global_work_size != nullptr);
  assert(kernel != nullptr);
  int recommendedBlockSize, minGrid;

  PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
      &minGrid, &recommendedBlockSize, kernel->get(), NULL, local_size,
      maxThreadsPerBlock[0]));

  (void)minGrid; // Not used, avoid warnings

  threadsPerBlock[0] = std::min(
      maxThreadsPerBlock[0],
      std::min(global_work_size[0], static_cast<size_t>(recommendedBlockSize)));

  // Find a local work group size that is a divisor of the global
  // work group size to produce uniform work groups.
  while (0u != (global_work_size[0] % threadsPerBlock[0])) {
    --threadsPerBlock[0];
  }
}
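
// Worked example (illustrative, not part of the original file): for
// global_work_size[0] == 1000 with a recommended block size of 256, the
// divisor search decrements threadsPerBlock[0] from 256 down to 250, giving
// four uniform work groups of 250 threads (1000 % 250 == 0).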

pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream,
                            pi_uint32 num_events_in_wait_list,
                            const pi_event *event_wait_list) {
  if (!event_wait_list) {
    return PI_SUCCESS;
  }
  try {
    ScopedContext active(command_queue->get_context());

    auto result = forLatestEvents(
        event_wait_list, num_events_in_wait_list,
        [stream](pi_event event) -> pi_result {
          if (event->get_stream() == stream) {
            return PI_SUCCESS;
          } else {
            return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0));
          }
        });

    if (result != PI_SUCCESS) {
      return result;
    }
    return PI_SUCCESS;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_UNKNOWN;
  }
}

} // anonymous namespace

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace detail {
namespace pi {

// Report error and no return (keeps compiler from printing warnings).
// TODO: Probably change that to throw a catchable exception,
// but for now it is useful to see every failure.
//
[[noreturn]] void die(const char *Message) {
  std::cerr << "pi_die: " << Message << std::endl;
  std::terminate();
}

// Reports error messages
void cuPrint(const char *Message) {
  std::cerr << "pi_print: " << Message << std::endl;
}

void assertion(bool Condition, const char *Message) {
  if (!Condition)
    die(Message);
}

} // namespace pi
} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

//--------------
// PI object implementation

extern "C" {

// Required in a number of functions, so forward declare here
pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
                                   pi_uint32 num_events_in_wait_list,
                                   const pi_event *event_wait_list,
                                   pi_event *event);
pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
                                              pi_uint32 num_events_in_wait_list,
                                              const pi_event *event_wait_list,
                                              pi_event *event);
pi_result cuda_piEventRelease(pi_event event);
pi_result cuda_piEventRetain(pi_event event);

} // extern "C"


CUstream _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) {
  pi_uint32 stream_i;
  while (true) {
    if (num_compute_streams_ < compute_streams_.size()) {
      // the check above is for performance - so as not to lock mutex every time
      std::lock_guard<std::mutex> guard(compute_stream_mutex_);
      // The second check is done after mutex is locked so other threads cannot
      // change num_compute_streams_ after that
      if (num_compute_streams_ < compute_streams_.size()) {
        PI_CHECK_ERROR(
            cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_));
      }
    }
    stream_i = compute_stream_idx_++;
    // if a stream has been reused before it was next selected in round-robin
    // fashion, we want to delay its next use and instead select another one
    // that is more likely to have completed all the enqueued work.
    if (delay_compute_[stream_i % compute_streams_.size()]) {
      delay_compute_[stream_i % compute_streams_.size()] = false;
    } else {
      break;
    }
  }
  if (stream_token) {
    *stream_token = stream_i;
  }
  return compute_streams_[stream_i % compute_streams_.size()];
}

CUstream _pi_queue::get_next_compute_stream(pi_uint32 num_events_in_wait_list,
                                            const pi_event *event_wait_list,
                                            _pi_stream_guard &guard,
                                            pi_uint32 *stream_token) {
  for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) {
    pi_uint32 token = event_wait_list[i]->get_stream_token();
    if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) {
      std::unique_lock<std::mutex> compute_sync_guard(
          compute_stream_sync_mutex_);
      // redo the check after lock to avoid data races on
      // last_sync_compute_streams_
      if (can_reuse_stream(token)) {
        delay_compute_[token % delay_compute_.size()] = true;
        if (stream_token) {
          *stream_token = token;
        }
        guard = _pi_stream_guard{std::move(compute_sync_guard)};
        return event_wait_list[i]->get_stream();
      }
    }
  }
  guard = {};
  return get_next_compute_stream(stream_token);
}

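// Illustrative note (not part of the original file): when the overload above
// hands back a stream found in the wait list, it marks that stream's token in
// delay_compute_, so the round-robin overload will skip that stream once on
// its next pass and prefer one that is more likely to be idle.
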
CUstream _pi_queue::get_next_transfer_stream() {
  if (transfer_streams_.empty()) { // for example in in-order queue
    return get_next_compute_stream();
  }
  if (num_transfer_streams_ < transfer_streams_.size()) {
    // the check above is for performance - so as not to lock mutex every time
    std::lock_guard<std::mutex> guard(transfer_stream_mutex_);
    // The second check is done after mutex is locked so other threads cannot
    // change num_transfer_streams_ after that
    if (num_transfer_streams_ < transfer_streams_.size()) {
      PI_CHECK_ERROR(
          cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_));
    }
  }
  return transfer_streams_[transfer_stream_idx_++ % transfer_streams_.size()];
}

_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue,
                     CUstream stream, pi_uint32 stream_token)
    : commandType_{type}, refCount_{1}, has_ownership_{true},
      hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false},
      streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr},
      evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} {

  bool profilingEnabled = queue_->properties_ & PI_QUEUE_PROFILING_ENABLE;

  PI_CHECK_ERROR(cuEventCreate(
      &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));

  if (profilingEnabled) {
    PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
    PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
  }

  if (queue_ != nullptr) {
    cuda_piQueueRetain(queue_);
  }
  cuda_piContextRetain(context_);
}

_pi_event::_pi_event(pi_context context, CUevent eventNative)
    : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false},
      hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false},
      streamToken_{std::numeric_limits<pi_uint32>::max()}, evEnd_{eventNative},
      evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr},
      context_{context} {}

_pi_event::~_pi_event() {
  if (queue_ != nullptr) {
    cuda_piQueueRelease(queue_);
  }
  cuda_piContextRelease(context_);
}

pi_result _pi_event::start() {
  assert(!is_started());
  pi_result result = PI_SUCCESS;

  try {
    if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
      // NOTE: This relies on the default stream to be unused.
      result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
      result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_));
    }
  } catch (pi_result error) {
    result = error;
  }

  isStarted_ = true;
  return result;
}

bool _pi_event::is_completed() const noexcept {
  if (!isRecorded_) {
    return false;
  }
  if (!hasBeenWaitedOn_) {
    const CUresult ret = cuEventQuery(evEnd_);
    if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) {
      PI_CHECK_ERROR(ret);
      return false;
    }
    if (ret == CUDA_ERROR_NOT_READY) {
      return false;
    }
  }
  return true;
}

pi_uint64 _pi_event::get_queued_time() const {
  float milliSeconds = 0.0f;
  assert(is_started());

  PI_CHECK_ERROR(
      cuEventElapsedTime(&milliSeconds, _pi_platform::evBase_, evQueued_));
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}

pi_uint64 _pi_event::get_start_time() const {
  float milliSeconds = 0.0f;
  assert(is_started());

  PI_CHECK_ERROR(
      cuEventElapsedTime(&milliSeconds, _pi_platform::evBase_, evStart_));
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}

pi_uint64 _pi_event::get_end_time() const {
  float milliSeconds = 0.0f;
  assert(is_started() && is_recorded());

  PI_CHECK_ERROR(
      cuEventElapsedTime(&milliSeconds, _pi_platform::evBase_, evEnd_));
  return static_cast<pi_uint64>(milliSeconds * 1.0e6);
}

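// Illustrative arithmetic (not part of the original file): cuEventElapsedTime
// reports milliseconds relative to the platform base event evBase_, while PI
// profiling info is expressed in nanoseconds, hence the factor of 1.0e6:
//
//   5.25 ms since evBase_  ->  static_cast<pi_uint64>(5.25f * 1.0e6)
//                          ==  5250000 ns
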
pi_result _pi_event::record() {

  if (is_recorded() || !is_started()) {
    return PI_ERROR_INVALID_EVENT;
  }

  pi_result result = PI_ERROR_INVALID_OPERATION;

  if (!queue_) {
    return PI_ERROR_INVALID_QUEUE;
  }

  try {
    eventId_ = queue_->get_next_event_id();
    if (eventId_ == 0) {
      cl::sycl::detail::pi::die(
          "Unrecoverable program state reached in event identifier overflow");
    }
    result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_));
  } catch (pi_result error) {
    result = error;
  }

  if (result == PI_SUCCESS) {
    isRecorded_ = true;
  }

  return result;
}

pi_result _pi_event::wait() {
  pi_result retErr;
  try {
    retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
    hasBeenWaitedOn_ = true;
  } catch (pi_result error) {
    retErr = error;
  }

  return retErr;
}

pi_result _pi_event::release() {
  if (!backend_has_ownership())
    return PI_SUCCESS;

  assert(queue_ != nullptr);

  PI_CHECK_ERROR(cuEventDestroy(evEnd_));

  if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
    PI_CHECK_ERROR(cuEventDestroy(evQueued_));
    PI_CHECK_ERROR(cuEventDestroy(evStart_));
  }

  return PI_SUCCESS;
}

// Makes all future work submitted to queue wait for all work captured in
// event.
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
  // for native events, the cuStreamWaitEvent call is used.
  // This makes all future work submitted to stream wait for all
  // work captured in event.
  queue->for_each_stream([e = event->get()](CUstream s) {
    PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0));
  });
  return PI_SUCCESS;
}

_pi_program::_pi_program(pi_context ctxt)
    : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1},
      context_{ctxt}, kernelReqdWorkGroupSizeMD_{} {
  cuda_piContextRetain(context_);
}

_pi_program::~_pi_program() { cuda_piContextRelease(context_); }

bool get_kernel_metadata(std::string metadataName, const char *tag,
                         std::string &kernelName) {
  const size_t tagLength = strlen(tag);
  const size_t metadataNameLength = metadataName.length();
  if (metadataNameLength >= tagLength &&
      metadataName.compare(metadataNameLength - tagLength, tagLength, tag) ==
          0) {
    kernelName = metadataName.substr(0, metadataNameLength - tagLength);
    return true;
  }
  return false;
}

pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata,
                                    size_t length) {
  for (size_t i = 0; i < length; ++i) {
    const pi_device_binary_property metadataElement = metadata[i];
    std::string metadataElementName{metadataElement->Name};
    std::string kernelName;

    // If metadata is reqd_work_group_size record it for the corresponding
    // kernel name.
    if (get_kernel_metadata(metadataElementName,
                            __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE,
                            kernelName)) {
      assert(metadataElement->ValSize ==
                 sizeof(std::uint64_t) + sizeof(std::uint32_t) * 3 &&
             "Unexpected size for reqd_work_group_size metadata");

      // Get pointer to data, skipping 64-bit size at the start of the data.
      const auto *reqdWorkGroupElements =
          reinterpret_cast<const std::uint32_t *>(metadataElement->ValAddr) + 2;
      kernelReqdWorkGroupSizeMD_[kernelName] =
          std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1],
                          reqdWorkGroupElements[2]);
    }
  }
  return PI_SUCCESS;
}

pi_result _pi_program::set_binary(const char *source, size_t length) {
  assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
         "Re-setting program binary data which has already been set");
  binary_ = source;
  binarySizeInBytes_ = length;
  return PI_SUCCESS;
}

pi_result _pi_program::build_program(const char *build_options) {

  this->buildOptions_ = build_options;

  constexpr const unsigned int numberOfOptions = 4u;

  CUjit_option options[numberOfOptions];
  void *optionVals[numberOfOptions];

  // Pass a buffer for info messages
  options[0] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[0] = (void *)infoLog_;
  // Pass the size of the info buffer
  options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[1] = (void *)(long)MAX_LOG_SIZE;
  // Pass a buffer for error message
  options[2] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[2] = (void *)errorLog_;
  // Pass the size of the error buffer
  options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[3] = (void *)(long)MAX_LOG_SIZE;

  auto result = PI_CHECK_ERROR(
      cuModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
                         numberOfOptions, options, optionVals));

  const auto success = (result == PI_SUCCESS);

  buildStatus_ = success ? PI_PROGRAM_BUILD_STATUS_SUCCESS
                         : PI_PROGRAM_BUILD_STATUS_ERROR;

  // If no exception, result is correct
  return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE;
}

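// Usage sketch (illustrative, not part of the original file): the JIT options
// above make the driver write its messages into infoLog_/errorLog_, so after
// a failed build the error text can be surfaced through the PI build-info
// query (assumed to be implemented further down in this file):
//
//   char log[MAX_LOG_SIZE];
//   cuda_piProgramGetBuildInfo(program, device, PI_PROGRAM_BUILD_INFO_LOG,
//                              sizeof(log), log, nullptr);
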
// Note: not implemented; querying kernel names is not exposed by the CUDA
// Driver API.
std::string getKernelNames(pi_program) {
  cl::sycl::detail::pi::die("getKernelNames not implemented");
  return {};
}

// RAII object that calls the reference-count release function on the held PI
// object when it goes out of scope, unless dismiss() was called first.
template <typename T> class ReleaseGuard {
private:
  T Captive;

  static pi_result callRelease(pi_device Captive) {
    return cuda_piDeviceRelease(Captive);
  }

  static pi_result callRelease(pi_context Captive) {
    return cuda_piContextRelease(Captive);
  }

  static pi_result callRelease(pi_mem Captive) {
    return cuda_piMemRelease(Captive);
  }

  static pi_result callRelease(pi_program Captive) {
    return cuda_piProgramRelease(Captive);
  }

  static pi_result callRelease(pi_kernel Captive) {
    return cuda_piKernelRelease(Captive);
  }

  static pi_result callRelease(pi_queue Captive) {
    return cuda_piQueueRelease(Captive);
  }

  static pi_result callRelease(pi_event Captive) {
    return cuda_piEventRelease(Captive);
  }

public:
  ReleaseGuard() = delete;
  explicit ReleaseGuard(T Obj) : Captive(Obj) {}
  ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) {
    Other.Captive = nullptr;
  }

  ReleaseGuard(const ReleaseGuard &) = delete;

  ~ReleaseGuard() {
    if (Captive != nullptr) {
      pi_result ret = callRelease(Captive);
      if (ret != PI_SUCCESS) {
        // A reported CUDA error is either an implementation or an asynchronous
        // CUDA error for which it is unclear if the function that reported it
        // succeeded or not. Either way, the state of the program is compromised
        // and likely unrecoverable.
        cl::sycl::detail::pi::die(
            "Unrecoverable program state reached in cuda_piMemRelease");
      }
    }
  }

  ReleaseGuard &operator=(const ReleaseGuard &) = delete;

  ReleaseGuard &operator=(ReleaseGuard &&Other) {
    Captive = Other.Captive;
    Other.Captive = nullptr;
    return *this;
  }

  // Disable the guard: the captive object will not be released on destruction.
  void dismiss() { Captive = nullptr; }
};

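// Usage sketch (illustrative, not part of the original file): guard a
// retained object across failure paths and dismiss on success:
//
//   ReleaseGuard<pi_context> guard(context); // context retained earlier
//   // ... work that may return early on error ...
//   guard.dismiss(); // success: ownership handed over, do not release
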
//-- PI API implementation
extern "C" {

pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                               size_t param_value_size, void *param_value,
                               size_t *param_value_size_ret);

/// Obtains the CUDA platforms. One PI platform is created per CUDA device,
/// so a system with N CUDA devices reports N PI platforms.
pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
                              pi_uint32 *num_platforms) {

  try {
    static std::once_flag initFlag;
    static pi_uint32 numPlatforms = 1;
    static std::vector<_pi_platform> platformIds;

    if (num_entries == 0 && platforms != nullptr) {
      return PI_ERROR_INVALID_VALUE;
    }
    if (platforms == nullptr && num_platforms == nullptr) {
      return PI_ERROR_INVALID_VALUE;
    }

    pi_result err = PI_SUCCESS;

    std::call_once(
        initFlag,
        [](pi_result &err) {
          if (cuInit(0) != CUDA_SUCCESS) {
            numPlatforms = 0;
            return;
          }
          int numDevices = 0;
          err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
          if (numDevices == 0) {
            numPlatforms = 0;
            return;
          }
          try {
            // make one platform per device
            numPlatforms = numDevices;
            platformIds.resize(numDevices);

            for (int i = 0; i < numDevices; ++i) {
              CUdevice device;
              err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
              platformIds[i].devices_.emplace_back(
                  new _pi_device{device, &platformIds[i]});

              {
                const auto &dev = platformIds[i].devices_.back().get();
                size_t maxWorkGroupSize = 0u;
                size_t maxThreadsPerBlock[3] = {};
                pi_result retError = cuda_piDeviceGetInfo(
                    dev, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
                    sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
                assert(retError == PI_SUCCESS);
                (void)retError;

                retError = cuda_piDeviceGetInfo(
                    dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
                    sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
                assert(retError == PI_SUCCESS);

                dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock),
                                              maxThreadsPerBlock);
                dev->save_max_work_group_size(maxWorkGroupSize);
              }
            }
          } catch (const std::bad_alloc &) {
            // Signal out-of-memory situation
            for (int i = 0; i < numDevices; ++i) {
              platformIds[i].devices_.clear();
            }
            platformIds.clear();
            err = PI_ERROR_OUT_OF_HOST_MEMORY;
          } catch (...) {
            // Clear and rethrow to allow retry
            for (int i = 0; i < numDevices; ++i) {
              platformIds[i].devices_.clear();
            }
            platformIds.clear();
            throw;
          }
        },
        err);

    if (num_platforms != nullptr) {
      *num_platforms = numPlatforms;
    }

    if (platforms != nullptr) {
      for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) {
        platforms[i] = &platformIds[i];
      }
    }

    return err;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_OUT_OF_RESOURCES;
  }
}

pi_result cuda_piPlatformGetInfo(pi_platform platform,
                                 pi_platform_info param_name,
                                 size_t param_value_size, void *param_value,
                                 size_t *param_value_size_ret) {
  assert(platform != nullptr);

  switch (param_name) {
  case PI_PLATFORM_INFO_NAME:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA CUDA BACKEND");
  case PI_PLATFORM_INFO_VENDOR:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA Corporation");
  case PI_PLATFORM_INFO_PROFILE:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "FULL PROFILE");
  case PI_PLATFORM_INFO_VERSION: {
    auto version = getCudaVersionString();
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   version.c_str());
  }
  case PI_PLATFORM_INFO_EXTENSIONS: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }
  cl::sycl::detail::pi::die("Platform info request not implemented");
  return {};
}

/// Enumerates the devices of a platform. Only GPU devices (or the default
/// device type) are reported, since CUDA only exposes GPUs.
pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type,
                            pi_uint32 num_entries, pi_device *devices,
                            pi_uint32 *num_devices) {

  pi_result err = PI_SUCCESS;
  const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT;
  const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU;
  const bool returnDevices = askingForDefault || askingForGPU;

  size_t numDevices = returnDevices ? platform->devices_.size() : 0;

  try {
    if (num_devices) {
      *num_devices = numDevices;
    }

    if (returnDevices && devices) {
      for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) {
        devices[i] = platform->devices_[i].get();
      }
    }

    return err;
  } catch (pi_result err) {
    return err;
  } catch (...) {
    return PI_ERROR_OUT_OF_RESOURCES;
  }
}

/// CUDA devices are not reference counted, so retaining a device is a no-op.
pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; }

pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name,
                                size_t param_value_size, void *param_value,
                                size_t *param_value_size_ret) {

  switch (param_name) {
  case PI_CONTEXT_INFO_NUM_DEVICES:
    return getInfo(param_value_size, param_value, param_value_size_ret, 1);
  case PI_CONTEXT_INFO_DEVICES:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   context->get_device());
  case PI_CONTEXT_INFO_REFERENCE_COUNT:
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   context->get_reference_count());
  case PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
    pi_memory_order_capabilities capabilities =
        PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
        PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
    int major = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             context->get_device()->get()) == CUDA_SUCCESS);
    pi_memory_order_capabilities capabilities =
        (major >= 7) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
                           PI_MEMORY_SCOPE_WORK_GROUP |
                           PI_MEMORY_SCOPE_DEVICE | PI_MEMORY_SCOPE_SYSTEM
                     : PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
                           PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }

  return PI_ERROR_OUT_OF_RESOURCES;
}

pi_result cuda_piContextRetain(pi_context context) {
  assert(context != nullptr);
  assert(context->get_reference_count() > 0);

  context->increment_reference_count();
  return PI_SUCCESS;
}

pi_result cuda_piextContextSetExtendedDeleter(
    pi_context context, pi_context_extended_deleter function, void *user_data) {
  context->set_extended_deleter(function, user_data);
  return PI_SUCCESS;
}

/// Not applicable to CUDA: devices cannot be partitioned, so this is a no-op.
pi_result cuda_piDevicePartition(pi_device,
                                 const pi_device_partition_property *,
                                 pi_uint32, pi_device *, pi_uint32 *) {
  return {};
}

pi_result cuda_piextDeviceSelectBinary(pi_device device,
                                       pi_device_binary *binaries,
                                       pi_uint32 num_binaries,
                                       pi_uint32 *selected_binary) {
  // Ignore unused parameter
  (void)device;

  if (!binaries) {
    cl::sycl::detail::pi::die("No list of device images provided");
  }
  if (num_binaries < 1) {
    cl::sycl::detail::pi::die("No binary images in the list");
  }

  // Look for an image for the NVPTX64 target, and return the first one that is
  // found
  for (pi_uint32 i = 0; i < num_binaries; i++) {
    if (strcmp(binaries[i]->DeviceTargetSpec,
               __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) {
      *selected_binary = i;
      return PI_SUCCESS;
    }
  }

  // No image can be loaded for the given device
  return PI_ERROR_INVALID_BINARY;
}

pi_result cuda_piextGetDeviceFunctionPointer(pi_device device,
                                             pi_program program,
                                             const char *func_name,
                                             pi_uint64 *func_pointer_ret) {
  // Check that the device passed is the same as the device bound to the
  // context
  assert(device == program->get_context()->get_device());
  assert(func_pointer_ret != nullptr);

  CUfunction func;
  CUresult ret = cuModuleGetFunction(&func, program->get(), func_name);
  *func_pointer_ret = reinterpret_cast<pi_uint64>(func);
  pi_result retError = PI_SUCCESS;

  if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND)
    retError = PI_CHECK_ERROR(ret);
  if (ret == CUDA_ERROR_NOT_FOUND) {
    *func_pointer_ret = 0;
    retError = PI_ERROR_INVALID_KERNEL_NAME;
  }

  return retError;
}


pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                               size_t param_value_size, void *param_value,
                               size_t *param_value_size_ret) {

  static constexpr pi_uint32 max_work_item_dimensions = 3u;

  assert(device != nullptr);

  switch (param_name) {
  case PI_DEVICE_INFO_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_TYPE_GPU);
  }
  case PI_DEVICE_INFO_VENDOR_ID: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 4318u);
  }
  case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: {
    int compute_units = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&compute_units,
                             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(compute_units >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint32(compute_units));
  }
  case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   max_work_item_dimensions);
  }
  case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: {
    size_t return_sizes[max_work_item_dimensions];

    int max_x = 0, max_y = 0, max_z = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(max_x >= 0);

    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(max_y >= 0);

    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(max_z >= 0);

    return_sizes[0] = size_t(max_x);
    return_sizes[1] = size_t(max_y);
    return_sizes[2] = size_t(max_z);
    return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
                        param_value_size_ret, return_sizes);
  }

  case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: {
    size_t return_sizes[max_work_item_dimensions];
    int max_x = 0, max_y = 0, max_z = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(max_x >= 0);

    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(max_y >= 0);

    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(max_z >= 0);

    return_sizes[0] = size_t(max_x);
    return_sizes[1] = size_t(max_y);
    return_sizes[2] = size_t(max_z);
    return getInfoArray(max_work_item_dimensions, param_value_size, param_value,
                        param_value_size_ret, return_sizes);
  }

  case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: {
    int max_work_group_size = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_work_group_size,
                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);

    cl::sycl::detail::pi::assertion(max_work_group_size >= 0);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t(max_work_group_size));
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
  }
  case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
    // Number of sub-groups = max block size / warp size + possible remainder
    int max_threads = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&max_threads,
                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);
    int warpSize = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                             device->get()) == CUDA_SUCCESS);
    int maxWarps = (max_threads + warpSize - 1) / warpSize;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<uint32_t>(maxWarps));
  }
  case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
    // Volta provides independent thread scheduling
    // TODO: Revisit for previous generation GPUs
    int major = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    bool ifp = (major >= 7);
    return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
  }

  case PI_DEVICE_INFO_ATOMIC_64: {
    int major = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);

    bool atomic64 = (major >= 6) ? true : false;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   atomic64);
  }
  case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
    pi_memory_order_capabilities capabilities =
        PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
        PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
    int major = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    pi_memory_order_capabilities capabilities =
        (major >= 7) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
                           PI_MEMORY_SCOPE_WORK_GROUP |
                           PI_MEMORY_SCOPE_DEVICE | PI_MEMORY_SCOPE_SYSTEM
                     : PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
                           PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capabilities);
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
    int major = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);

    bool bfloat16 = (major >= 8) ? true : false;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   bfloat16);
  }
  case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
    // NVIDIA devices only support one sub-group size (the warp size)
    int warpSize = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                             device->get()) == CUDA_SUCCESS);
    size_t sizes[1] = {static_cast<size_t>(warpSize)};
    return getInfoArray<size_t>(1, param_value_size, param_value,
                                param_value_size_ret, sizes);
  }
  case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
    int clock_freq = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(clock_freq >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint32(clock_freq) / 1000u);
  }
  case PI_DEVICE_INFO_ADDRESS_BITS: {
    auto bits = pi_uint32{std::numeric_limits<uintptr_t>::digits};
    return getInfo(param_value_size, param_value, param_value_size_ret, bits);
  }
  case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: {
    // Max size of memory object allocation in bytes.
    // The minimum value is max(min(1024 × 1024 × 1024, 1/4th of
    // CL_DEVICE_GLOBAL_MEM_SIZE), 32 × 1024 × 1024) for devices that are not
    // of type CL_DEVICE_TYPE_CUSTOM.

    size_t global = 0;
    cl::sycl::detail::pi::assertion(cuDeviceTotalMem(&global, device->get()) ==
                                    CUDA_SUCCESS);

    auto quarter_global = static_cast<pi_uint32>(global / 4u);

    auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global),
                              32u * 1024u * 1024u);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64{max_alloc});
  }
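
  // Worked example (illustrative, not part of the original file): on a device
  // with 12 GiB of global memory, quarter_global is 3 GiB, so max_alloc =
  // max(min(1 GiB, 3 GiB), 32 MiB) = 1 GiB, the mandated minimum.
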
  case PI_DEVICE_INFO_IMAGE_SUPPORT: {
    pi_bool enabled = PI_FALSE;

    if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) {
      enabled = PI_TRUE;
    } else {
      cl::sycl::detail::pi::cuPrint(
          "Images are not fully supported by the CUDA BE, their support is "
          "disabled by default. Their partial support can be activated by "
          "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at "
          "runtime.");
    }

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   enabled);
  }
  case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
    // This call doesn't match to CUDA as it doesn't have images, but instead
    // surfaces and textures. No clear call in the CUDA API to determine this,
    // but some searching found as of SM 2.x 128 are supported.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
    // This call doesn't match to CUDA as it doesn't have images, but instead
    // surfaces and textures. No clear call in the CUDA API to determine this,
    // but some searching found as of SM 2.x 128 are supported.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: {
    // Take the smaller of maximum surface and maximum texture height.
    int tex_height = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(tex_height >= 0);
    int surf_height = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(surf_height >= 0);

    int min = std::min(tex_height, surf_height);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: {
    // Take the smaller of maximum surface and maximum texture height.
    int tex_height = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(tex_height >= 0);
    int surf_height = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_height,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(surf_height >= 0);

    int min = std::min(tex_height, surf_height);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: {
    // Take the smaller of maximum surface and maximum texture depth.
    int tex_depth = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_depth,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(tex_depth >= 0);
    int surf_depth = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_depth,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(surf_depth >= 0);

    int min = std::min(tex_depth, surf_depth);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: {
    // Take the smaller of maximum surface and maximum texture width.
    int tex_width = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&tex_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(tex_width >= 0);
    int surf_width = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&surf_width,
                             CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(surf_width >= 0);

    int min = std::min(tex_width, surf_width);

    return getInfo(param_value_size, param_value, param_value_size_ret, min);
  }
  case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t(0));
  }
  case PI_DEVICE_INFO_MAX_SAMPLERS: {
    // This call is kind of meaningless for cuda, as samplers don't exist.
    // Closest thing is textures, which is 128.
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: {
    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
    // __global__ function parameters are passed to the device via constant
    // memory and are limited to 4 KB.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{4000u});
  }
  case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: {
    int mem_base_addr_align = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&mem_base_addr_align,
                             CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
                             device->get()) == CUDA_SUCCESS);
    // Multiply by 8 as clGetDeviceInfo returns this value in bits
    mem_base_addr_align *= 8;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   mem_base_addr_align);
  }
  case PI_DEVICE_INFO_HALF_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_SINGLE_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST |
                  PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA |
                  PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
    return getInfo(param_value_size, param_value, param_value_size_ret, config);
  }
  case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST |
                  PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA;
    return getInfo(param_value_size, param_value, param_value_size_ret, config);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
    // TODO: is this config consistent across all NVIDIA GPUs?
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: {
    // The value is documented for all existing GPUs in the CUDA programming
    // guidelines, section "H.3.2. Global Memory".
    return getInfo(param_value_size, param_value, param_value_size_ret, 128u);
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: {
    int cache_size = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(cache_size >= 0);
    // The L2 cache is global to the GPU.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(cache_size));
  }
  case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: {
    size_t bytes = 0;
    // Runtime API has easy access to this value, driver API info is scarce.
    cl::sycl::detail::pi::assertion(cuDeviceTotalMem(&bytes, device->get()) ==
                                    CUDA_SUCCESS);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64{bytes});
  }
  case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: {
    int constant_memory = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&constant_memory,
                             CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(constant_memory >= 0);

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(constant_memory));
  }
  case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: {
    // TODO: is there a way to retrieve this from CUDA driver API?
    // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX
    // 1060 3GB
    return getInfo(param_value_size, param_value, param_value_size_ret, 9u);
  }
  case PI_DEVICE_INFO_LOCAL_MEM_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_DEVICE_LOCAL_MEM_TYPE_LOCAL);
  }
  case PI_DEVICE_INFO_LOCAL_MEM_SIZE: {
    // OpenCL's "local memory" maps most closely to CUDA's "shared memory".
    // CUDA has its own definition of "local memory", which maps to OpenCL's
    // "private memory".
    int local_mem_size = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&local_mem_size,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(local_mem_size >= 0);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   pi_uint64(local_mem_size));
  }
  case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: {
    int ecc_enabled = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
                             device->get()) == CUDA_SUCCESS);

    cl::sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1));
    auto result = static_cast<pi_bool>(ecc_enabled);
    return getInfo(param_value_size, param_value, param_value_size_ret, result);
  }
  case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: {
    int is_integrated = 0;
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED,
                             device->get()) == CUDA_SUCCESS);

    cl::sycl::detail::pi::assertion((is_integrated == 0) |
                                    (is_integrated == 1));
    auto result = static_cast<pi_bool>(is_integrated);
    return getInfo(param_value_size, param_value, param_value_size_ret, result);
  }
  case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: {
    // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX
    // 1060 3GB
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{1000u});
  }
  case PI_DEVICE_INFO_ENDIAN_LITTLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_LINKER_AVAILABLE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: {
    auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
    // The mandated minimum capability:
    auto capability =
        PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: {
    // The mandated minimum capability:
    auto capability = PI_QUEUE_PROFILING_ENABLE;
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   capability);
  }
  case PI_DEVICE_INFO_BUILT_IN_KERNELS: {
    // An empty string is returned if no built-in kernels are supported by the
    // device.
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  case PI_DEVICE_INFO_PLATFORM: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   device->get_platform());
  }
  case PI_DEVICE_INFO_NAME: {
    static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u;
    char name[MAX_DEVICE_NAME_LENGTH];
    cl::sycl::detail::pi::assertion(
        cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, device->get()) ==
        CUDA_SUCCESS);
    return getInfoArray(strlen(name) + 1, param_value_size, param_value,
                        param_value_size_ret, name);
  }
  case PI_DEVICE_INFO_VENDOR: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "NVIDIA Corporation");
  }
  case PI_DEVICE_INFO_DRIVER_VERSION: {
    auto version = getCudaVersionString();
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   version.c_str());
  }
  case PI_DEVICE_INFO_PROFILE: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "CUDA");
  }
  case PI_DEVICE_INFO_REFERENCE_COUNT: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   device->get_reference_count());
  }
  case PI_DEVICE_INFO_VERSION: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   "PI 0.0");
  }
  case PI_DEVICE_INFO_OPENCL_C_VERSION: {
    return getInfo(param_value_size, param_value, param_value_size_ret, "");
  }
  case PI_DEVICE_INFO_EXTENSIONS: {

    std::string SupportedExtensions = "cl_khr_fp64 ";
    SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT;
    SupportedExtensions += " ";

    int major = 0;
    int minor = 0;

    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             device->get()) == CUDA_SUCCESS);
    cl::sycl::detail::pi::assertion(
        cuDeviceGetAttribute(&minor,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                             device->get()) == CUDA_SUCCESS);

    if ((major >= 6) || ((major == 5) && (minor >= 3))) {
      SupportedExtensions += "cl_khr_fp16 ";
    }

    return getInfo(param_value_size, param_value, param_value_size_ret,
                   SupportedExtensions.c_str());
  }
  case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
    // The minimum value for the FULL profile is 1 MB.
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   size_t{1024u});
  }
  case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   PI_TRUE);
  }
  case PI_DEVICE_INFO_PARENT_DEVICE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   nullptr);
  }
  case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<pi_device_partition_property>(0u));
  }
  case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: {
    return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
  }
  case PI_DEVICE_INFO_PARTITION_TYPE: {
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   static_cast<pi_device_partition_property>(0u));
  }

  // Intel USM extensions

  case PI_DEVICE_INFO_USM_HOST_SUPPORT: {
    // from cl_intel_unified_shared_memory: "The host memory access capabilities
    // apply to any host allocation."
    //
    // query if/how the device can access page-locked host memory, possibly
    // through PCIe, using the same pointer as the host
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
      // the device shares a unified address space with the host
      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
          6) {
        // compute capability 6.x introduces operations that are atomic with
        // respect to other CPUs and GPUs in the system
        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      } else {
        // on GPU architectures with compute capability lower than 6.x, atomic
        // operations from the GPU to CPU memory will not be atomic with respect
        // to CPU initiated atomic operations
        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The device memory access capabilities apply to any device allocation
    // associated with this device."
    //
    // query how the device can access memory allocated on the device itself (?)
    pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                        PI_USM_CONCURRENT_ACCESS |
                        PI_USM_CONCURRENT_ATOMIC_ACCESS;
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The single device shared memory access capabilities apply to any shared
    // allocation associated with this device."
    //
    // query if/how the device can access managed memory associated to it
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
      // the device can allocate managed memory on this system
      value |= PI_USM_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
      // the device can coherently access managed memory concurrently with the
      // CPU
      value |= PI_USM_CONCURRENT_ACCESS;
      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
          6) {
        // compute capability 6.x introduces operations that are atomic with
        // respect to other CPUs and GPUs in the system
        value |= PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The cross-device shared memory access capabilities apply to any shared
    // allocation associated with this device, or to any shared memory
    // allocation on another device that also supports the same cross-device
    // shared memory access capability."
    //
    // query if/how the device can access managed memory associated to other
    // devices
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
      // the device can allocate managed memory on this system
      value |= PI_USM_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
      // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
      // attribute can coherently access managed memory concurrently with the
      // CPU
      value |= PI_USM_CONCURRENT_ACCESS;
    }
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
        6) {
      // compute capability 6.x introduces operations that are atomic with
      // respect to other CPUs and GPUs in the system
      if (value & PI_USM_ACCESS)
        value |= PI_USM_ATOMIC_ACCESS;
      if (value & PI_USM_CONCURRENT_ACCESS)
        value |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
    // from cl_intel_unified_shared_memory:
    // "The shared system memory access capabilities apply to any allocations
    // made by a system allocator, such as malloc or new."
    //
    // query if/how the device can access pageable host memory allocated by the
    // system allocator
    pi_bitfield value = {};
    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
      // the device supports coherently accessing pageable memory without
      // calling cuMemHostRegister/cudaHostRegister on it
      if (getAttribute(device,
                       CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
        // the link between the device and the host supports native atomic
        // operations
        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
      } else {
        // the link between the device and the host does not support native
        // atomic operations
        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
      }
    }
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER: {
    int value =
        getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
    return getInfo(param_value_size, param_value, param_value_size_ret, value);
  }
  case PI_DEVICE_INFO_BACKEND_VERSION: {
    int major =
        getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    int minor =
        getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    std::string result = std::to_string(major) + "." + std::to_string(minor);
    return getInfo(param_value_size, param_value, param_value_size_ret,
                   result.c_str());
  }

  // TODO: Investigate if this information is available on CUDA.
  case PI_DEVICE_INFO_PCI_ADDRESS:
  case PI_DEVICE_INFO_GPU_EU_COUNT:
  case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
  case PI_DEVICE_INFO_GPU_SLICES:
  case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
  case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
  case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
  case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
  // TODO: Check if Intel device UUID extension is utilized for CUDA.
  // For details about this extension, see
  // sycl/doc/extensions/supported/sycl_ext_intel_device_info.md
  case PI_DEVICE_INFO_UUID:
    return PI_ERROR_INVALID_VALUE;

  default:
    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
  }
  cl::sycl::detail::pi::die("Device info request not implemented");
  return {};
}

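// Usage sketch (illustrative, not part of the original file): fixed-size
// queries against the entry point above can skip the size call:
//
//   size_t wg_size = 0;
//   cuda_piDeviceGetInfo(dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
//                        sizeof(wg_size), &wg_size, nullptr);
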
/// Gets the native CUDA handle of a PI device object.
pi_result cuda_piextDeviceGetNativeHandle(pi_device device,
                                          pi_native_handle *nativeHandle) {
  *nativeHandle = static_cast<pi_native_handle>(device->get());
  return PI_SUCCESS;
}

1930  pi_device *piDevice) {
1931  assert(piDevice != nullptr);
1932 
1933  CUdevice cu_device = static_cast<CUdevice>(nativeHandle);
1934 
1935  auto is_device = [=](std::unique_ptr<_pi_device> &dev) {
1936  return dev->get() == cu_device;
1937  };
1938 
1939  // If a platform is provided just check if the device is in it
1940  if (platform) {
1941  auto search_res = std::find_if(begin(platform->devices_),
1942  end(platform->devices_), is_device);
1943  if (search_res != end(platform->devices_)) {
1944  *piDevice = (*search_res).get();
1945  return PI_SUCCESS;
1946  }
1947  }
1948 
1949  // Get list of platforms
1950  pi_uint32 num_platforms;
1951  pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms);
1952  if (result != PI_SUCCESS)
1953  return result;
1954 
1955  // Use a vector so the platform list is freed on every return path
1956  std::vector<pi_platform> plat(num_platforms);
1957  result = cuda_piPlatformsGet(num_platforms, plat.data(), nullptr);
1958  if (result != PI_SUCCESS)
1959  return result;
1960 
1961  // Iterate through platforms to find device that matches nativeHandle
1962  for (pi_uint32 j = 0; j < num_platforms; ++j) {
1963  auto search_res = std::find_if(begin(plat[j]->devices_),
1964  end(plat[j]->devices_), is_device);
1965  if (search_res != end(plat[j]->devices_)) {
1966  *piDevice = (*search_res).get();
1967  return PI_SUCCESS;
1968  }
1969  }
1970 
1971  // If the provided nativeHandle cannot be matched to an
1972  // existing device return error
1973  return PI_ERROR_INVALID_OPERATION;
1974 }
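// Usage sketch (editorial): a get/create round trip. The native handle of a
// PI device is its CUdevice ordinal, so creating a device from the handle
// resolves to the pi_device cached by the owning platform.
//
//   pi_native_handle raw = 0;
//   cuda_piextDeviceGetNativeHandle(device, &raw);
//   pi_device same = nullptr;
//   cuda_piextDeviceCreateWithNativeHandle(raw, /*platform=*/nullptr, &same);
//   assert(same == device);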
1975 
1976 /* Context APIs */
1977 
1996 pi_result cuda_piContextCreate(const pi_context_properties *properties,
1997  pi_uint32 num_devices, const pi_device *devices,
1998  void (*pfn_notify)(const char *errinfo,
1999  const void *private_info,
2000  size_t cb, void *user_data),
2001  void *user_data, pi_context *retcontext) {
2002 
2003  assert(devices != nullptr);
2004  // TODO: How to implement context callback?
2005  assert(pfn_notify == nullptr);
2006  assert(user_data == nullptr);
2007  assert(num_devices == 1);
2008  // Need input context
2009  assert(retcontext != nullptr);
2010  pi_result errcode_ret = PI_SUCCESS;
2011 
2012  // Parse properties.
2013  bool property_cuda_primary = false;
2014  while (properties && (0 != *properties)) {
2015  // Consume property ID.
2016  pi_context_properties id = *properties;
2017  ++properties;
2018  // Consume property value.
2019  pi_context_properties value = *properties;
2020  ++properties;
2021  switch (id) {
2022  case __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
2023  assert(value == PI_FALSE || value == PI_TRUE);
2024  property_cuda_primary = static_cast<bool>(value);
2025  break;
2026  default:
2027  // Unknown property.
2028  cl::sycl::detail::pi::die(
2029  "Unknown piContextCreate property in property list");
2030  return PI_ERROR_INVALID_VALUE;
2031  }
2032  }
2033 
2034  std::unique_ptr<_pi_context> piContextPtr{nullptr};
2035  try {
2036  CUcontext current = nullptr;
2037 
2038  if (property_cuda_primary) {
2039  // Use the CUDA primary context and assume that we want to use it
2040  // immediately, as we want to forgo context switches.
2041  CUcontext Ctxt;
2042  errcode_ret =
2043  PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->get()));
2044  piContextPtr = std::unique_ptr<_pi_context>(
2045  new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
2046  errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
2047  } else {
2048  // Create a scoped context.
2049  CUcontext newContext;
2050  PI_CHECK_ERROR(cuCtxGetCurrent(&current));
2051  errcode_ret = PI_CHECK_ERROR(
2052  cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->get()));
2053  piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
2054  _pi_context::kind::user_defined, newContext, *devices});
2055  }
2056 
2057  static std::once_flag initFlag;
2058  std::call_once(
2059  initFlag,
2060  [](pi_result &err) {
2061  // Use default stream to record base event counter
2062  PI_CHECK_ERROR(
2063  cuEventCreate(&_pi_platform::evBase_, CU_EVENT_DEFAULT));
2064  PI_CHECK_ERROR(cuEventRecord(_pi_platform::evBase_, 0));
2065  },
2066  errcode_ret);
2067 
2068  // For non-primary scoped contexts keep the last active on top of the stack
2069  // as `cuCtxCreate` replaces it implicitly otherwise.
2070  // Primary contexts are kept on top of the stack, so the previous context
2071  // is not queried and therefore not recovered.
2072  if (current != nullptr) {
2073  PI_CHECK_ERROR(cuCtxSetCurrent(current));
2074  }
2075 
2076  *retcontext = piContextPtr.release();
2077  } catch (pi_result err) {
2078  errcode_ret = err;
2079  } catch (...) {
2080  errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
2081  }
2082  return errcode_ret;
2083 }
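// Usage sketch (editorial): requesting the CUDA primary context through the
// property list parsed above; the property key is the one handled in the
// switch (assumed __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY).
//
//   pi_context_properties props[] = {
//       __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, PI_TRUE, 0};
//   pi_context ctx = nullptr;
//   cuda_piContextCreate(props, 1, &device, nullptr, nullptr, &ctx);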
2084 
2085 pi_result cuda_piContextRelease(pi_context ctxt) {
2086 
2087  assert(ctxt != nullptr);
2088 
2089  if (ctxt->decrement_reference_count() > 0) {
2090  return PI_SUCCESS;
2091  }
2092  ctxt->invoke_extended_deleters();
2093 
2094  std::unique_ptr<_pi_context> context{ctxt};
2095 
2096  if (!ctxt->backend_has_ownership())
2097  return PI_SUCCESS;
2098 
2099  if (!ctxt->is_primary()) {
2100  CUcontext cuCtxt = ctxt->get();
2101  CUcontext current = nullptr;
2102  cuCtxGetCurrent(&current);
2103  if (cuCtxt != current) {
2104  PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt));
2105  }
2106  PI_CHECK_ERROR(cuCtxSynchronize());
2107  cuCtxGetCurrent(&current);
2108  if (cuCtxt == current) {
2109  PI_CHECK_ERROR(cuCtxPopCurrent(&current));
2110  }
2111  return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt));
2112  }
2113 
2114  // Primary context is not destroyed, but released
2115  CUdevice cuDev = ctxt->get_device()->get();
2116  CUcontext current;
2117  cuCtxPopCurrent(&current);
2118  return PI_CHECK_ERROR(cuDevicePrimaryCtxRelease(cuDev));
2119 }
2120 
2127 pi_result cuda_piextContextGetNativeHandle(pi_context context,
2128  pi_native_handle *nativeHandle) {
2129  *nativeHandle = reinterpret_cast<pi_native_handle>(context->get());
2130  return PI_SUCCESS;
2131 }
2132 
2140 pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
2141  pi_uint32 num_devices,
2142  const pi_device *devices,
2143  bool ownNativeHandle,
2144  pi_context *piContext) {
2145  (void)num_devices;
2146  (void)devices;
2147  (void)ownNativeHandle;
2148  assert(piContext != nullptr);
2149  assert(ownNativeHandle == false);
2150 
2151  CUcontext newContext = reinterpret_cast<CUcontext>(nativeHandle);
2152 
2153  ScopedContext active(newContext);
2154 
2155  // Get context's native device
2156  CUdevice cu_device;
2157  pi_result retErr = PI_CHECK_ERROR(cuCtxGetDevice(&cu_device));
2158 
2159  // Create a SYCL device from the ctx device
2160  pi_device device = nullptr;
2161  retErr = cuda_piextDeviceCreateWithNativeHandle(cu_device, nullptr, &device);
2162 
2163  // Create sycl context
2164  *piContext = new _pi_context{_pi_context::kind::user_defined, newContext,
2165  device, /*backend_owns*/ false};
2166 
2167  return retErr;
2168 }
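// Usage sketch (editorial): wrapping an application-owned CUcontext. The
// plugin never takes ownership here, so ownNativeHandle must be false and
// the CUcontext must outlive the resulting pi_context.
//
//   CUcontext native = nullptr;
//   cuCtxGetCurrent(&native);
//   pi_context ctx = nullptr;
//   cuda_piextContextCreateWithNativeHandle(
//       reinterpret_cast<pi_native_handle>(native), 0, nullptr,
//       /*ownNativeHandle=*/false, &ctx);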
2169 
2174 pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags,
2175  size_t size, void *host_ptr, pi_mem *ret_mem,
2176  const pi_mem_properties *properties) {
2177  // Need input memory object
2178  assert(ret_mem != nullptr);
2179  assert((properties == nullptr || *properties == 0) &&
2180  "no mem properties goes to cuda RT yet");
2181  // Currently, USE_HOST_PTR is not implemented using host register
2182  // since this triggers a weird segfault after the program ends.
2183  // Setting this constant to true enables testing that behavior.
2184  const bool enableUseHostPtr = false;
2185  const bool performInitialCopy =
2186  (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
2187  ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr);
2188  pi_result retErr = PI_SUCCESS;
2189  pi_mem retMemObj = nullptr;
2190 
2191  try {
2192  ScopedContext active(context);
2193  CUdeviceptr ptr;
2194  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
2195  _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
2196 
2197  if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
2198  retErr = PI_CHECK_ERROR(
2199  cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
2200  retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
2201  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr;
2202  } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
2203  retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size));
2204  retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
2205  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
2206  } else {
2207  retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size));
2208  if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
2209  allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in;
2210  }
2211  }
2212 
2213  if (retErr == PI_SUCCESS) {
2214  pi_mem parentBuffer = nullptr;
2215 
2216  auto piMemObj = std::unique_ptr<_pi_mem>(
2217  new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size});
2218  if (piMemObj != nullptr) {
2219  retMemObj = piMemObj.release();
2220  if (performInitialCopy) {
2221  // Operates on the default stream of the current CUDA context.
2222  retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr, host_ptr, size));
2223  // Synchronize with default stream implicitly used by cuMemcpyHtoD
2224  // to make buffer data available on device before any other PI call
2225  // uses it.
2226  if (retErr == PI_SUCCESS) {
2227  CUstream defaultStream = 0;
2228  retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream));
2229  }
2230  }
2231  } else {
2232  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2233  }
2234  }
2235  } catch (pi_result err) {
2236  retErr = err;
2237  } catch (...) {
2238  retErr = PI_ERROR_OUT_OF_RESOURCES;
2239  }
2240 
2241  *ret_mem = retMemObj;
2242 
2243  return retErr;
2244 }
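// Usage sketch (editorial): a device buffer initialized from host memory;
// PI_MEM_FLAGS_HOST_PTR_COPY takes the cuMemAlloc + cuMemcpyHtoD path above.
//
//   std::vector<float> host(1024, 1.0f);
//   pi_mem buf = nullptr;
//   cuda_piMemBufferCreate(ctx, PI_MEM_FLAGS_HOST_PTR_COPY,
//                          host.size() * sizeof(float), host.data(), &buf,
//                          nullptr);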
2245 
2250 pi_result cuda_piMemRelease(pi_mem memObj) {
2251  assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS");
2252 
2253  pi_result ret = PI_SUCCESS;
2254 
2255  try {
2256 
2257  // Do nothing if there are other references
2258  if (memObj->decrement_reference_count() > 0) {
2259  return PI_SUCCESS;
2260  }
2261 
2262  // make sure memObj is released in case PI_CHECK_ERROR throws
2263  std::unique_ptr<_pi_mem> uniqueMemObj(memObj);
2264 
2265  if (memObj->is_sub_buffer()) {
2266  return PI_SUCCESS;
2267  }
2268 
2269  ScopedContext active(uniqueMemObj->get_context());
2270 
2271  if (memObj->mem_type_ == _pi_mem::mem_type::buffer) {
2272  switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
2273  case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in:
2274  case _pi_mem::mem_::buffer_mem_::alloc_mode::classic:
2275  ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_));
2276  break;
2277  case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr:
2278  ret = PI_CHECK_ERROR(
2279  cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2280  break;
2281  case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr:
2282  ret = PI_CHECK_ERROR(
2283  cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
2284  };
2285  } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) {
2286  ret = PI_CHECK_ERROR(
2287  cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface()));
2288  ret = PI_CHECK_ERROR(
2289  cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array()));
2290  }
2291 
2292  } catch (pi_result err) {
2293  ret = err;
2294  } catch (...) {
2295  ret = PI_ERROR_OUT_OF_RESOURCES;
2296  }
2297 
2298  if (ret != PI_SUCCESS) {
2299  // A reported CUDA error is either an implementation error or an
2300  // asynchronous CUDA error for which it is unclear if the function that
2301  // reported it succeeded or not. Either way, the state of the program is
2302  // compromised and likely unrecoverable.
2303  cl::sycl::detail::pi::die(
2304  "Unrecoverable program state reached in cuda_piMemRelease");
2305  }
2306 
2307  return PI_SUCCESS;
2308 }
2309 
2314 pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,
2315  pi_buffer_create_type buffer_create_type,
2316  void *buffer_create_info, pi_mem *memObj) {
2317  assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT");
2318  assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS");
2319  assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT");
2320 
2321  // Default value for flags means PI_MEM_FLAGS_ACCESS_RW.
2322  if (flags == 0) {
2323  flags = PI_MEM_FLAGS_ACCESS_RW;
2324  }
2325 
2326  assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE");
2327  assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) &&
2328  "PI_ERROR_INVALID_VALUE");
2329  assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE");
2330  assert(memObj != nullptr);
2331 
2332  const auto bufferRegion =
2333  *reinterpret_cast<pi_buffer_region>(buffer_create_info);
2334  assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE");
2335 
2336  assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
2337  "Overflow");
2338  assert(((bufferRegion.origin + bufferRegion.size) <=
2339  parent_buffer->mem_.buffer_mem_.get_size()) &&
2340  "PI_ERROR_INVALID_BUFFER_SIZE");
2341  // Retained indirectly due to retaining parent buffer below.
2342  pi_context context = parent_buffer->context_;
2343  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
2344  _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
2345 
2346  assert(parent_buffer->mem_.buffer_mem_.ptr_ !=
2347  _pi_mem::mem_::buffer_mem_::native_type{0});
2348  _pi_mem::mem_::buffer_mem_::native_type ptr =
2349  parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin;
2350 
2351  void *hostPtr = nullptr;
2352  if (parent_buffer->mem_.buffer_mem_.hostPtr_) {
2353  hostPtr = static_cast<char *>(parent_buffer->mem_.buffer_mem_.hostPtr_) +
2354  bufferRegion.origin;
2355  }
2356 
2357  ReleaseGuard<pi_mem> releaseGuard(parent_buffer);
2358 
2359  std::unique_ptr<_pi_mem> retMemObj{nullptr};
2360  try {
2361  ScopedContext active(context);
2362 
2363  retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{
2364  context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
2365  } catch (pi_result err) {
2366  *memObj = nullptr;
2367  return err;
2368  } catch (...) {
2369  *memObj = nullptr;
2370  return PI_ERROR_OUT_OF_HOST_MEMORY;
2371  }
2372 
2373  releaseGuard.dismiss();
2374  *memObj = retMemObj.release();
2375  return PI_SUCCESS;
2376 }
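// Usage sketch (editorial): carving a sub-buffer out of an existing buffer.
// The origin/size fields follow the bufferRegion usage above; the struct tag
// _pi_buffer_region is assumed.
//
//   _pi_buffer_region region{/*origin=*/256, /*size=*/512};
//   pi_mem sub = nullptr;
//   cuda_piMemBufferPartition(buf, PI_MEM_FLAGS_ACCESS_RW,
//                             PI_BUFFER_CREATE_TYPE_REGION, &region, &sub);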
2377 
2378 pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *) {
2379  cl::sycl::detail::pi::die("cuda_piMemGetInfo not implemented");
2380 }
2381 
2388 pi_result cuda_piextMemGetNativeHandle(pi_mem mem,
2389  pi_native_handle *nativeHandle) {
2390  *nativeHandle = static_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
2391  return PI_SUCCESS;
2392 }
2393 
2406 pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle,
2407  bool ownNativeHandle,
2408  pi_mem *mem) {
2409  cl::sycl::detail::pi::die(
2410  "Creation of PI mem from native handle not implemented");
2411  return {};
2412 }
2413 
2420 pi_result cuda_piQueueCreate(pi_context context, pi_device device,
2421  pi_queue_properties properties, pi_queue *queue) {
2422  try {
2423  std::unique_ptr<_pi_queue> queueImpl{nullptr};
2424 
2425  if (context->get_device() != device) {
2426  *queue = nullptr;
2427  return PI_ERROR_INVALID_DEVICE;
2428  }
2429 
2430  unsigned int flags = 0;
2431  if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) {
2432  flags = CU_STREAM_DEFAULT;
2433  } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) {
2434  flags = 0;
2435  } else {
2436  flags = CU_STREAM_NON_BLOCKING;
2437  }
2438 
2439  const bool is_out_of_order =
2440  properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
2441 
2442  std::vector<CUstream> computeCuStreams(
2443  is_out_of_order ? _pi_queue::default_num_compute_streams : 1);
2444  std::vector<CUstream> transferCuStreams(
2445  is_out_of_order ? _pi_queue::default_num_transfer_streams : 0);
2446 
2447  queueImpl = std::unique_ptr<_pi_queue>(
2448  new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams),
2449  context, device, properties, flags});
2450 
2451  *queue = queueImpl.release();
2452 
2453  return PI_SUCCESS;
2454  } catch (pi_result err) {
2455 
2456  return err;
2457 
2458  } catch (...) {
2459 
2460  return PI_ERROR_OUT_OF_RESOURCES;
2461  }
2462 }
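// Usage sketch (editorial): any property other than the two special CUDA
// stream hints yields CU_STREAM_NON_BLOCKING pools of compute and transfer
// streams.
//
//   pi_queue q = nullptr;
//   cuda_piQueueCreate(ctx, dev, PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &q);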
2463 
2464 pi_result cuda_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name,
2465  size_t param_value_size, void *param_value,
2466  size_t *param_value_size_ret) {
2467  assert(command_queue != nullptr);
2468 
2469  switch (param_name) {
2470  case PI_QUEUE_INFO_CONTEXT:
2471  return getInfo(param_value_size, param_value, param_value_size_ret,
2472  command_queue->context_);
2473  case PI_QUEUE_INFO_DEVICE:
2474  return getInfo(param_value_size, param_value, param_value_size_ret,
2475  command_queue->device_);
2476  case PI_QUEUE_INFO_REFERENCE_COUNT:
2477  return getInfo(param_value_size, param_value, param_value_size_ret,
2478  command_queue->get_reference_count());
2479  case PI_QUEUE_INFO_PROPERTIES:
2480  return getInfo(param_value_size, param_value, param_value_size_ret,
2481  command_queue->properties_);
2482  default:
2483  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2484  }
2485  cl::sycl::detail::pi::die("Queue info request not implemented");
2486  return {};
2487 }
2488 
2489 pi_result cuda_piQueueRetain(pi_queue command_queue) {
2490  assert(command_queue != nullptr);
2491  assert(command_queue->get_reference_count() > 0);
2492 
2493  command_queue->increment_reference_count();
2494  return PI_SUCCESS;
2495 }
2496 
2497 pi_result cuda_piQueueRelease(pi_queue command_queue) {
2498  assert(command_queue != nullptr);
2499 
2500  if (command_queue->decrement_reference_count() > 0) {
2501  return PI_SUCCESS;
2502  }
2503 
2504  try {
2505  std::unique_ptr<_pi_queue> queueImpl(command_queue);
2506 
2507  if (!command_queue->backend_has_ownership())
2508  return PI_SUCCESS;
2509 
2510  ScopedContext active(command_queue->get_context());
2511 
2512  command_queue->for_each_stream([](CUstream s) {
2513  PI_CHECK_ERROR(cuStreamSynchronize(s));
2514  PI_CHECK_ERROR(cuStreamDestroy(s));
2515  });
2516 
2517  return PI_SUCCESS;
2518  } catch (pi_result err) {
2519  return err;
2520  } catch (...) {
2521  return PI_ERROR_OUT_OF_RESOURCES;
2522  }
2523 }
2524 
2525 pi_result cuda_piQueueFinish(pi_queue command_queue) {
2526  pi_result result = PI_SUCCESS;
2527 
2528  try {
2529 
2530  assert(command_queue !=
2531  nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code
2532  ScopedContext active(command_queue->get_context());
2533 
2534  command_queue->sync_streams([&result](CUstream s) {
2535  result = PI_CHECK_ERROR(cuStreamSynchronize(s));
2536  });
2537 
2538  } catch (pi_result err) {
2539 
2540  result = err;
2541 
2542  } catch (...) {
2543 
2544  result = PI_ERROR_OUT_OF_RESOURCES;
2545  }
2546 
2547  return result;
2548 }
2549 
2550 // There is no CUDA counterpart for queue flushing and we don't run into the
2551 // same problem of having to flush cross-queue dependencies as some of the
2552 // other plugins, so it can be left as no-op.
2553 pi_result cuda_piQueueFlush(pi_queue command_queue) {
2554  (void)command_queue;
2555  return PI_SUCCESS;
2556 }
2557 
2564 pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
2565  pi_native_handle *nativeHandle) {
2566  ScopedContext active(queue->get_context());
2567  *nativeHandle =
2568  reinterpret_cast<pi_native_handle>(queue->get_next_compute_stream());
2569  return PI_SUCCESS;
2570 }
2571 
2582 pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle,
2583  pi_context context,
2584  pi_device device,
2585  bool ownNativeHandle,
2586  pi_queue *queue) {
2587  (void)device;
2588  (void)ownNativeHandle;
2589  assert(ownNativeHandle == false);
2590 
2591  unsigned int flags;
2592  CUstream cuStream = reinterpret_cast<CUstream>(nativeHandle);
2593 
2594  auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));
2595 
2596  pi_queue_properties properties = 0;
2597  if (flags == CU_STREAM_DEFAULT)
2598  properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
2599  else if (flags == CU_STREAM_NON_BLOCKING)
2600  properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT;
2601  else
2602  cl::sycl::detail::pi::die("Unknown cuda stream");
2603 
2604  std::vector<CUstream> computeCuStreams(1, cuStream);
2605  std::vector<CUstream> transferCuStreams(0);
2606 
2607  // Create queue and set num_compute_streams to 1, as computeCuStreams has
2608  // a valid stream
2609  *queue = new _pi_queue{std::move(computeCuStreams),
2610  std::move(transferCuStreams),
2611  context,
2612  context->get_device(),
2613  properties,
2614  flags,
2615  /*backend_owns*/ false};
2616  (*queue)->num_compute_streams_ = 1;
2617 
2618  return retErr;
2619 }
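// Usage sketch (editorial): adopting an application-created CUstream as the
// single compute stream of a new queue.
//
//   CUstream s = nullptr;
//   cuStreamCreate(&s, CU_STREAM_DEFAULT);
//   pi_queue q = nullptr;
//   cuda_piextQueueCreateWithNativeHandle(
//       reinterpret_cast<pi_native_handle>(s), ctx, dev,
//       /*ownNativeHandle=*/false, &q);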
2620 
2621 pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
2622  pi_bool blocking_write, size_t offset,
2623  size_t size, const void *ptr,
2624  pi_uint32 num_events_in_wait_list,
2625  const pi_event *event_wait_list,
2626  pi_event *event) {
2627 
2628  assert(buffer != nullptr);
2629  assert(command_queue != nullptr);
2630  pi_result retErr = PI_SUCCESS;
2631  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
2632  std::unique_ptr<_pi_event> retImplEv{nullptr};
2633 
2634  try {
2635  ScopedContext active(command_queue->get_context());
2636  CUstream cuStream = command_queue->get_next_transfer_stream();
2637 
2638  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2639  event_wait_list);
2640 
2641  if (event) {
2642  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2643  PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream));
2644  retImplEv->start();
2645  }
2646 
2647  retErr =
2648  PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream));
2649 
2650  if (event) {
2651  retErr = retImplEv->record();
2652  }
2653 
2654  if (blocking_write) {
2655  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2656  }
2657 
2658  if (event) {
2659  *event = retImplEv.release();
2660  }
2661  } catch (pi_result err) {
2662  retErr = err;
2663  }
2664  return retErr;
2665 }
2666 
2667 pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
2668  pi_bool blocking_read, size_t offset,
2669  size_t size, void *ptr,
2670  pi_uint32 num_events_in_wait_list,
2671  const pi_event *event_wait_list,
2672  pi_event *event) {
2673 
2674  assert(buffer != nullptr);
2675  assert(command_queue != nullptr);
2676  pi_result retErr = PI_SUCCESS;
2677  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
2678  std::unique_ptr<_pi_event> retImplEv{nullptr};
2679 
2680  try {
2681  ScopedContext active(command_queue->get_context());
2682  CUstream cuStream = command_queue->get_next_transfer_stream();
2683 
2684  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
2685  event_wait_list);
2686 
2687  if (event) {
2688  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
2689  PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream));
2690  retImplEv->start();
2691  }
2692 
2693  retErr =
2694  PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream));
2695 
2696  if (event) {
2697  retErr = retImplEv->record();
2698  }
2699 
2700  if (blocking_read) {
2701  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
2702  }
2703 
2704  if (event) {
2705  *event = retImplEv.release();
2706  }
2707 
2708  } catch (pi_result err) {
2709  retErr = err;
2710  }
2711  return retErr;
2712 }
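// Usage sketch (editorial): a blocking write followed by a blocking read on
// the same buffer; both calls synchronize on the transfer stream before
// returning.
//
//   std::vector<int> in(256, 42), out(256, 0);
//   cuda_piEnqueueMemBufferWrite(q, buf, PI_TRUE, 0, in.size() * sizeof(int),
//                                in.data(), 0, nullptr, nullptr);
//   cuda_piEnqueueMemBufferRead(q, buf, PI_TRUE, 0, out.size() * sizeof(int),
//                               out.data(), 0, nullptr, nullptr);
//   assert(out == in);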
2713 
2714 pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
2715 
2716  try {
2717  assert(num_events != 0);
2718  assert(event_list);
2719  if (num_events == 0) {
2720  return PI_ERROR_INVALID_VALUE;
2721  }
2722 
2723  if (!event_list) {
2724  return PI_ERROR_INVALID_EVENT;
2725  }
2726 
2727  auto context = event_list[0]->get_context();
2728  ScopedContext active(context);
2729 
2730  auto waitFunc = [context](pi_event event) -> pi_result {
2731  if (!event) {
2732  return PI_ERROR_INVALID_EVENT;
2733  }
2734 
2735  if (event->get_context() != context) {
2736  return PI_ERROR_INVALID_CONTEXT;
2737  }
2738 
2739  return event->wait();
2740  };
2741  return forLatestEvents(event_list, num_events, waitFunc);
2742  } catch (pi_result err) {
2743  return err;
2744  } catch (...) {
2745  return PI_ERROR_OUT_OF_RESOURCES;
2746  }
2747 }
2748 
2749 pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name,
2750  pi_kernel *kernel) {
2751  assert(kernel != nullptr);
2752  assert(program != nullptr);
2753 
2754  pi_result retErr = PI_SUCCESS;
2755  std::unique_ptr<_pi_kernel> retKernel{nullptr};
2756 
2757  try {
2758  ScopedContext active(program->get_context());
2759 
2760  CUfunction cuFunc;
2761  retErr = PI_CHECK_ERROR(
2762  cuModuleGetFunction(&cuFunc, program->get(), kernel_name));
2763 
2764  std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset";
2765  CUfunction cuFuncWithOffsetParam;
2766  CUresult offsetRes = cuModuleGetFunction(
2767  &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str());
2768 
2769  // If there is no kernel with global offset parameter we mark it as missing
2770  if (offsetRes == CUDA_ERROR_NOT_FOUND) {
2771  cuFuncWithOffsetParam = nullptr;
2772  } else {
2773  retErr = PI_CHECK_ERROR(offsetRes);
2774  }
2775 
2776  retKernel = std::unique_ptr<_pi_kernel>(
2777  new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program,
2778  program->get_context()});
2779  } catch (pi_result err) {
2780  retErr = err;
2781  } catch (...) {
2782  retErr = PI_ERROR_OUT_OF_HOST_MEMORY;
2783  }
2784 
2785  *kernel = retKernel.release();
2786  return retErr;
2787 }
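// Usage sketch (editorial): resolving a kernel by name. A companion entry
// point named "<kernel_name>_with_offset" is looked up for global-offset
// launches and left null when the compiler did not emit one.
//
//   pi_kernel k = nullptr;
//   cuda_piKernelCreate(prog, "vector_add", &k);
//   // k->get_with_offset_parameter() is non-null iff the module also
//   // contains "vector_add_with_offset"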
2788 
2789 pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index,
2790  size_t arg_size, const void *arg_value) {
2791 
2792  assert(kernel != nullptr);
2793  pi_result retErr = PI_SUCCESS;
2794  try {
2795  if (arg_value) {
2796  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
2797  } else {
2798  kernel->set_kernel_local_arg(arg_index, arg_size);
2799  }
2800  } catch (pi_result err) {
2801  retErr = err;
2802  }
2803  return retErr;
2804 }
2805 
2806 pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,
2807  const pi_mem *arg_value) {
2808 
2809  assert(kernel != nullptr);
2810  assert(arg_value != nullptr);
2811 
2812  pi_result retErr = PI_SUCCESS;
2813  try {
2814  pi_mem arg_mem = *arg_value;
2815  if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) {
2816  CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
2817  PI_CHECK_ERROR(cuArray3DGetDescriptor(
2818  &arrayDesc, arg_mem->mem_.surface_mem_.get_array()));
2819  if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
2820  arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
2821  arrayDesc.Format != CU_AD_FORMAT_HALF &&
2822  arrayDesc.Format != CU_AD_FORMAT_FLOAT) {
2823  cl::sycl::detail::pi::die(
2824  "PI CUDA kernels only support images with channel types int32, "
2825  "uint32, float, and half.");
2826  }
2827  CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface();
2828  kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf);
2829  } else {
2830  CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get();
2831  kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr);
2832  }
2833  } catch (pi_result err) {
2834  retErr = err;
2835  }
2836  return retErr;
2837 }
2838 
2839 pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index,
2840  const pi_sampler *arg_value) {
2841 
2842  assert(kernel != nullptr);
2843  assert(arg_value != nullptr);
2844 
2845  pi_result retErr = PI_SUCCESS;
2846  try {
2847  pi_uint32 samplerProps = (*arg_value)->props_;
2848  kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps);
2849  } catch (pi_result err) {
2850  retErr = err;
2851  }
2852  return retErr;
2853 }
2854 
2855 pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
2856  pi_kernel_group_info param_name,
2857  size_t param_value_size, void *param_value,
2858  size_t *param_value_size_ret) {
2859 
2860  // Here we want to query about a kernel's cuda blocks!
2861 
2862  if (kernel != nullptr) {
2863 
2864  switch (param_name) {
2865  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
2866  int max_threads = 0;
2867  cl::sycl::detail::pi::assertion(
2868  cuFuncGetAttribute(&max_threads,
2869  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
2870  kernel->get()) == CUDA_SUCCESS);
2871  return getInfo(param_value_size, param_value, param_value_size_ret,
2872  size_t(max_threads));
2873  }
2874  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
2875  size_t group_size[3] = {0, 0, 0};
2876  const auto &reqd_wg_size_md_map =
2877  kernel->program_->kernelReqdWorkGroupSizeMD_;
2878  const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
2879  if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
2880  const auto reqd_wg_size = reqd_wg_size_md->second;
2881  group_size[0] = std::get<0>(reqd_wg_size);
2882  group_size[1] = std::get<1>(reqd_wg_size);
2883  group_size[2] = std::get<2>(reqd_wg_size);
2884  }
2885  return getInfoArray(3, param_value_size, param_value,
2886  param_value_size_ret, group_size);
2887  }
2888  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
2889  // OpenCL LOCAL == CUDA SHARED
2890  int bytes = 0;
2891  cl::sycl::detail::pi::assertion(
2892  cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
2893  kernel->get()) == CUDA_SUCCESS);
2894  return getInfo(param_value_size, param_value, param_value_size_ret,
2895  pi_uint64(bytes));
2896  }
2897  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
2898  // Work groups should be multiples of the warp size
2899  int warpSize = 0;
2900  cl::sycl::detail::pi::assertion(
2901  cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
2902  device->get()) == CUDA_SUCCESS);
2903  return getInfo(param_value_size, param_value, param_value_size_ret,
2904  static_cast<size_t>(warpSize));
2905  }
2906  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
2907  // OpenCL PRIVATE == CUDA LOCAL
2908  int bytes = 0;
2909  cl::sycl::detail::pi::assertion(
2910  cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
2911  kernel->get()) == CUDA_SUCCESS);
2912  return getInfo(param_value_size, param_value, param_value_size_ret,
2913  pi_uint64(bytes));
2914  }
2915  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
2916  int numRegs = 0;
2917  cl::sycl::detail::pi::assertion(
2918  cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
2919  kernel->get()) == CUDA_SUCCESS);
2920  return getInfo(param_value_size, param_value, param_value_size_ret,
2921  pi_uint32(numRegs));
2922  }
2923  default:
2924  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
2925  }
2926  }
2927 
2928  return PI_ERROR_INVALID_KERNEL;
2929 }
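// Usage sketch (editorial): querying the per-kernel work-group limit, which
// the switch above maps to CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK.
//
//   size_t wg_size = 0;
//   cuda_piKernelGetGroupInfo(k, dev, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE,
//                             sizeof(wg_size), &wg_size, nullptr);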
2930 
2931 pi_result cuda_piEnqueueKernelLaunch(
2932  pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim,
2933  const size_t *global_work_offset, const size_t *global_work_size,
2934  const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
2935  const pi_event *event_wait_list, pi_event *event) {
2936 
2937  // Preconditions
2938  assert(command_queue != nullptr);
2939  assert(command_queue->get_context() == kernel->get_context());
2940  assert(kernel != nullptr);
2941  assert(global_work_offset != nullptr);
2942  assert(work_dim > 0);
2943  assert(work_dim < 4);
2944 
2945  // Set the number of threads per block to the number of threads per warp
2946  // by default, unless the user has provided a better number
2947  size_t threadsPerBlock[3] = {32u, 1u, 1u};
2948  size_t maxWorkGroupSize = 0u;
2949  size_t maxThreadsPerBlock[3] = {};
2950  bool providedLocalWorkGroupSize = (local_work_size != nullptr);
2951  pi_uint32 local_size = kernel->get_local_size();
2952  pi_result retError = PI_SUCCESS;
2953 
2954  try {
2955  // Set the active context here as guessLocalWorkSize needs an active context
2956  ScopedContext active(command_queue->get_context());
2957  {
2958  size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_;
2959  maxWorkGroupSize = command_queue->device_->get_max_work_group_size();
2960  command_queue->device_->get_max_work_item_sizes(
2961  sizeof(maxThreadsPerBlock), maxThreadsPerBlock);
2962 
2963  if (providedLocalWorkGroupSize) {
2964  auto isValid = [&](int dim) {
2965  if (reqdThreadsPerBlock[dim] != 0 &&
2966  local_work_size[dim] != reqdThreadsPerBlock[dim])
2967  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2968 
2969  if (local_work_size[dim] > maxThreadsPerBlock[dim])
2970  return PI_ERROR_INVALID_WORK_ITEM_SIZE;
2971  // Check that the local work sizes divide the global work sizes evenly,
2972  // which also implies that they are neither larger than the global work
2973  // sizes nor zero.
2974  if (0u == local_work_size[dim])
2975  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2976  if (0u != (global_work_size[dim] % local_work_size[dim]))
2977  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2978  threadsPerBlock[dim] = local_work_size[dim];
2979  return PI_SUCCESS;
2980  };
2981 
2982  for (size_t dim = 0; dim < work_dim; dim++) {
2983  auto err = isValid(dim);
2984  if (err != PI_SUCCESS)
2985  return err;
2986  }
2987  } else {
2988  guessLocalWorkSize(threadsPerBlock, global_work_size,
2989  maxThreadsPerBlock, kernel, local_size);
2990  }
2991  }
2992 
2993  if (maxWorkGroupSize <
2994  size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
2995  return PI_ERROR_INVALID_WORK_GROUP_SIZE;
2996  }
2997 
2998  size_t blocksPerGrid[3] = {1u, 1u, 1u};
2999 
3000  for (size_t i = 0; i < work_dim; i++) {
3001  blocksPerGrid[i] =
3002  (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
3003  }
3004 
3005  std::unique_ptr<_pi_event> retImplEv{nullptr};
3006 
3007  pi_uint32 stream_token;
3008  _pi_stream_guard guard;
3009  CUstream cuStream = command_queue->get_next_compute_stream(
3010  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3011  CUfunction cuFunc = kernel->get();
3012 
3013  retError = enqueueEventsWait(command_queue, cuStream,
3014  num_events_in_wait_list, event_wait_list);
3015 
3016  // Set the implicit global offset parameter if kernel has offset variant
3017  if (kernel->get_with_offset_parameter()) {
3018  std::uint32_t cuda_implicit_offset[3] = {0, 0, 0};
3019  if (global_work_offset) {
3020  for (size_t i = 0; i < work_dim; i++) {
3021  cuda_implicit_offset[i] =
3022  static_cast<std::uint32_t>(global_work_offset[i]);
3023  if (global_work_offset[i] != 0) {
3024  cuFunc = kernel->get_with_offset_parameter();
3025  }
3026  }
3027  }
3028  kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset),
3029  cuda_implicit_offset);
3030  }
3031 
3032  auto &argIndices = kernel->get_arg_indices();
3033 
3034  if (event) {
3035  retImplEv = std::unique_ptr<_pi_event>(
3036  _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue,
3037  cuStream, stream_token));
3038  retImplEv->start();
3039  }
3040 
3041  // Set local mem max size if env var is present
3042  static const char *local_mem_sz_ptr =
3043  std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE");
3044 
3045  if (local_mem_sz_ptr) {
3046  int device_max_local_mem = 0;
3047  cuDeviceGetAttribute(
3048  &device_max_local_mem,
3049  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
3050  command_queue->get_device()->get());
3051 
3052  static const int env_val = std::atoi(local_mem_sz_ptr);
3053  if (env_val <= 0 || env_val > device_max_local_mem) {
3054  setErrorMessage("Invalid value specified for "
3055  "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
3056  PI_ERROR_PLUGIN_SPECIFIC_ERROR);
3057  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
3058  }
3059  PI_CHECK_ERROR(cuFuncSetAttribute(
3060  cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val));
3061  }
3062 
3063  retError = PI_CHECK_ERROR(cuLaunchKernel(
3064  cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2],
3065  threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size,
3066  cuStream, const_cast<void **>(argIndices.data()), nullptr));
3067  if (local_size != 0)
3068  kernel->clear_local_size();
3069 
3070  if (event) {
3071  retError = retImplEv->record();
3072  *event = retImplEv.release();
3073  }
3074  } catch (pi_result err) {
3075  retError = err;
3076  }
3077  return retError;
3078 }
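// Worked example (editorial) of the grid computation above: with
// global_work_size = {1000} and threadsPerBlock = {32, 1, 1}, the launch
// uses ceil(1000 / 32) blocks:
//
//   size_t blocks = (1000 + 32 - 1) / 32;  // == 32 blocks, 1024 threads
//
// so the kernel must guard against the 24 extra work items.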
3079 
3081 pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t,
3082  pi_uint32, const pi_mem *, const void **,
3083  pi_uint32, const pi_event *, pi_event *) {
3084  cl::sycl::detail::pi::die("Not implemented in CUDA backend");
3085  return {};
3086 }
3087 
3088 pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context,
3089  pi_program, bool,
3090  pi_kernel *) {
3091  sycl::detail::pi::die("Unsupported operation");
3092  return PI_SUCCESS;
3093 }
3094 
3096 pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags,
3097  const pi_image_format *image_format,
3098  const pi_image_desc *image_desc, void *host_ptr,
3099  pi_mem *ret_mem) {
3100  // Need input memory object
3101  assert(ret_mem != nullptr);
3102  const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
3103  ((flags & PI_MEM_FLAGS_HOST_PTR_USE));
3104  pi_result retErr = PI_SUCCESS;
3105 
3106  // We only support RGBA channel order
3107  // TODO: check SYCL CTS and spec. May also have to support BGRA
3108  if (image_format->image_channel_order !=
3109  pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) {
3110  cl::sycl::detail::pi::die(
3111  "cuda_piMemImageCreate only supports RGBA channel order");
3112  }
3113 
3114  // We have to use cuArray3DCreate, which has some caveats. The height and
3115  // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc gives
3116  // a minimum value of 1, so we need to convert the answer.
3117  CUDA_ARRAY3D_DESCRIPTOR array_desc;
3118  array_desc.NumChannels = 4; // Only support 4 channel image
3119  array_desc.Flags = 0; // No flags required
3120  array_desc.Width = image_desc->image_width;
3121  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3122  array_desc.Height = 0;
3123  array_desc.Depth = 0;
3124  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3125  array_desc.Height = image_desc->image_height;
3126  array_desc.Depth = 0;
3127  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3128  array_desc.Height = image_desc->image_height;
3129  array_desc.Depth = image_desc->image_depth;
3130  }
3131 
3132  // We need to get this now in bytes for calculating the total image size later
3133  size_t pixel_type_size_bytes;
3134 
3135  switch (image_format->image_channel_data_type) {
3136  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8:
3137  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
3138  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
3139  pixel_type_size_bytes = 1;
3140  break;
3141  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
3142  array_desc.Format = CU_AD_FORMAT_SIGNED_INT8;
3143  pixel_type_size_bytes = 1;
3144  break;
3145  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16:
3146  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
3147  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
3148  pixel_type_size_bytes = 2;
3149  break;
3150  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
3151  array_desc.Format = CU_AD_FORMAT_SIGNED_INT16;
3152  pixel_type_size_bytes = 2;
3153  break;
3154  case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
3155  array_desc.Format = CU_AD_FORMAT_HALF;
3156  pixel_type_size_bytes = 2;
3157  break;
3158  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
3159  array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
3160  pixel_type_size_bytes = 4;
3161  break;
3162  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
3163  array_desc.Format = CU_AD_FORMAT_SIGNED_INT32;
3164  pixel_type_size_bytes = 4;
3165  break;
3166  case PI_IMAGE_CHANNEL_TYPE_FLOAT:
3167  array_desc.Format = CU_AD_FORMAT_FLOAT;
3168  pixel_type_size_bytes = 4;
3169  break;
3170  default:
3171  cl::sycl::detail::pi::die(
3172  "cuda_piMemImageCreate given unsupported image_channel_data_type");
3173  }
3174 
3175  // When a dimension isn't used image_desc has the size set to 1
3176  size_t pixel_size_bytes =
3177  pixel_type_size_bytes * 4; // 4 is the only number of channels we support
3178  size_t image_size_bytes = pixel_size_bytes * image_desc->image_width *
3179  image_desc->image_height * image_desc->image_depth;
3180 
3181  ScopedContext active(context);
3182  CUarray image_array;
3183  retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc));
3184 
3185  try {
3186  if (performInitialCopy) {
3187  // We have to use a different copy function for each image dimensionality
3188  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
3189  retErr = PI_CHECK_ERROR(
3190  cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes));
3191  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
3192  CUDA_MEMCPY2D cpy_desc;
3193  memset(&cpy_desc, 0, sizeof(cpy_desc));
3194  cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3195  cpy_desc.srcHost = host_ptr;
3196  cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3197  cpy_desc.dstArray = image_array;
3198  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3199  cpy_desc.Height = image_desc->image_height;
3200  retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc));
3201  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
3202  CUDA_MEMCPY3D cpy_desc;
3203  memset(&cpy_desc, 0, sizeof(cpy_desc));
3204  cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
3205  cpy_desc.srcHost = host_ptr;
3206  cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
3207  cpy_desc.dstArray = image_array;
3208  cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
3209  cpy_desc.Height = image_desc->image_height;
3210  cpy_desc.Depth = image_desc->image_depth;
3211  retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc));
3212  }
3213  }
3214 
3215  // CUDA_RESOURCE_DESC is a union of different structs, shown here
3216  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
3217  // We need to fill it as described here to use it for a surface or texture
3218  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
3219  // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
3220  // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array
3221  // handle.
3222  // CUDA_RESOURCE_DESC::flags must be set to zero
3223 
3224  CUDA_RESOURCE_DESC image_res_desc;
3225  image_res_desc.res.array.hArray = image_array;
3226  image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
3227  image_res_desc.flags = 0;
3228 
3229  CUsurfObject surface;
3230  retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
3231 
3232  auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
3233  context, image_array, surface, image_desc->image_type, host_ptr});
3234 
3235  if (piMemObj == nullptr) {
3236  return PI_ERROR_OUT_OF_HOST_MEMORY;
3237  }
3238 
3239  *ret_mem = piMemObj.release();
3240  } catch (pi_result err) {
3241  cuArrayDestroy(image_array);
3242  return err;
3243  } catch (...) {
3244  cuArrayDestroy(image_array);
3245  return PI_ERROR_UNKNOWN;
3246  }
3247 
3248  return retErr;
3249 }
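// Worked example (editorial) of the size computation above: a 64x64 2D image
// with PI_IMAGE_CHANNEL_TYPE_FLOAT occupies
//
//   pixel_size_bytes = 4 bytes/channel * 4 channels = 16
//   image_size_bytes = 16 * 64 * 64 * 1 = 65536 bytes
//
// where the trailing 1 is the unused depth that image_desc reports as 1.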
3250 
3252 pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *,
3253  size_t *) {
3254  cl::sycl::detail::pi::die("cuda_piMemImageGetInfo not implemented");
3255  return {};
3256 }
3257 
3258 pi_result cuda_piMemRetain(pi_mem mem) {
3259  assert(mem != nullptr);
3260  assert(mem->get_reference_count() > 0);
3261  mem->increment_reference_count();
3262  return PI_SUCCESS;
3263 }
3264 
3268 pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **,
3269  const size_t *, pi_program *) {
3270  cl::sycl::detail::pi::die(
3271  "cuda_piclProgramCreateWithSource not implemented");
3272  return PI_ERROR_INVALID_OPERATION;
3273 }
3274 
3279 pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices,
3280  const pi_device *device_list, const char *options,
3281  void (*pfn_notify)(pi_program program,
3282  void *user_data),
3283  void *user_data) {
3284 
3285  assert(program != nullptr);
3286  assert(num_devices == 1 || num_devices == 0);
3287  assert(device_list != nullptr || num_devices == 0);
3288  assert(pfn_notify == nullptr);
3289  assert(user_data == nullptr);
3290  pi_result retError = PI_SUCCESS;
3291 
3292  try {
3293  ScopedContext active(program->get_context());
3294 
3295  program->build_program(options);
3296 
3297  } catch (pi_result err) {
3298  retError = err;
3299  }
3300  return retError;
3301 }
3302 
3305  cl::sycl::detail::pi::die("cuda_piProgramCreate not implemented");
3306  return {};
3307 }
3308 
3315 pi_result cuda_piProgramCreateWithBinary(
3316  pi_context context, pi_uint32 num_devices, const pi_device *device_list,
3317  const size_t *lengths, const unsigned char **binaries,
3318  size_t num_metadata_entries, const pi_device_binary_property *metadata,
3319  pi_int32 *binary_status, pi_program *program) {
3320  // Ignore unused parameter
3321  (void)binary_status;
3322 
3323  assert(context != nullptr);
3324  assert(binaries != nullptr);
3325  assert(program != nullptr);
3326  assert(device_list != nullptr);
3327  assert(num_devices == 1 && "CUDA contexts are for a single device");
3328  assert((context->get_device()->get() == device_list[0]->get()) &&
3329  "Mismatch between devices context and passed context when creating "
3330  "program from binary");
3331 
3332  pi_result retError = PI_SUCCESS;
3333 
3334  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3335 
3336  retProgram->set_metadata(metadata, num_metadata_entries);
3337 
3338  const bool has_length = (lengths != nullptr);
3339  size_t length = has_length
3340  ? lengths[0]
3341  : strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
3342 
3343  assert(length != 0);
3344 
3345  retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
3346 
3347  *program = retProgram.release();
3348 
3349  return retError;
3350 }
3351 
3352 pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
3353  size_t param_value_size, void *param_value,
3354  size_t *param_value_size_ret) {
3355  assert(program != nullptr);
3356 
3357  switch (param_name) {
3358  case PI_PROGRAM_INFO_REFERENCE_COUNT:
3359  return getInfo(param_value_size, param_value, param_value_size_ret,
3360  program->get_reference_count());
3361  case PI_PROGRAM_INFO_CONTEXT:
3362  return getInfo(param_value_size, param_value, param_value_size_ret,
3363  program->context_);
3364  case PI_PROGRAM_INFO_NUM_DEVICES:
3365  return getInfo(param_value_size, param_value, param_value_size_ret, 1u);
3366  case PI_PROGRAM_INFO_DEVICES:
3367  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3368  &program->context_->deviceId_);
3369  case PI_PROGRAM_INFO_SOURCE:
3370  return getInfo(param_value_size, param_value, param_value_size_ret,
3371  program->binary_);
3372  case PI_PROGRAM_INFO_BINARY_SIZES:
3373  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3374  &program->binarySizeInBytes_);
3375  case PI_PROGRAM_INFO_BINARIES:
3376  return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
3377  &program->binary_);
3378  case PI_PROGRAM_INFO_KERNEL_NAMES: {
3379  return getInfo(param_value_size, param_value, param_value_size_ret,
3380  getKernelNames(program).c_str());
3381  }
3382  default:
3383  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3384  }
3385  cl::sycl::detail::pi::die("Program info request not implemented");
3386  return {};
3387 }
3388 
3393 pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
3394  const pi_device *device_list, const char *options,
3395  pi_uint32 num_input_programs,
3396  const pi_program *input_programs,
3397  void (*pfn_notify)(pi_program program,
3398  void *user_data),
3399  void *user_data, pi_program *ret_program) {
3400 
3401  assert(ret_program != nullptr);
3402  assert(num_devices == 1 || num_devices == 0);
3403  assert(device_list != nullptr || num_devices == 0);
3404  assert(pfn_notify == nullptr);
3405  assert(user_data == nullptr);
3406  pi_result retError = PI_SUCCESS;
3407 
3408  try {
3409  ScopedContext active(context);
3410 
3411  CUlinkState state;
3412  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
3413 
3414  retError = PI_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state));
3415  try {
3416  for (size_t i = 0; i < num_input_programs; ++i) {
3417  pi_program program = input_programs[i];
3418  retError = PI_CHECK_ERROR(cuLinkAddData(
3419  state, CU_JIT_INPUT_PTX, const_cast<char *>(program->binary_),
3420  program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr));
3421  }
3422  void *cubin = nullptr;
3423  size_t cubinSize = 0;
3424  retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));
3425 
3426  retError =
3427  retProgram->set_binary(static_cast<const char *>(cubin), cubinSize);
3428 
3429  if (retError != PI_SUCCESS) {
3430  return retError;
3431  }
3432 
3433  retError = retProgram->build_program(options);
3434 
3435  if (retError != PI_SUCCESS) {
3436  return retError;
3437  }
3438  } catch (...) {
3439  // Upon error attempt cleanup
3440  PI_CHECK_ERROR(cuLinkDestroy(state));
3441  throw;
3442  }
3443 
3444  retError = PI_CHECK_ERROR(cuLinkDestroy(state));
3445  *ret_program = retProgram.release();
3446 
3447  } catch (pi_result err) {
3448  retError = err;
3449  }
3450  return retError;
3451 }
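// Usage sketch (editorial): linking two PTX-backed programs into a single
// cubin-backed program via the cuLink* flow above.
//
//   pi_program parts[] = {progA, progB};
//   pi_program linked = nullptr;
//   cuda_piProgramLink(ctx, 1, &dev, /*options=*/nullptr, 2, parts,
//                      nullptr, nullptr, &linked);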
3452 
3457 pi_result cuda_piProgramCompile(
3458  pi_program program, pi_uint32 num_devices, const pi_device *device_list,
3459  const char *options, pi_uint32 num_input_headers,
3460  const pi_program *input_headers, const char **header_include_names,
3461  void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
3462  // Ignore unused parameters
3463  (void)header_include_names;
3464  (void)input_headers;
3465 
3466  assert(program != nullptr);
3467  assert(num_devices == 1 || num_devices == 0);
3468  assert(device_list != nullptr || num_devices == 0);
3469  assert(pfn_notify == nullptr);
3470  assert(user_data == nullptr);
3471  assert(num_input_headers == 0);
3472  pi_result retError = PI_SUCCESS;
3473 
3474  try {
3475  ScopedContext active(program->get_context());
3476 
3477  program->build_program(options);
3478 
3479  } catch (pi_result err) {
3480  retError = err;
3481  }
3482  return retError;
3483 }
3484 
3485 pi_result cuda_piProgramGetBuildInfo(pi_program program, pi_device device,
3486  pi_program_build_info param_name,
3487  size_t param_value_size, void *param_value,
3488  size_t *param_value_size_ret) {
3489  // Ignore unused parameter
3490  (void)device;
3491 
3492  assert(program != nullptr);
3493 
3494  switch (param_name) {
3495  case PI_PROGRAM_BUILD_INFO_STATUS: {
3496  return getInfo(param_value_size, param_value, param_value_size_ret,
3497  program->buildStatus_);
3498  }
3499  case PI_PROGRAM_BUILD_INFO_OPTIONS:
3500  return getInfo(param_value_size, param_value, param_value_size_ret,
3501  program->buildOptions_.c_str());
3502  case PI_PROGRAM_BUILD_INFO_LOG:
3503  return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value,
3504  param_value_size_ret, program->infoLog_);
3505  default:
3506  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3507  }
3508  cl::sycl::detail::pi::die("Program Build info request not implemented");
3509  return {};
3510 }
3511 
3512 pi_result cuda_piProgramRetain(pi_program program) {
3513  assert(program != nullptr);
3514  assert(program->get_reference_count() > 0);
3515  program->increment_reference_count();
3516  return PI_SUCCESS;
3517 }
3518 
3522 pi_result cuda_piProgramRelease(pi_program program) {
3523  assert(program != nullptr);
3524 
3525  // double delete or someone is messing with the ref count.
3526  // either way, cannot safely proceed.
3527  assert(program->get_reference_count() != 0 &&
3528  "Reference count overflow detected in cuda_piProgramRelease.");
3529 
3530  // decrement ref count. If it is 0, delete the program.
3531  if (program->decrement_reference_count() == 0) {
3532 
3533  std::unique_ptr<_pi_program> program_ptr{program};
3534 
3535  pi_result result = PI_ERROR_INVALID_PROGRAM;
3536 
3537  try {
3538  ScopedContext active(program->get_context());
3539  auto cuModule = program->get();
3540  result = PI_CHECK_ERROR(cuModuleUnload(cuModule));
3541  } catch (...) {
3542  result = PI_ERROR_OUT_OF_RESOURCES;
3543  }
3544 
3545  return result;
3546  }
3547 
3548  return PI_SUCCESS;
3549 }
3550 
3557 pi_result cuda_piextProgramGetNativeHandle(pi_program program,
3558  pi_native_handle *nativeHandle) {
3559  *nativeHandle = reinterpret_cast<pi_native_handle>(program->get());
3560  return PI_SUCCESS;
3561 }
3562 
3572 pi_result cuda_piextProgramCreateWithNativeHandle(pi_native_handle, pi_context,
3573  bool, pi_program *) {
3574  cl::sycl::detail::pi::die(
3575  "Creation of PI program from native handle not implemented");
3576  return {};
3577 }
3578 
3579 pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name,
3580  size_t param_value_size, void *param_value,
3581  size_t *param_value_size_ret) {
3582 
3583  if (kernel != nullptr) {
3584 
3585  switch (param_name) {
3586  case PI_KERNEL_INFO_FUNCTION_NAME:
3587  return getInfo(param_value_size, param_value, param_value_size_ret,
3588  kernel->get_name());
3589  case PI_KERNEL_INFO_NUM_ARGS:
3590  return getInfo(param_value_size, param_value, param_value_size_ret,
3591  kernel->get_num_args());
3592  case PI_KERNEL_INFO_REFERENCE_COUNT:
3593  return getInfo(param_value_size, param_value, param_value_size_ret,
3594  kernel->get_reference_count());
3595  case PI_KERNEL_INFO_CONTEXT: {
3596  return getInfo(param_value_size, param_value, param_value_size_ret,
3597  kernel->get_context());
3598  }
3599  case PI_KERNEL_INFO_PROGRAM: {
3600  return getInfo(param_value_size, param_value, param_value_size_ret,
3601  kernel->get_program());
3602  }
3603  case PI_KERNEL_INFO_ATTRIBUTES: {
3604  return getInfo(param_value_size, param_value, param_value_size_ret, "");
3605  }
3606  default: {
3607  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3608  }
3609  }
3610  }
3611 
3612  return PI_ERROR_INVALID_KERNEL;
3613 }
3614 
3615 pi_result cuda_piKernelGetSubGroupInfo(
3616  pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
3617  size_t input_value_size, const void *input_value, size_t param_value_size,
3618  void *param_value, size_t *param_value_size_ret) {
3619  // Ignore unused parameters
3620  (void)input_value_size;
3621  (void)input_value;
3622 
3623  if (kernel != nullptr) {
3624  switch (param_name) {
3625  case PI_KERNEL_MAX_SUB_GROUP_SIZE: {
3626  // Sub-group size is equivalent to warp size
3627  int warpSize = 0;
3628  cl::sycl::detail::pi::assertion(
3629  cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
3630  device->get()) == CUDA_SUCCESS);
3631  return getInfo(param_value_size, param_value, param_value_size_ret,
3632  static_cast<uint32_t>(warpSize));
3633  }
3634  case PI_KERNEL_MAX_NUM_SUB_GROUPS: {
3635  // Number of sub-groups = max block size / warp size + possible remainder
3636  int max_threads = 0;
3637  cl::sycl::detail::pi::assertion(
3638  cuFuncGetAttribute(&max_threads,
3639  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3640  kernel->get()) == CUDA_SUCCESS);
3641  int warpSize = 0;
3642  cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
3643  0, nullptr, sizeof(uint32_t), &warpSize,
3644  nullptr);
3645  int maxWarps = (max_threads + warpSize - 1) / warpSize;
3646  return getInfo(param_value_size, param_value, param_value_size_ret,
3647  static_cast<uint32_t>(maxWarps));
3648  }
3650  // Return value of 0 => not specified
3651  // TODO: Revisit if PTX is generated for compile-time work-group sizes
3652  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3653  }
3655  // Return value of 0 => unspecified or "auto" sub-group size
3656  // Correct for now, since warp size may be read from special register
3657  // TODO: Return warp size once default is primary sub-group size
3658  // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
3659  return getInfo(param_value_size, param_value, param_value_size_ret, 0);
3660  }
3661  default:
3662  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3663  }
3664  }
3665  return PI_ERROR_INVALID_KERNEL;
3666 }
3667 
3668 pi_result cuda_piKernelRetain(pi_kernel kernel) {
3669  assert(kernel != nullptr);
3670  assert(kernel->get_reference_count() > 0u);
3671 
3672  kernel->increment_reference_count();
3673  return PI_SUCCESS;
3674 }
3675 
3676 pi_result cuda_piKernelRelease(pi_kernel kernel) {
3677  assert(kernel != nullptr);
3678 
3679  // double delete or someone is messing with the ref count.
3680  // either way, cannot safely proceed.
3681  assert(kernel->get_reference_count() != 0 &&
3682  "Reference count overflow detected in cuda_piKernelRelease.");
3683 
3684  // decrement ref count. If it is 0, delete the kernel.
3685  if (kernel->decrement_reference_count() == 0) {
3686  // no internal cuda resources to clean up. Just delete it.
3687  delete kernel;
3688  return PI_SUCCESS;
3689  }
3690 
3691  return PI_SUCCESS;
3692 }
3693 
3694 // A NOP for the CUDA backend
3695 pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t,
3696  const void *) {
3697  return PI_SUCCESS;
3698 }
3699 
3700 pi_result cuda_piextProgramSetSpecializationConstant(pi_program, pi_uint32,
3701  size_t, const void *) {
3702  // This entry point is only used for native specialization constants (SPIR-V),
3703  // and the CUDA plugin is AOT only so this entry point is not supported.
3704  sycl::detail::pi::die(
3705  "Native specialization constants are not supported");
3706  return {};
3707 }
3708 
3709 pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index,
3710  size_t arg_size,
3711  const void *arg_value) {
3712  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
3713  return PI_SUCCESS;
3714 }
3715 
3716 //
3717 // Events
3718 //
3720  cl::sycl::detail::pi::die("PI Event Create not implemented in CUDA backend");
3721 }
3722 
3723 pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
3724  size_t param_value_size, void *param_value,
3725  size_t *param_value_size_ret) {
3726  assert(event != nullptr);
3727 
3728  switch (param_name) {
3729  case PI_EVENT_INFO_COMMAND_QUEUE:
3730  return getInfo(param_value_size, param_value, param_value_size_ret,
3731  event->get_queue());
3732  case PI_EVENT_INFO_COMMAND_TYPE:
3733  return getInfo(param_value_size, param_value, param_value_size_ret,
3734  event->get_command_type());
3735  case PI_EVENT_INFO_REFERENCE_COUNT:
3736  return getInfo(param_value_size, param_value, param_value_size_ret,
3737  event->get_reference_count());
3738  case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
3739  return getInfo(param_value_size, param_value, param_value_size_ret,
3740  static_cast<pi_event_status>(event->get_execution_status()));
3741  }
3742  case PI_EVENT_INFO_CONTEXT:
3743  return getInfo(param_value_size, param_value, param_value_size_ret,
3744  event->get_context());
3745  default:
3746  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3747  }
3748 
3749  return PI_ERROR_INVALID_EVENT;
3750 }
3751 
3754 pi_result cuda_piEventGetProfilingInfo(pi_event event,
3755  pi_profiling_info param_name,
3756  size_t param_value_size,
3757  void *param_value,
3758  size_t *param_value_size_ret) {
3759 
3760  assert(event != nullptr);
3761 
3762  pi_queue queue = event->get_queue();
3763  if (queue == nullptr || !(queue->properties_ & PI_QUEUE_PROFILING_ENABLE)) {
3764  return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE;
3765  }
3766 
3767  switch (param_name) {
3768  case PI_PROFILING_INFO_COMMAND_QUEUED:
3769  case PI_PROFILING_INFO_COMMAND_SUBMIT:
3770  return getInfo<pi_uint64>(param_value_size, param_value,
3771  param_value_size_ret, event->get_queued_time());
3772  case PI_PROFILING_INFO_COMMAND_START:
3773  return getInfo<pi_uint64>(param_value_size, param_value,
3774  param_value_size_ret, event->get_start_time());
3775  case PI_PROFILING_INFO_COMMAND_END:
3776  return getInfo<pi_uint64>(param_value_size, param_value,
3777  param_value_size_ret, event->get_end_time());
3778  default:
3779  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
3780  }
3781  cl::sycl::detail::pi::die("Event Profiling info request not implemented");
3782  return {};
3783 }
3784 
3786  cl::sycl::detail::pi::die("Event Callback not implemented in CUDA backend");
3787  return PI_SUCCESS;
3788 }
3789 
3791  cl::sycl::detail::pi::die("Event Set Status not implemented in CUDA backend");
3792  return PI_ERROR_INVALID_VALUE;
3793 }
3794 
3795 pi_result cuda_piEventRetain(pi_event event) {
3796  assert(event != nullptr);
3797 
3798  const auto refCount = event->increment_reference_count();
3799 
3800  cl::sycl::detail::pi::assertion(
3801  refCount != 0,
3802  "Reference count overflow detected in cuda_piEventRetain.");
3803 
3804  return PI_SUCCESS;
3805 }
3806 
3807 pi_result cuda_piEventRelease(pi_event event) {
3808  assert(event != nullptr);
3809 
3810  // double delete or someone is messing with the ref count.
3811  // either way, cannot safely proceed.
3812  cl::sycl::detail::pi::assertion(
3813  event->get_reference_count() != 0,
3814  "Reference count overflow detected in cuda_piEventRelease.");
3815 
3816  // decrement ref count. If it is 0, delete the event.
3817  if (event->decrement_reference_count() == 0) {
3818  std::unique_ptr<_pi_event> event_ptr{event};
3819  pi_result result = PI_ERROR_INVALID_EVENT;
3820  try {
3821  ScopedContext active(event->get_context());
3822  result = event->release();
3823  } catch (...) {
3824  result = PI_ERROR_OUT_OF_RESOURCES;
3825  }
3826  return result;
3827  }
3828 
3829  return PI_SUCCESS;
3830 }
3831 
3837 pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
3838  pi_uint32 num_events_in_wait_list,
3839  const pi_event *event_wait_list,
3840  pi_event *event) {
3841  return cuda_piEnqueueEventsWaitWithBarrier(
3842  command_queue, num_events_in_wait_list, event_wait_list, event);
3843 }
3844 
3857 pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue,
3858  pi_uint32 num_events_in_wait_list,
3859  const pi_event *event_wait_list,
3860  pi_event *event) {
3861  if (!command_queue) {
3862  return PI_ERROR_INVALID_QUEUE;
3863  }
3864 
3865  try {
3866  ScopedContext active(command_queue->get_context());
3867 
3868  if (event_wait_list) {
3869  auto result =
3870  forLatestEvents(event_wait_list, num_events_in_wait_list,
3871  [command_queue](pi_event event) -> pi_result {
3872  if (event->get_queue()->has_been_synchronized(
3873  event->get_stream_token())) {
3874  return PI_SUCCESS;
3875  } else {
3876  return enqueueEventWait(command_queue, event);
3877  }
3878  });
3879 
3880  if (result != PI_SUCCESS) {
3881  return result;
3882  }
3883  }
3884 
3885  if (event) {
3886  pi_uint32 stream_token;
3887  _pi_stream_guard guard;
3888  CUstream cuStream = command_queue->get_next_compute_stream(
3889  num_events_in_wait_list, event_wait_list, guard, &stream_token);
3890  *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue,
3891  cuStream, stream_token);
3892  (*event)->start();
3893  (*event)->record();
3894  }
3895 
3896  return PI_SUCCESS;
3897  } catch (pi_result err) {
3898  return err;
3899  } catch (...) {
3900  return PI_ERROR_UNKNOWN;
3901  }
3902 }
3903 
3910 pi_result cuda_piextEventGetNativeHandle(pi_event event,
3911  pi_native_handle *nativeHandle) {
3912  *nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
3913  return PI_SUCCESS;
3914 }
3915 
3924 pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle,
3925  pi_context context,
3926  bool ownNativeHandle,
3927  pi_event *event) {
3928  (void)ownNativeHandle;
3929  assert(!ownNativeHandle);
3930 
3931  std::unique_ptr<_pi_event> event_ptr{nullptr};
3932 
3934  reinterpret_cast<CUevent>(nativeHandle));
3935 
3936  return PI_SUCCESS;
3937 }
3938 
3948 pi_result cuda_piSamplerCreate(pi_context context,
3949  const pi_sampler_properties *sampler_properties,
3950  pi_sampler *result_sampler) {
3951  std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)};
3952 
3953  bool propSeen[3] = {false, false, false};
3954  for (size_t i = 0; sampler_properties[i] != 0; i += 2) {
3955  switch (sampler_properties[i]) {
3956  case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS:
3957  if (propSeen[0]) {
3958  return PI_ERROR_INVALID_VALUE;
3959  }
3960  propSeen[0] = true;
3961  retImplSampl->props_ |= sampler_properties[i + 1];
3962  break;
3963  case PI_SAMPLER_PROPERTIES_FILTER_MODE:
3964  if (propSeen[1]) {
3965  return PI_ERROR_INVALID_VALUE;
3966  }
3967  propSeen[1] = true;
3968  retImplSampl->props_ |=
3969  (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1;
3970  break;
3971  case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE:
3972  if (propSeen[2]) {
3973  return PI_ERROR_INVALID_VALUE;
3974  }
3975  propSeen[2] = true;
3976  retImplSampl->props_ |=
3977  (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2;
3978  break;
3979  default:
3980  return PI_ERROR_INVALID_VALUE;
3981  }
3982  }
3983 
3984  if (!propSeen[0]) {
3985  retImplSampl->props_ |= PI_TRUE;
3986  }
3987  // Default addressing mode to PI_SAMPLER_ADDRESSING_MODE_CLAMP
3988  if (!propSeen[2]) {
3989  retImplSampl->props_ |=
3990  (PI_SAMPLER_ADDRESSING_MODE_CLAMP - PI_SAMPLER_ADDRESSING_MODE_NONE)
3991  << 2;
3992  }
3993 
3994  *result_sampler = retImplSampl.release();
3995  return PI_SUCCESS;
3996 }
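
A worked example of the props_ packing used above: bit 0 stores the
normalized-coordinates flag, bit 1 the filter mode as an offset from
PI_SAMPLER_FILTER_MODE_NEAREST, and bits 2 and up the addressing mode as an
offset from PI_SAMPLER_ADDRESSING_MODE_NONE. A minimal standalone sketch of
the same encoding, with plain integers standing in for the PI enum deltas:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t normalized = 1; // PI_TRUE
      const uint32_t filter = 1;     // e.g. LINEAR - NEAREST
      const uint32_t addressing = 3; // e.g. CLAMP_TO_EDGE - NONE

      uint32_t props = 0;
      props |= normalized;      // bit 0
      props |= filter << 1;     // bit 1
      props |= addressing << 2; // bits 2 and up

      // Decoding mirrors cuda_piSamplerGetInfo below.
      assert((props & 0x1) == normalized);
      assert(((props >> 1) & 0x1) == filter);
      assert((props >> 2) == addressing);
      return 0;
    }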
3997 
4007 pi_result cuda_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name,
4008  size_t param_value_size, void *param_value,
4009  size_t *param_value_size_ret) {
4010  assert(sampler != nullptr);
4011 
4012  switch (param_name) {
4013  case PI_SAMPLER_INFO_REFERENCE_COUNT:
4014  return getInfo(param_value_size, param_value, param_value_size_ret,
4015  sampler->get_reference_count());
4016  case PI_SAMPLER_INFO_CONTEXT:
4017  return getInfo(param_value_size, param_value, param_value_size_ret,
4018  sampler->context_);
4019  case PI_SAMPLER_INFO_NORMALIZED_COORDS: {
4020  pi_bool norm_coords_prop = static_cast<pi_bool>(sampler->props_ & 0x1);
4021  return getInfo(param_value_size, param_value, param_value_size_ret,
4022  norm_coords_prop);
4023  }
4024  case PI_SAMPLER_INFO_FILTER_MODE: {
4025  pi_sampler_filter_mode filter_prop = static_cast<pi_sampler_filter_mode>(
4026  ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST);
4027  return getInfo(param_value_size, param_value, param_value_size_ret,
4028  filter_prop);
4029  }
4030  case PI_SAMPLER_INFO_ADDRESSING_MODE: {
4031  pi_sampler_addressing_mode addressing_prop =
4032  static_cast<pi_sampler_addressing_mode>(
4033  (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE);
4034  return getInfo(param_value_size, param_value, param_value_size_ret,
4035  addressing_prop);
4036  }
4037  default:
4038  __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
4039  }
4040  return {};
4041 }
4042 
4048 pi_result cuda_piSamplerRetain(pi_sampler sampler) {
4049  assert(sampler != nullptr);
4050  sampler->increment_reference_count();
4051  return PI_SUCCESS;
4052 }
4053 
4060 pi_result cuda_piSamplerRelease(pi_sampler sampler) {
4061  assert(sampler != nullptr);
4062 
4063  // double delete or someone is messing with the ref count.
4064  // either way, cannot safely proceed.
4065  cl::sycl::detail::pi::assertion(
4066  sampler->get_reference_count() != 0,
4067  "Reference count overflow detected in cuda_piSamplerRelease.");
4068 
4069  // decrement ref count. If it is 0, delete the sampler.
4070  if (sampler->decrement_reference_count() == 0) {
4071  delete sampler;
4072  }
4073 
4074  return PI_SUCCESS;
4075 }
4076 
4082 static pi_result commonEnqueueMemBufferCopyRect(
4083  CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr,
4084  const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset,
4085  size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
4086  const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
4087  size_t dst_row_pitch, size_t dst_slice_pitch) {
4088 
4089  assert(region != nullptr);
4090  assert(src_offset != nullptr);
4091  assert(dst_offset != nullptr);
4092 
4093  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
4094  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
4095 
4096  src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes
4097  : src_row_pitch;
4098  src_slice_pitch =
4099  (!src_slice_pitch)
4100  ? ((region->height_scalar + src_offset->y_scalar) * src_row_pitch)
4101  : src_slice_pitch;
4102  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes + dst_offset->x_bytes
4103  : dst_row_pitch;
4104  dst_slice_pitch =
4105  (!dst_slice_pitch)
4106  ? ((region->height_scalar + dst_offset->y_scalar) * dst_row_pitch)
4107  : dst_slice_pitch;
4108 
4109  CUDA_MEMCPY3D params = {};
4110 
4111  params.WidthInBytes = region->width_bytes;
4112  params.Height = region->height_scalar;
4113  params.Depth = region->depth_scalar;
4114 
4115  params.srcMemoryType = src_type;
4116  params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
4117  ? *static_cast<const CUdeviceptr *>(src_ptr)
4118  : 0;
4119  params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
4120  params.srcXInBytes = src_offset->x_bytes;
4121  params.srcY = src_offset->y_scalar;
4122  params.srcZ = src_offset->z_scalar;
4123  params.srcPitch = src_row_pitch;
4124  params.srcHeight = src_slice_pitch / src_row_pitch;
4125 
4126  params.dstMemoryType = dst_type;
4127  params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
4128  ? *static_cast<CUdeviceptr *>(dst_ptr)
4129  : 0;
4130  params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
4131  params.dstXInBytes = dst_offset->x_bytes;
4132  params.dstY = dst_offset->y_scalar;
4133  params.dstZ = dst_offset->z_scalar;
4134  params.dstPitch = dst_row_pitch;
4135  params.dstHeight = dst_slice_pitch / dst_row_pitch;
4136 
4137  return PI_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
4138 }
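
When a caller passes zero pitches, the code above defaults them to a tightly
packed layout that still covers the offset: the row pitch becomes
width_bytes + x_bytes and the slice pitch becomes (height + y) * row_pitch,
so srcHeight/dstHeight (slice pitch divided by row pitch) comes out whole. A
small self-checking sketch of those rules with made-up numbers:

    #include <cassert>
    #include <cstddef>

    int main() {
      const size_t width_bytes = 64, height = 4; // copied region
      const size_t x_bytes = 16, y = 2;          // source offset
      size_t row_pitch = 0, slice_pitch = 0;     // 0 means "use default"

      row_pitch = row_pitch ? row_pitch : width_bytes + x_bytes;          // 80
      slice_pitch = slice_pitch ? slice_pitch : (height + y) * row_pitch; // 480

      assert(row_pitch == 80 && slice_pitch == 480);
      assert(slice_pitch / row_pitch == 6); // becomes CUDA_MEMCPY3D::srcHeight
      return 0;
    }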
4139 
4140 pi_result cuda_piEnqueueMemBufferReadRect(
4141  pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
4142  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4143  pi_buff_rect_region region, size_t buffer_row_pitch,
4144  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4145  void *ptr, pi_uint32 num_events_in_wait_list,
4146  const pi_event *event_wait_list, pi_event *event) {
4147 
4148  assert(buffer != nullptr);
4149  assert(command_queue != nullptr);
4150 
4151  pi_result retErr = PI_SUCCESS;
4152  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
4153  std::unique_ptr<_pi_event> retImplEv{nullptr};
4154 
4155  try {
4156  ScopedContext active(command_queue->get_context());
4157  CUstream cuStream = command_queue->get_next_transfer_stream();
4158 
4159  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4160  event_wait_list);
4161 
4162  if (event) {
4163  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4164  PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream));
4165  retImplEv->start();
4166  }
4167 
4168  retErr = commonEnqueueMemBufferCopyRect(
4169  cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4170  buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST,
4171  host_offset, host_row_pitch, host_slice_pitch);
4172 
4173  if (event) {
4174  retErr = retImplEv->record();
4175  }
4176 
4177  if (blocking_read) {
4178  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4179  }
4180 
4181  if (event) {
4182  *event = retImplEv.release();
4183  }
4184 
4185  } catch (pi_result err) {
4186  retErr = err;
4187  }
4188  return retErr;
4189 }
4190 
4191 pi_result cuda_piEnqueueMemBufferWriteRect(
4192  pi_queue command_queue, pi_mem buffer, pi_bool blocking_write,
4193  pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
4194  pi_buff_rect_region region, size_t buffer_row_pitch,
4195  size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
4196  const void *ptr, pi_uint32 num_events_in_wait_list,
4197  const pi_event *event_wait_list, pi_event *event) {
4198 
4199  assert(buffer != nullptr);
4200  assert(command_queue != nullptr);
4201 
4202  pi_result retErr = PI_SUCCESS;
4203  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
4204  std::unique_ptr<_pi_event> retImplEv{nullptr};
4205 
4206  try {
4207  ScopedContext active(command_queue->get_context());
4208  CUstream cuStream = command_queue->get_next_transfer_stream();
4209  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4210  event_wait_list);
4211 
4212  if (event) {
4213  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4214  PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream));
4215  retImplEv->start();
4216  }
4217 
4218  retErr = commonEnqueueMemBufferCopyRect(
4219  cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch,
4220  host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset,
4221  buffer_row_pitch, buffer_slice_pitch);
4222 
4223  if (event) {
4224  retErr = retImplEv->record();
4225  }
4226 
4227  if (blocking_write) {
4228  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4229  }
4230 
4231  if (event) {
4232  *event = retImplEv.release();
4233  }
4234 
4235  } catch (pi_result err) {
4236  retErr = err;
4237  }
4238  return retErr;
4239 }
4240 
4241 pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
4242  pi_mem dst_buffer, size_t src_offset,
4243  size_t dst_offset, size_t size,
4244  pi_uint32 num_events_in_wait_list,
4245  const pi_event *event_wait_list,
4246  pi_event *event) {
4247  if (!command_queue) {
4248  return PI_ERROR_INVALID_QUEUE;
4249  }
4250 
4251  std::unique_ptr<_pi_event> retImplEv{nullptr};
4252 
4253  try {
4254  ScopedContext active(command_queue->get_context());
4255  pi_result result;
4256 
4257  auto stream = command_queue->get_next_transfer_stream();
4258  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4259  event_wait_list);
4260 
4261  if (event) {
4262  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4263  PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream));
4264  result = retImplEv->start();
4265  }
4266 
4267  auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
4268  auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;
4269 
4270  result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
4271 
4272  if (event) {
4273  result = retImplEv->record();
4274  *event = retImplEv.release();
4275  }
4276 
4277  return result;
4278  } catch (pi_result err) {
4279  return err;
4280  } catch (...) {
4281  return PI_ERROR_UNKNOWN;
4282  }
4283 }
4284 
4285 pi_result cuda_piEnqueueMemBufferCopyRect(
4286  pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer,
4287  pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
4288  pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
4289  size_t dst_row_pitch, size_t dst_slice_pitch,
4290  pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
4291  pi_event *event) {
4292 
4293  assert(src_buffer != nullptr);
4294  assert(dst_buffer != nullptr);
4295  assert(command_queue != nullptr);
4296 
4297  pi_result retErr = PI_SUCCESS;
4298  CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get();
4299  CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get();
4300  std::unique_ptr<_pi_event> retImplEv{nullptr};
4301 
4302  try {
4303  ScopedContext active(command_queue->get_context());
4304  CUstream cuStream = command_queue->get_next_transfer_stream();
4305  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4306  event_wait_list);
4307 
4308  if (event) {
4309  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4310  PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream));
4311  retImplEv->start();
4312  }
4313 
4314  retErr = commonEnqueueMemBufferCopyRect(
4315  cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
4316  src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
4317  dst_origin, dst_row_pitch, dst_slice_pitch);
4318 
4319  if (event) {
4320  retImplEv->record();
4321  *event = retImplEv.release();
4322  }
4323 
4324  } catch (pi_result err) {
4325  retErr = err;
4326  }
4327  return retErr;
4328 }
4329 
4330 pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
4331  const void *pattern, size_t pattern_size,
4332  size_t offset, size_t size,
4333  pi_uint32 num_events_in_wait_list,
4334  const pi_event *event_wait_list,
4335  pi_event *event) {
4336  assert(command_queue != nullptr);
4337 
4338  auto args_are_multiples_of_pattern_size =
4339  (offset % pattern_size == 0) && (size % pattern_size == 0);
4340 
4341  auto pattern_is_valid = (pattern != nullptr);
4342 
4343  auto pattern_size_is_valid =
4344  ((pattern_size & (pattern_size - 1)) == 0) && // is power of two
4345  (pattern_size > 0) && (pattern_size <= 128); // falls within valid range
4346 
4347  assert(args_are_multiples_of_pattern_size && pattern_is_valid &&
4348  pattern_size_is_valid);
4349  (void)args_are_multiples_of_pattern_size;
4350  (void)pattern_is_valid;
4351  (void)pattern_size_is_valid;
4352 
4353  std::unique_ptr<_pi_event> retImplEv{nullptr};
4354 
4355  try {
4356  ScopedContext active(command_queue->get_context());
4357 
4358  auto stream = command_queue->get_next_transfer_stream();
4359  pi_result result;
4360  result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list,
4361  event_wait_list);
4362 
4363  if (event) {
4364  retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
4365  PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream));
4366  result = retImplEv->start();
4367  }
4368 
4369  auto dstDevice = buffer->mem_.buffer_mem_.get() + offset;
4370  auto N = size / pattern_size;
4371 
4372  // pattern size in bytes
4373  switch (pattern_size) {
4374  case 1: {
4375  auto value = *static_cast<const uint8_t *>(pattern);
4376  result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream));
4377  break;
4378  }
4379  case 2: {
4380  auto value = *static_cast<const uint16_t *>(pattern);
4381  result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream));
4382  break;
4383  }
4384  case 4: {
4385  auto value = *static_cast<const uint32_t *>(pattern);
4386  result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream));
4387  break;
4388  }
4389  default: {
4390  // CUDA has no memset functions that allow setting values more than 4
4391  // bytes. PI API lets you pass an arbitrary "pattern" to the buffer
4392  // fill, which can be more than 4 bytes. We must break up the pattern
4393  // into 4 byte values, and set the buffer using multiple strided calls.
4394  // This means that one cuMemsetD2D32Async call is made for every 4 bytes
4395  // in the pattern.
4396 
4397  auto number_of_steps = pattern_size / sizeof(uint32_t);
4398 
4399  // we walk up the pattern in 4-byte steps, and call cuMemset for each
4400  // 4-byte chunk of the pattern.
4401  for (auto step = 0u; step < number_of_steps; ++step) {
4402  // take 4 bytes of the pattern
4403  auto value = *(static_cast<const uint32_t *>(pattern) + step);
4404 
4405  // offset the pointer to the part of the buffer we want to write to
4406  auto offset_ptr = dstDevice + (step * sizeof(uint32_t));
4407 
4408  // set all of the pattern chunks
4409  result = PI_CHECK_ERROR(
4410  cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream));
4411  }
4412 
4413  break;
4414  }
4415  }
4416 
4417  if (event) {
4418  result = retImplEv->record();
4419  *event = retImplEv.release();
4420  }
4421 
4422  return result;
4423  } catch (pi_result err) {
4424  return err;
4425  } catch (...) {
4426  return PI_ERROR_UNKNOWN;
4427  }
4428 }
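
For patterns wider than 4 bytes, the fill above issues one strided
cuMemsetD2D32Async call per 32-bit chunk of the pattern. A minimal sketch of
the same decomposition for an assumed 8-byte pattern; fill8 is a hypothetical
helper that expects an active CUDA context and elides size validation:

    #include <cuda.h>
    #include <cstdint>
    #include <cstring>

    // Writes `n` copies of an 8-byte pattern at `dst` using two strided
    // 32-bit memsets: pitch = pattern size, width = 1 element, height = n,
    // so each call touches one 32-bit chunk of every pattern instance.
    static CUresult fill8(CUdeviceptr dst, const uint8_t pattern[8], size_t n,
                          CUstream stream) {
      for (unsigned step = 0; step < 2; ++step) {
        uint32_t word;
        std::memcpy(&word, pattern + step * sizeof(uint32_t), sizeof(word));
        CUresult err = cuMemsetD2D32Async(dst + step * sizeof(uint32_t),
                                          /*dstPitch=*/8, word,
                                          /*Width=*/1, /*Height=*/n, stream);
        if (err != CUDA_SUCCESS)
          return err;
      }
      return CUDA_SUCCESS;
    }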
4429 
4430 static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) {
4431  switch (array_desc.Format) {
4432  case CU_AD_FORMAT_UNSIGNED_INT8:
4433  case CU_AD_FORMAT_SIGNED_INT8:
4434  return 1;
4435  case CU_AD_FORMAT_UNSIGNED_INT16:
4436  case CU_AD_FORMAT_SIGNED_INT16:
4437  case CU_AD_FORMAT_HALF:
4438  return 2;
4439  case CU_AD_FORMAT_UNSIGNED_INT32:
4440  case CU_AD_FORMAT_SIGNED_INT32:
4441  case CU_AD_FORMAT_FLOAT:
4442  return 4;
4443  default:
4444  cl::sycl::detail::pi::die("Invalid image format.");
4445  return 0;
4446  }
4447 }
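
The image read/write/copy paths below use imageElementByteSize to convert
texel coordinates into the byte coordinates CUDA expects: the X offset is
origin[0] * elementByteSize * NumChannels and the copied width is
elementByteSize * NumChannels * region[0]. A worked example for an assumed
4-channel CU_AD_FORMAT_FLOAT image (16 bytes per texel):

    #include <cassert>
    #include <cstddef>

    int main() {
      const size_t elementByteSize = 4; // CU_AD_FORMAT_FLOAT
      const size_t numChannels = 4;
      const size_t originX = 10, regionX = 32; // in texels

      size_t byteOffsetX = originX * elementByteSize * numChannels;
      size_t bytesToCopy = elementByteSize * numChannels * regionX;

      assert(byteOffsetX == 160 && bytesToCopy == 512);
      return 0;
    }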
4448 
4454 static pi_result commonEnqueueMemImageNDCopy(
4455  CUstream cu_stream, pi_mem_type img_type, const size_t *region,
4456  const void *src_ptr, const CUmemorytype_enum src_type,
4457  const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type,
4458  const size_t *dst_offset) {
4459  assert(region != nullptr);
4460 
4461  assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
4462  assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
4463 
4464  if (img_type == PI_MEM_TYPE_IMAGE2D) {
4465  CUDA_MEMCPY2D cpyDesc;
4466  memset(&cpyDesc, 0, sizeof(cpyDesc));
4467  cpyDesc.srcMemoryType = src_type;
4468  if (src_type == CU_MEMORYTYPE_ARRAY) {
4469  cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
4470  cpyDesc.srcXInBytes = src_offset[0];
4471  cpyDesc.srcY = src_offset[1];
4472  } else {
4473  cpyDesc.srcHost = src_ptr;
4474  }
4475  cpyDesc.dstMemoryType = dst_type;
4476  if (dst_type == CU_MEMORYTYPE_ARRAY) {
4477  cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
4478  cpyDesc.dstXInBytes = dst_offset[0];
4479  cpyDesc.dstY = dst_offset[1];
4480  } else {
4481  cpyDesc.dstHost = dst_ptr;
4482  }
4483  cpyDesc.WidthInBytes = region[0];
4484  cpyDesc.Height = region[1];
4485  return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
4486  }
4487  if (img_type == PI_MEM_TYPE_IMAGE3D) {
4488  CUDA_MEMCPY3D cpyDesc;
4489  memset(&cpyDesc, 0, sizeof(cpyDesc));
4490  cpyDesc.srcMemoryType = src_type;
4491  if (src_type == CU_MEMORYTYPE_ARRAY) {
4492  cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
4493  cpyDesc.srcXInBytes = src_offset[0];
4494  cpyDesc.srcY = src_offset[1];
4495  cpyDesc.srcZ = src_offset[2];
4496  } else {
4497  cpyDesc.srcHost = src_ptr;
4498  }
4499  cpyDesc.dstMemoryType = dst_type;
4500  if (dst_type == CU_MEMORYTYPE_ARRAY) {
4501  cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
4502  cpyDesc.dstXInBytes = dst_offset[0];
4503  cpyDesc.dstY = dst_offset[1];
4504  cpyDesc.dstZ = dst_offset[2];
4505  } else {
4506  cpyDesc.dstHost = dst_ptr;
4507  }
4508  cpyDesc.WidthInBytes = region[0];
4509  cpyDesc.Height = region[1];
4510  cpyDesc.Depth = region[2];
4511  return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
4512  }
4513  return PI_ERROR_INVALID_VALUE;
4514 }
4515 
4516 pi_result cuda_piEnqueueMemImageRead(
4517  pi_queue command_queue, pi_mem image, pi_bool blocking_read,
4518  const size_t *origin, const size_t *region, size_t row_pitch,
4519  size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list,
4520  const pi_event *event_wait_list, pi_event *event) {
4521  // Ignore unused parameters
4522  (void)row_pitch;
4523  (void)slice_pitch;
4524 
4525  assert(command_queue != nullptr);
4526  assert(image != nullptr);
4527  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4528 
4529  pi_result retErr = PI_SUCCESS;
4530 
4531  try {
4532  ScopedContext active(command_queue->get_context());
4533  CUstream cuStream = command_queue->get_next_transfer_stream();
4534  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4535  event_wait_list);
4536 
4537  CUarray array = image->mem_.surface_mem_.get_array();
4538 
4539  CUDA_ARRAY_DESCRIPTOR arrayDesc;
4540  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4541 
4542  int elementByteSize = imageElementByteSize(arrayDesc);
4543 
4544  size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4545  size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4546 
4547  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4548  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4549  retErr = PI_CHECK_ERROR(
4550  cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream));
4551  } else {
4552  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4553  size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
4554 
4555  retErr = commonEnqueueMemImageNDCopy(
4556  cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY,
4557  srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr);
4558 
4559  if (retErr != PI_SUCCESS) {
4560  return retErr;
4561  }
4562  }
4563 
4564  if (event) {
4565  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ,
4566  command_queue, cuStream);
4567  new_event->record();
4568  *event = new_event;
4569  }
4570 
4571  if (blocking_read) {
4572  retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
4573  }
4574  } catch (pi_result err) {
4575  return err;
4576  } catch (...) {
4577  return PI_ERROR_UNKNOWN;
4578  }
4579 
4580  return retErr;
4581 }
4582 
4583 pi_result
4584 cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image,
4585  pi_bool blocking_write, const size_t *origin,
4586  const size_t *region, size_t input_row_pitch,
4587  size_t input_slice_pitch, const void *ptr,
4588  pi_uint32 num_events_in_wait_list,
4589  const pi_event *event_wait_list, pi_event *event) {
4590  // Ignore unused parameters
4591  (void)blocking_write;
4592  (void)input_row_pitch;
4593  (void)input_slice_pitch;
4594 
4595  assert(command_queue != nullptr);
4596  assert(image != nullptr);
4597  assert(image->mem_type_ == _pi_mem::mem_type::surface);
4598 
4599  pi_result retErr = PI_SUCCESS;
4600 
4601  try {
4602  ScopedContext active(command_queue->get_context());
4603  CUstream cuStream = command_queue->get_next_transfer_stream();
4604  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4605  event_wait_list);
4606 
4607  CUarray array = image->mem_.surface_mem_.get_array();
4608 
4609  CUDA_ARRAY_DESCRIPTOR arrayDesc;
4610  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
4611 
4612  int elementByteSize = imageElementByteSize(arrayDesc);
4613 
4614  size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
4615  size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
4616 
4617  pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
4618  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4619  retErr = PI_CHECK_ERROR(
4620  cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream));
4621  } else {
4622  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4623  size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
4624 
4625  retErr = commonEnqueueMemImageNDCopy(
4626  cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr,
4627  &array, CU_MEMORYTYPE_ARRAY, dstOffset);
4628 
4629  if (retErr != PI_SUCCESS) {
4630  return retErr;
4631  }
4632  }
4633 
4634  if (event) {
4635  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE,
4636  command_queue, cuStream);
4637  new_event->record();
4638  *event = new_event;
4639  }
4640  } catch (pi_result err) {
4641  return err;
4642  } catch (...) {
4643  return PI_ERROR_UNKNOWN;
4644  }
4645 
4646  return retErr;
4647 }
4648 
4649 pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
4650  pi_mem dst_image, const size_t *src_origin,
4651  const size_t *dst_origin,
4652  const size_t *region,
4653  pi_uint32 num_events_in_wait_list,
4654  const pi_event *event_wait_list,
4655  pi_event *event) {
4656  assert(src_image->mem_type_ == _pi_mem::mem_type::surface);
4657  assert(dst_image->mem_type_ == _pi_mem::mem_type::surface);
4658  assert(src_image->mem_.surface_mem_.get_image_type() ==
4659  dst_image->mem_.surface_mem_.get_image_type());
4660 
4661  pi_result retErr = PI_SUCCESS;
4662 
4663  try {
4664  ScopedContext active(command_queue->get_context());
4665  CUstream cuStream = command_queue->get_next_transfer_stream();
4666  retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list,
4667  event_wait_list);
4668 
4669  CUarray srcArray = src_image->mem_.surface_mem_.get_array();
4670  CUarray dstArray = dst_image->mem_.surface_mem_.get_array();
4671 
4672  CUDA_ARRAY_DESCRIPTOR srcArrayDesc;
4673  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray));
4674  CUDA_ARRAY_DESCRIPTOR dstArrayDesc;
4675  retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray));
4676 
4677  assert(srcArrayDesc.Format == dstArrayDesc.Format);
4678  assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels);
4679 
4680  int elementByteSize = imageElementByteSize(srcArrayDesc);
4681 
4682  size_t dstByteOffsetX =
4683  dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels;
4684  size_t srcByteOffsetX =
4685  src_origin[0] * elementByteSize * dstArrayDesc.NumChannels;
4686  size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
4687 
4688  pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
4689  if (imgType == PI_MEM_TYPE_IMAGE1D) {
4690  retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
4691  srcByteOffsetX, bytesToCopy));
4692  } else {
4693  size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
4694  size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
4695  size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
4696 
4697  retErr = commonEnqueueMemImageNDCopy(
4698  cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
4699  srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
4700 
4701  if (retErr != PI_SUCCESS) {
4702  return retErr;
4703  }
4704  }
4705 
4706  if (event) {
4707  auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY,
4708  command_queue, cuStream);
4709  new_event->record();
4710  *event = new_event;
4711  }
4712  } catch (pi_result err) {
4713  return err;
4714  } catch (...) {
4715  return PI_ERROR_UNKNOWN;
4716  }
4717 
4718  return retErr;
4719 }
4720 
4722 pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *,
4723  const size_t *, const size_t *, pi_uint32,
4724  const pi_event *, pi_event *) {
4725  cl::sycl::detail::pi::die("cuda_piEnqueueMemImageFill not implemented");
4726  return {};
4727 }
4728 
4734 pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
4735  pi_bool blocking_map,
4736  pi_map_flags map_flags, size_t offset,
4737  size_t size,
4738  pi_uint32 num_events_in_wait_list,
4739  const pi_event *event_wait_list,
4740  pi_event *event, void **ret_map) {
4741  assert(ret_map != nullptr);
4742  assert(command_queue != nullptr);
4743  assert(buffer != nullptr);
4744  assert(buffer->mem_type_ == _pi_mem::mem_type::buffer);
4745 
4746  pi_result ret_err = PI_ERROR_INVALID_OPERATION;
4747  const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ ==
4749 
4750  // Currently no support for overlapping regions
4751  if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) {
4752  return ret_err;
4753  }
4754 
4755  // Allocate a pointer on the host to store the mapped information
4756  auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags);
4757  *ret_map = buffer->mem_.buffer_mem_.get_map_ptr();
4758  if (hostPtr) {
4759  ret_err = PI_SUCCESS;
4760  }
4761 
4762  if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) {
4763  // Pinned host memory is already on host so it doesn't need to be read.
4764  ret_err = cuda_piEnqueueMemBufferRead(
4765  command_queue, buffer, blocking_map, offset, size, hostPtr,
4766  num_events_in_wait_list, event_wait_list, event);
4767  } else {
4768  ScopedContext active(command_queue->get_context());
4769 
4770  if (is_pinned) {
4771  ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4772  event_wait_list, nullptr);
4773  }
4774 
4775  if (event) {
4776  try {
4777  *event = _pi_event::make_native(
4778  PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue,
4779  command_queue->get_next_transfer_stream());
4780  (*event)->start();
4781  (*event)->record();
4782  } catch (pi_result error) {
4783  ret_err = error;
4784  }
4785  }
4786  }
4787 
4788  return ret_err;
4789 }
4790 
4795 pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
4796  void *mapped_ptr,
4797  pi_uint32 num_events_in_wait_list,
4798  const pi_event *event_wait_list,
4799  pi_event *event) {
4800  pi_result ret_err = PI_SUCCESS;
4801 
4802  assert(command_queue != nullptr);
4803  assert(mapped_ptr != nullptr);
4804  assert(memobj != nullptr);
4805  assert(memobj->mem_type_ == _pi_mem::mem_type::buffer);
4806  assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr);
4807  assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr);
4808 
4809  const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ ==
4810  _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
4811 
4812  if (!is_pinned &&
4813  ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) ||
4814  (memobj->mem_.buffer_mem_.get_map_flags() &
4815  PI_MAP_WRITE_INVALIDATE_REGION))) {
4816  // Pinned host memory is only on host so it doesn't need to be written to.
4817  ret_err = cuda_piEnqueueMemBufferWrite(
4818  command_queue, memobj, true,
4819  memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
4820  memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
4821  num_events_in_wait_list, event_wait_list, event);
4822  } else {
4823  ScopedContext active(command_queue->get_context());
4824 
4825  if (is_pinned) {
4826  ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
4827  event_wait_list, nullptr);
4828  }
4829 
4830  if (event) {
4831  try {
4832  *event = _pi_event::make_native(
4833  PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue,
4834  command_queue->get_next_transfer_stream());
4835  (*event)->start();
4836  (*event)->record();
4837  } catch (pi_result error) {
4838  ret_err = error;
4839  }
4840  }
4841  }
4842 
4843  memobj->mem_.buffer_mem_.unmap(mapped_ptr);
4844  return ret_err;
4845 }
4846 
4849 pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
4850  pi_usm_mem_properties *properties, size_t size,
4851  pi_uint32 alignment) {
4852  assert(result_ptr != nullptr);
4853  assert(context != nullptr);
4854  assert(properties == nullptr || *properties == 0);
4855  pi_result result = PI_SUCCESS;
4856  try {
4857  ScopedContext active(context);
4858  result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size));
4859  } catch (pi_result error) {
4860  result = error;
4861  }
4862 
4863  assert(alignment == 0 ||
4864  (result == PI_SUCCESS &&
4865  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4866  return result;
4867 }
4868 
4871 pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
4872  pi_device device,
4873  pi_usm_mem_properties *properties,
4874  size_t size, pi_uint32 alignment) {
4875  assert(result_ptr != nullptr);
4876  assert(context != nullptr);
4877  assert(device != nullptr);
4878  assert(properties == nullptr || *properties == 0);
4879  pi_result result = PI_SUCCESS;
4880  try {
4881  ScopedContext active(context);
4882  result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size));
4883  } catch (pi_result error) {
4884  result = error;
4885  }
4886 
4887  assert(alignment == 0 ||
4888  (result == PI_SUCCESS &&
4889  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4890  return result;
4891 }
4892 
4895 pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
4896  pi_device device,
4897  pi_usm_mem_properties *properties,
4898  size_t size, pi_uint32 alignment) {
4899  assert(result_ptr != nullptr);
4900  assert(context != nullptr);
4901  assert(device != nullptr);
4902  assert(properties == nullptr || *properties == 0);
4903  pi_result result = PI_SUCCESS;
4904  try {
4905  ScopedContext active(context);
4906  result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size,
4907  CU_MEM_ATTACH_GLOBAL));
4908  } catch (pi_result error) {
4909  result = error;
4910  }
4911 
4912  assert(alignment == 0 ||
4913  (result == PI_SUCCESS &&
4914  reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0));
4915  return result;
4916 }
4917 
4920 pi_result cuda_piextUSMFree(pi_context context, void *ptr) {
4921  assert(context != nullptr);
4922  pi_result result = PI_SUCCESS;
4923  try {
4924  ScopedContext active(context);
4925  bool is_managed;
4926  unsigned int type;
4927  void *attribute_values[2] = {&is_managed, &type};
4928  CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
4929  CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
4930  result = PI_CHECK_ERROR(cuPointerGetAttributes(
4931  2, attributes, attribute_values, (CUdeviceptr)ptr));
4932  assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST);
4933  if (is_managed || type == CU_MEMORYTYPE_DEVICE) {
4934  // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
4935  // with cuMemFree
4936  result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr));
4937  } else {
4938  // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
4939  result = PI_CHECK_ERROR(cuMemFreeHost(ptr));
4940  }
4941  } catch (pi_result error) {
4942  result = error;
4943  }
4944  return result;
4945 }
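
Freeing must match the allocator: allocations from cuMemAlloc or
cuMemAllocManaged go to cuMemFree, while cuMemAllocHost allocations go to
cuMemFreeHost, which is why the code above inspects the pointer first. A
minimal sketch of the same query; needsCuMemFree is a hypothetical helper and
assumes an active CUDA context:

    #include <cuda.h>

    static bool needsCuMemFree(CUdeviceptr ptr) {
      bool is_managed = false;
      unsigned int mem_type = 0;
      void *values[2] = {&is_managed, &mem_type};
      CUpointer_attribute attrs[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
                                      CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
      if (cuPointerGetAttributes(2, attrs, values, ptr) != CUDA_SUCCESS)
        return false; // pointer unknown to the driver; caller decides
      return is_managed || mem_type == CU_MEMORYTYPE_DEVICE;
    }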
4946 
4947 pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
4948  size_t count,
4949  pi_uint32 num_events_in_waitlist,
4950  const pi_event *events_waitlist,
4951  pi_event *event) {
4952  assert(queue != nullptr);
4953  assert(ptr != nullptr);
4954  pi_result result = PI_SUCCESS;
4955  std::unique_ptr<_pi_event> event_ptr{nullptr};
4956 
4957  try {
4958  ScopedContext active(queue->get_context());
4959  pi_uint32 stream_token;
4960  _pi_stream_guard guard;
4961  CUstream cuStream = queue->get_next_compute_stream(
4962  num_events_in_waitlist, events_waitlist, guard, &stream_token);
4963  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
4964  events_waitlist);
4965  if (event) {
4966  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
4967  PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream, stream_token));
4968  event_ptr->start();
4969  }
4970  result = PI_CHECK_ERROR(cuMemsetD8Async(
4971  (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream));
4972  if (event) {
4973  result = event_ptr->record();
4974  *event = event_ptr.release();
4975  }
4976  } catch (pi_result err) {
4977  result = err;
4978  }
4979  return result;
4980 }
4981 
4982 pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking,
4983  void *dst_ptr, const void *src_ptr,
4984  size_t size,
4985  pi_uint32 num_events_in_waitlist,
4986  const pi_event *events_waitlist,
4987  pi_event *event) {
4988  assert(queue != nullptr);
4989  assert(dst_ptr != nullptr);
4990  assert(src_ptr != nullptr);
4991  pi_result result = PI_SUCCESS;
4992 
4993  std::unique_ptr<_pi_event> event_ptr{nullptr};
4994 
4995  try {
4996  ScopedContext active(queue->get_context());
4997  CUstream cuStream = queue->get_next_transfer_stream();
4998  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
4999  events_waitlist);
5000  if (event) {
5001  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5002  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
5003  event_ptr->start();
5004  }
5005  result = PI_CHECK_ERROR(cuMemcpyAsync(
5006  (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream));
5007  if (event) {
5008  result = event_ptr->record();
5009  }
5010  if (blocking) {
5011  result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
5012  }
5013  if (event) {
5014  *event = event_ptr.release();
5015  }
5016  } catch (pi_result err) {
5017  result = err;
5018  }
5019  return result;
5020 }
5021 
5022 pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
5023  size_t size,
5024  pi_usm_migration_flags flags,
5025  pi_uint32 num_events_in_waitlist,
5026  const pi_event *events_waitlist,
5027  pi_event *event) {
5028  pi_device device = queue->get_context()->get_device();
5029 
5030  // Certain cuda devices and Windows do not have support for some Unified
5031  // Memory features. cuMemPrefetchAsync requires concurrent memory access
5032  // for managed memory. Therefore, ignore prefetch hint if concurrent managed
5033  // memory access is not available.
5034  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5035  setErrorMessage("Prefetch hint ignored as device does not support "
5036  "concurrent managed access",
5037  PI_SUCCESS);
5038  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5039  }
5040 
5041  unsigned int is_managed;
5042  PI_CHECK_ERROR(cuPointerGetAttribute(
5043  &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr));
5044  if (!is_managed) {
5045  setErrorMessage("Prefetch hint ignored as prefetch only works with USM",
5046  PI_SUCCESS);
5047  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5048  }
5049 
5050  // flags is currently unused so fail if set
5051  if (flags != 0)
5052  return PI_ERROR_INVALID_VALUE;
5053  assert(queue != nullptr);
5054  assert(ptr != nullptr);
5055  pi_result result = PI_SUCCESS;
5056  std::unique_ptr<_pi_event> event_ptr{nullptr};
5057 
5058  try {
5059  ScopedContext active(queue->get_context());
5060  CUstream cuStream = queue->get_next_transfer_stream();
5061  result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
5062  events_waitlist);
5063  if (event) {
5064  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5065  PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
5066  event_ptr->start();
5067  }
5068  result = PI_CHECK_ERROR(
5069  cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream));
5070  if (event) {
5071  result = event_ptr->record();
5072  *event = event_ptr.release();
5073  }
5074  } catch (pi_result err) {
5075  result = err;
5076  }
5077  return result;
5078 }
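
The prefetch above is gated on CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
because cuMemPrefetchAsync is only useful when the device can access managed
memory concurrently with the host. A minimal sketch of that capability check;
supportsConcurrentManagedAccess is a hypothetical helper and assumes the
driver is initialized:

    #include <cuda.h>

    static bool supportsConcurrentManagedAccess(CUdevice dev) {
      int value = 0;
      if (cuDeviceGetAttribute(&value,
                               CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
                               dev) != CUDA_SUCCESS)
        return false;
      return value != 0;
    }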
5079 
5081 pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
5082  size_t length, pi_mem_advice advice,
5083  pi_event *event) {
5084  assert(queue != nullptr);
5085  assert(ptr != nullptr);
5086 
5087  // Certain cuda devices and Windows do not have support for some Unified
5088  // Memory features. Passing CU_MEM_ADVISE_[UN]SET_PREFERRED_LOCATION and
5089  // CU_MEM_ADVISE_[UN]SET_ACCESSED_BY to cuMemAdvise on a GPU device requires
5090  // the GPU device to report a non-zero value for
5091  // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore memory
5092  // advise if concurrent managed memory access is not available.
5093  if ((advice == PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION) ||
5094  (advice == PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION) ||
5095  (advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY) ||
5096  (advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY)) {
5097  pi_device device = queue->get_context()->get_device();
5098  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
5099  setErrorMessage("Mem advise ignored as device does not support "
5100  "concurrent managed access",
5101  PI_SUCCESS);
5102  return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
5103  }
5104 
5105  // TODO: If ptr points to valid system-allocated pageable memory we should
5106  // check that the device also has the
5107  // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
5108  }
5109 
5110  pi_result result = PI_SUCCESS;
5111  std::unique_ptr<_pi_event> event_ptr{nullptr};
5112 
5113  try {
5114  ScopedContext active(queue->get_context());
5115 
5116  if (event) {
5117  event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
5118  PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream()));
5119  event_ptr->start();
5120  }
5121 
5122  switch (advice) {
5123  case PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY:
5124  case PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY:
5125  case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION:
5126  case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION:
5127  case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY:
5128  case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY:
5129  result = PI_CHECK_ERROR(cuMemAdvise(
5130  (CUdeviceptr)ptr, length,
5131  (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1),
5132  queue->get_context()->get_device()->get()));
5133  break;
5134  case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST:
5135  case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST:
5136  case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST:
5137  case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST:
5138  result = PI_CHECK_ERROR(cuMemAdvise(
5139  (CUdeviceptr)ptr, length,
5140  (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1 -
5141  (PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST -
5142  PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION)),
5143  CU_DEVICE_CPU));
5144  break;
5145  default:
5146  cl::sycl::detail::pi::die("Unknown advice");
5147  }
5148  if (event) {
5149  result = event_ptr->record();
5150  *event = event_ptr.release();
5151  }
5152  } catch (pi_result err) {
5153  result = err;
5154  } catch (...) {
5155  result = PI_ERROR_UNKNOWN;
5156  }
5157  return result;
5158 }
5159 
5176 pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
5177  pi_mem_alloc_info param_name,
5178  size_t param_value_size,
5179  void *param_value,
5180  size_t *param_value_size_ret) {
5181  assert(context != nullptr);
5182  assert(ptr != nullptr);
5183  pi_result result = PI_SUCCESS;
5184 
5185  try {
5186  ScopedContext active(context);
5187  switch (param_name) {
5188  case PI_MEM_ALLOC_TYPE: {
5189  unsigned int value;
5190  // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE
5191  CUresult ret = cuPointerGetAttribute(
5192  &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
5193  if (ret == CUDA_ERROR_INVALID_VALUE) {
5194  // pointer not known to the CUDA subsystem
5195  return getInfo(param_value_size, param_value, param_value_size_ret,
5196  PI_MEM_TYPE_UNKNOWN);
5197  }
5198  result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
5199  if (value) {
5200  // pointer to managed memory
5201  return getInfo(param_value_size, param_value, param_value_size_ret,
5202  PI_MEM_TYPE_SHARED);
5203  }
5204  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5205  &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr));
5206  assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST);
5207  if (value == CU_MEMORYTYPE_DEVICE) {
5208  // pointer to device memory
5209  return getInfo(param_value_size, param_value, param_value_size_ret,
5210  PI_MEM_TYPE_DEVICE);
5211  }
5212  if (value == CU_MEMORYTYPE_HOST) {
5213  // pointer to host memory
5214  return getInfo(param_value_size, param_value, param_value_size_ret,
5215  PI_MEM_TYPE_HOST);
5216  }
5217  // should never get here
5218 #ifdef _MSC_VER
5219  __assume(0);
5220 #else
5221  __builtin_unreachable();
5222 #endif
5223  return getInfo(param_value_size, param_value, param_value_size_ret,
5224  PI_MEM_TYPE_UNKNOWN);
5225  }
5226  case PI_MEM_ALLOC_BASE_PTR: {
5227 #if __CUDA_API_VERSION >= 10020
5228  // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2
5229  unsigned int value;
5230  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5231  &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr));
5232  return getInfo(param_value_size, param_value, param_value_size_ret,
5233  value);
5234 #else
5235  return PI_ERROR_INVALID_VALUE;
5236 #endif
5237  }
5238  case PI_MEM_ALLOC_SIZE: {
5239 #if __CUDA_API_VERSION >= 10020
5240  // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2
5241  unsigned int value;
5242  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5243  &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr));
5244  return getInfo(param_value_size, param_value, param_value_size_ret,
5245  value);
5246 #else
5247  return PI_ERROR_INVALID_VALUE;
5248 #endif
5249  }
5250  case PI_MEM_ALLOC_DEVICE: {
5251  // get device index associated with this pointer
5252  unsigned int device_idx;
5253  result = PI_CHECK_ERROR(cuPointerGetAttribute(
5254  &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr));
5255 
5256  // currently each device is in its own platform, so find the platform at
5257  // the same index
5258  std::vector<pi_platform> platforms;
5259  platforms.resize(device_idx + 1);
5260  result = cuda_piPlatformsGet(device_idx + 1, platforms.data(), nullptr);
5261 
5262  // get the device from the platform
5263  pi_device device = platforms[device_idx]->devices_[0].get();
5264  return getInfo(param_value_size, param_value, param_value_size_ret,
5265  device);
5266  }
5267  }
5268  } catch (pi_result error) {
5269  result = error;
5270  }
5271  return result;
5272 }
5273 
5274 // This API is called by the SYCL runtime to notify the end of the plugin
5275 // lifetime. TODO: add global variable lifetime management code here (see
5276 // pi_level_zero.cpp for reference). Currently this is just a NOOP.
5277 pi_result cuda_piTearDown(void *) { return PI_SUCCESS; }
5278 
5279 const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING;
5280 
5281 pi_result piPluginInit(pi_plugin *PluginInit) {
5282  // Check that the major version matches in PiVersion and SupportedVersion
5283  _PI_PLUGIN_VERSION_CHECK(PluginInit->PiVersion, SupportedVersion);
5284 
5285  // PI interface supports higher version or the same version.
5286  size_t PluginVersionSize = sizeof(PluginInit->PluginVersion);
5287  if (strlen(SupportedVersion) >= PluginVersionSize)
5288  return PI_ERROR_INVALID_VALUE;
5289  strncpy(PluginInit->PluginVersion, SupportedVersion, PluginVersionSize);
5290 
5291  // Set whole function table to zero to make it easier to detect if
5292  // functions are not set up below.
5293  std::memset(&(PluginInit->PiFunctionTable), 0,
5294  sizeof(PluginInit->PiFunctionTable));
5295 
5296 // Forward calls to CUDA RT.
5297 #define _PI_CL(pi_api, cuda_api) \
5298  (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api);
5299 
5300  // Platform
5303  // Device
5314  // Context
5323  // Queue
5333  // Memory
5343  // Program
5357  // Kernel
5371  // Event
5383  // Sampler
5388  // Queue commands
5406  // USM
5416 
5419  _PI_CL(piPluginGetLastError, cuda_piPluginGetLastError)
5420  _PI_CL(piTearDown, cuda_piTearDown)
5421 
5422 #undef _PI_CL
5423 
5424  return PI_SUCCESS;
5425 }
5426 
5427 } // extern "C"
5428 
Definition: pi_cuda.hpp:227
PI_DEVICE_INFO_REFERENCE_COUNT
@ PI_DEVICE_INFO_REFERENCE_COUNT
Definition: pi.h:235
_pi_device::get_max_work_group_size
int get_max_work_group_size() const noexcept
Definition: pi_cuda.hpp:118
cuda_piextUSMFree
pi_result cuda_piextUSMFree(pi_context context, void *ptr)
USM: Frees the given USM pointer associated with the context.
Definition: pi_cuda.cpp:4920
PI_MEMORY_SCOPE_WORK_ITEM
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_ITEM
Definition: pi.h:507
cl::sycl::ext::intel::experimental::esimd::line
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value, sycl::ext::intel::esimd::simd< T, SZ > > line(sycl::ext::intel::esimd::simd< T, 4 > src0, sycl::ext::intel::esimd::simd< T, SZ > src1, Sat sat={})
Linear equation.
Definition: math.hpp:900
PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
@ PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT
Definition: pi.h:257
cuda_piextDeviceCreateWithNativeHandle
pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, pi_platform platform, pi_device *piDevice)
Creates a PI device object from a CUDA device handle.
Definition: pi_cuda.cpp:1928
_pi_mem::is_buffer
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:373
PI_DEVICE_INFO_PLATFORM
@ PI_DEVICE_INFO_PLATFORM
Definition: pi.h:234
_pi_queue::backend_has_ownership
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:583
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
@ PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
Definition: pi.h:267
cuda_definitions.hpp
cuda_piMemGetInfo
pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *)
Definition: pi_cuda.cpp:2378
PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
@ PI_DEVICE_LOCAL_MEM_TYPE_LOCAL
Definition: pi.h:170
PI_PROGRAM_BUILD_INFO_OPTIONS
@ PI_PROGRAM_BUILD_INFO_OPTIONS
Definition: pi.h:130
PI_PROGRAM_BUILD_STATUS_SUCCESS
@ PI_PROGRAM_BUILD_STATUS_SUCCESS
Definition: pi.h:138
piextDeviceCreateWithNativeHandle
pi_result piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, pi_platform platform, pi_device *device)
Creates a PI device object from a native handle.
Definition: pi_esimd_emulator.cpp:822
sycl
Definition: invoke_simd.hpp:68
_pi_event::get_stream
CUstream get_stream() const noexcept
Definition: pi_cuda.hpp:604
piKernelGetSubGroupInfo
pi_result piKernelGetSubGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
API to query sub-group information from a kernel.
Definition: pi_esimd_emulator.cpp:1369
commonEnqueueMemBufferCopyRect
static pi_result commonEnqueueMemBufferCopyRect(CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch)
General 3D memory copy operation.
Definition: pi_cuda.cpp:4082
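A general rect copy reduces to a single CUDA_MEMCPY3D descriptor. A simplified device-to-host sketch (origin offsets elided; pitches assumed to describe valid sub-regions of both allocations):

#include <cuda.h>

static CUresult copyRectD2H(CUstream stream, CUdeviceptr src, void *dst,
                            size_t width_bytes, size_t height, size_t depth,
                            size_t src_pitch, size_t dst_pitch) {
  CUDA_MEMCPY3D p = {};
  p.srcMemoryType = CU_MEMORYTYPE_DEVICE;
  p.srcDevice = src;
  p.srcPitch = src_pitch; // bytes per row in the source
  p.srcHeight = height;   // rows per slice in the source
  p.dstMemoryType = CU_MEMORYTYPE_HOST;
  p.dstHost = dst;
  p.dstPitch = dst_pitch;
  p.dstHeight = height;
  p.WidthInBytes = width_bytes;
  p.Height = height;
  p.Depth = depth;
  return cuMemcpy3DAsync(&p, stream);
}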
PI_DEVICE_INFO_ADDRESS_BITS
@ PI_DEVICE_INFO_ADDRESS_BITS
Definition: pi.h:200
_pi_mem::mem_::surface_mem_::get_array
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:322
max
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
_pi_image_desc::image_height
size_t image_height
Definition: pi.h:899
cuda_piDeviceGetInfo
pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:1099
_pi_platform
A PI platform stores all known PI devices; in the CUDA plugin this is just a vector of available devices.
Definition: pi_cuda.hpp:72
cl::sycl::host_ptr
multi_ptr< ElementType, access::address_space::ext_intel_global_host_space > host_ptr
Definition: pointers.hpp:33
PI_CONTEXT_INFO_NUM_DEVICES
@ PI_CONTEXT_INFO_NUM_DEVICES
Definition: pi.h:301
PI_DEVICE_TYPE_DEFAULT
@ PI_DEVICE_TYPE_DEFAULT
The default device available in the PI plugin.
Definition: pi.h:153
_pi_queue::can_reuse_stream
bool can_reuse_stream(pi_uint32 stream_token)
Definition: pi_cuda.hpp:466
PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
@ PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
Definition: pi.h:332
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE
Definition: pi.h:190
PI_DEVICE_INFO_MAX_SAMPLERS
@ PI_DEVICE_INFO_MAX_SAMPLERS
Definition: pi.h:212
PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
@ PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
Definition: pi.h:331
piEnqueueMemImageFill
pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, const size_t *origin, const size_t *region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1746
PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS
@ PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS
Definition: pi.h:251
cl::sycl::malloc
void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind, const detail::code_location CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:410
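A short usage sketch of this entry point, assuming any USM-capable device:

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue q; // device with USM device-allocation support assumed
  constexpr size_t n = 1024;
  int *data = static_cast<int *>(
      cl::sycl::malloc(n * sizeof(int), q.get_device(), q.get_context(),
                       cl::sycl::usm::alloc::device));
  if (data != nullptr) {
    q.fill(data, 0, n).wait(); // device-side initialization
    cl::sycl::free(data, q.get_context());
  }
  return 0;
}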
PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN
@ PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN
Definition: pi.h:214
piSamplerCreate
pi_result piSamplerCreate(pi_context context, const pi_sampler_properties *sampler_properties, pi_sampler *result_sampler)
Definition: pi_esimd_emulator.cpp:1462
piextEventCreateWithNativeHandle
pi_result piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_event *event)
Creates a PI event object from a native handle.
Definition: pi_esimd_emulator.cpp:1458
PI_DEVICE_INFO_BUILT_IN_KERNELS
@ PI_DEVICE_INFO_BUILT_IN_KERNELS
Definition: pi.h:233
cuda_piextGetDeviceFunctionPointer
pi_result cuda_piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *func_name, pi_uint64 *func_pointer_ret)
Definition: pi_cuda.cpp:1072
_pi_sampler_addressing_mode
_pi_sampler_addressing_mode
Definition: pi.h:472
piKernelRelease
pi_result piKernelRelease(pi_kernel kernel)
Definition: pi_esimd_emulator.cpp:1377
cuda_piextUSMDeviceAlloc
pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
USM: Implements USM device allocations using a normal CUDA device pointer.
Definition: pi_cuda.cpp:4871
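The driver-API core of such an allocation is small because the returned pointer is just a CUdeviceptr. A sketch, assuming `ctx` is the CUcontext backing the pi_context (error handling abbreviated):

#include <cuda.h>

static void *usmDeviceAllocSketch(CUcontext ctx, size_t size) {
  CUdeviceptr ptr = 0;
  cuCtxSetCurrent(ctx); // make the right context current first
  if (cuMemAlloc(&ptr, size) != CUDA_SUCCESS)
    return nullptr;
  return reinterpret_cast<void *>(ptr);
}

// The matching free (cuda_piextUSMFree above) then boils down to:
//   cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));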
PI_DEVICE_INFO_USM_DEVICE_SUPPORT
@ PI_DEVICE_INFO_USM_DEVICE_SUPPORT
Definition: pi.h:255
PI_KERNEL_INFO_PROGRAM
@ PI_KERNEL_INFO_PROGRAM
Definition: pi.h:323
PI_PLATFORM_INFO_NAME
@ PI_PLATFORM_INFO_NAME
Definition: pi.h:122
PI_FP_ROUND_TO_NEAREST
static constexpr pi_device_fp_config PI_FP_ROUND_TO_NEAREST
Definition: pi.h:625
pi.hpp
piextUSMSharedAlloc
pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates memory accessible on both host and device.
Definition: pi_esimd_emulator.cpp:1849
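On the CUDA backend a host-and-device accessible allocation is naturally backed by managed memory. A minimal driver-API sketch (error handling elided):

#include <cuda.h>

static void *sharedAllocSketch(size_t size) {
  CUdeviceptr ptr = 0;
  // CU_MEM_ATTACH_GLOBAL makes the allocation accessible from any stream.
  if (cuMemAllocManaged(&ptr, size, CU_MEM_ATTACH_GLOBAL) != CUDA_SUCCESS)
    return nullptr;
  return reinterpret_cast<void *>(ptr);
}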
CUcontext
struct CUctx_st * CUcontext
Definition: backend_traits_cuda.hpp:26
PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE
@ PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE
Definition: pi.h:211
cuda_piEnqueueMemBufferReadRect
pi_result cuda_piEnqueueMemBufferReadRect(pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:4140
_pi_device_info
_pi_device_info
Definition: pi.h:174
PI_MEM_TYPE_DEVICE
@ PI_MEM_TYPE_DEVICE
Definition: pi.h:1605
PI_USM_ATOMIC_ACCESS
@ PI_USM_ATOMIC_ACCESS
Definition: pi.h:1590
PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE
@ PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE
Definition: pi.h:219
cuda_piKernelSetArg
pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_cuda.cpp:2789
_pi_image_info
_pi_image_info
Definition: pi.h:338
piEventSetStatus
pi_result piEventSetStatus(pi_event event, pi_int32 execution_status)
Definition: pi_esimd_emulator.cpp:1421
_pi_program::build_program
pi_result build_program(const char *build_options)
Definition: pi_cuda.cpp:675
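Building typically amounts to a JIT module load with log buffers attached (compare _pi_program::MAX_LOG_SIZE above). A simplified sketch, not the exact implementation:

#include <cstdint>
#include <cuda.h>

static CUresult buildModuleSketch(CUmodule *mod, const void *ptx,
                                  char *infoLog, char *errorLog,
                                  unsigned logSize) {
  CUjit_option names[] = {CU_JIT_INFO_LOG_BUFFER,
                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                          CU_JIT_ERROR_LOG_BUFFER,
                          CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
  // Sizes are passed by value, smuggled through the void* option array.
  void *values[] = {infoLog,
                    reinterpret_cast<void *>(static_cast<uintptr_t>(logSize)),
                    errorLog,
                    reinterpret_cast<void *>(static_cast<uintptr_t>(logSize))};
  return cuModuleLoadDataEx(mod, ptx, 4, names, values);
}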
piextUSMDeviceAlloc
pi_result piextUSMDeviceAlloc(void **result_ptr, pi_context context, pi_device device, pi_usm_mem_properties *properties, size_t size, pi_uint32 alignment)
Allocates device memory.
Definition: pi_esimd_emulator.cpp:1844
PI_SAMPLER_PROPERTIES_ADDRESSING_MODE
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE
Definition: pi.h:496
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT
@ PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT
Definition: pi.h:187
PI_SAMPLER_FILTER_MODE_NEAREST
@ PI_SAMPLER_FILTER_MODE_NEAREST
Definition: pi.h:481
imageElementByteSize
static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc)
Definition: pi_cuda.cpp:4430
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:773
cuda_piQueueCreate
pi_result cuda_piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue)
Creates a pi_queue object on the CUDA backend.
Definition: pi_cuda.cpp:2420
PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
@ PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
Definition: pi.h:223
cuda_piEnqueueKernelLaunch
pi_result cuda_piEnqueueKernelLaunch(pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:2931
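The ND-range maps onto cuLaunchKernel by dividing global sizes by the chosen local sizes. A 1D sketch for brevity; `args` follows the driver-API convention of an array of pointers to each argument value:

#include <cuda.h>

static CUresult launch1D(CUfunction f, CUstream stream, size_t global,
                         size_t local, void **args) {
  // Round up so every work-item is covered by some block.
  unsigned grid = static_cast<unsigned>((global + local - 1) / local);
  return cuLaunchKernel(f, grid, 1, 1,                       // grid dims
                        static_cast<unsigned>(local), 1, 1,  // block dims
                        0 /*sharedMemBytes*/, stream, args,
                        nullptr /*extra*/);
}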
_pi_sampler::props_
pi_uint32 props_
Definition: pi_cuda.hpp:946
cuda_piKernelSetExecInfo
pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t, const void *)
Definition: pi_cuda.cpp:3695
piextDeviceGetNativeHandle
pi_result piextDeviceGetNativeHandle(pi_device device, pi_native_handle *nativeHandle)
Gets the native handle of a PI device object.
Definition: pi_esimd_emulator.cpp:818
PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT
@ PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT
Definition: pi.h:206
cl::sycl::info::event
event
Definition: info_desc.hpp:289
pi_buff_rect_offset_struct::y_scalar
size_t y_scalar
Definition: pi.h:817
get_kernel_metadata
bool get_kernel_metadata(std::string metadataName, const char *tag, std::string &kernelName)
Definition: pi_cuda.cpp:627
PI_EVENT_INFO_COMMAND_TYPE
@ PI_EVENT_INFO_COMMAND_TYPE
Definition: pi.h:358
PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
constexpr pi_queue_properties PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:560
piextContextCreateWithNativeHandle
pi_result piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, pi_uint32 numDevices, const pi_device *devices, bool pluginOwnsNativeHandle, pi_context *context)
Creates a PI context object from a native handle.
Definition: pi_esimd_emulator.cpp:872
cl::sycl::info::queue
queue
Definition: info_desc.hpp:229
piProgramGetInfo
pi_result piProgramGetInfo(pi_program program, pi_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1302
__SYCL_PI_CUDA_USE_DEFAULT_STREAM
#define __SYCL_PI_CUDA_USE_DEFAULT_STREAM
Definition: cuda_definitions.hpp:22
_pi_mem::mem_::buffer_mem_::alloc_mode
alloc_mode
alloc_mode. classic: a normal buffer allocated on the device via cuda malloc; use_host_ptr: use an...
Definition: pi_cuda.hpp:267
_pi_plugin::PiVersion
char PiVersion[20]
Definition: pi.h:1784
PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE
@ PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE
Definition: pi.h:217
PI_DEVICE_EXEC_CAPABILITIES_KERNEL
constexpr pi_device_exec_capabilities PI_DEVICE_EXEC_CAPABILITIES_KERNEL
Definition: pi.h:488
cl::sycl::detail::memcpy
void memcpy(void *Dst, const void *Src, std::size_t Size)
cuda_piEnqueueMemBufferMap
pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, pi_bool blocking_map, pi_map_flags map_flags, size_t offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event, void **ret_map)
Implements mapping on the host using a BufferRead operation.
Definition: pi_cuda.cpp:4734
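Conceptually, a backend without native map support emulates mapping by reading the requested range into a host staging allocation and handing that pointer back; the matching unmap writes modified ranges back. A sketch with a hypothetical staging helper (hostStagingFor is illustrative, not part of the plugin):

void *mapSketch(pi_queue q, pi_mem buf, pi_map_flags flags, size_t offset,
                size_t size) {
  void *host = hostStagingFor(buf, offset, size); // hypothetical helper
  if (flags & PI_MAP_READ)
    cuda_piEnqueueMemBufferRead(q, buf, /*blocking_read=*/true, offset, size,
                                host, 0, nullptr, nullptr);
  // PI_MAP_WRITE ranges are written back by the matching unmap call.
  return host;
}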
PI_CONTEXT_INFO_DEVICES
@ PI_CONTEXT_INFO_DEVICES
Definition: pi.h:299
cuda_piextDeviceSelectBinary
pi_result cuda_piextDeviceSelectBinary(pi_device device, pi_device_binary *binaries, pi_uint32 num_binaries, pi_uint32 *selected_binary)
Definition: pi_cuda.cpp:1044
cuda_piKernelRetain
pi_result cuda_piKernelRetain(pi_kernel kernel)
Definition: pi_cuda.cpp:3668
_pi_sampler_filter_mode
_pi_sampler_filter_mode
Definition: pi.h:480
_pi_queue_info
_pi_queue_info
Definition: pi.h:309
cuda_piextQueueCreateWithNativeHandle
pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, pi_device device, bool ownNativeHandle, pi_queue *queue)
Creates a PI queue object from a CUDA queue handle.
Definition: pi_cuda.cpp:2582
pi_buff_rect_region_struct::width_bytes
size_t width_bytes
Definition: pi.h:825
_pi_buffer_create_type
_pi_buffer_create_type
Definition: pi.h:456
_pi_queue::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:575
cuda_piDeviceRetain
pi_result cuda_piDeviceRetain(pi_device)
Definition: pi_cuda.cpp:977
cuda_piextDeviceGetNativeHandle
pi_result cuda_piextDeviceGetNativeHandle(pi_device device, pi_native_handle *nativeHandle)
Gets the native CUDA handle of a PI device object.
Definition: pi_cuda.cpp:1914
_pi_program::buildOptions_
std::string buildOptions_
Definition: pi_cuda.hpp:733
piextUSMEnqueueMemcpy
pi_result piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
USM Memcpy API.
Definition: pi_esimd_emulator.cpp:1921
piMemBufferPartition
pi_result piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, pi_buffer_create_type buffer_create_type, void *buffer_create_info, pi_mem *ret_mem)
Definition: pi_esimd_emulator.cpp:1752
_pi_mem::mem_type_
enum _pi_mem::mem_type mem_type_
SupportedVersion
const char SupportedVersion[]
Definition: pi_cuda.cpp:5279
cuda_piextEventCreateWithNativeHandle
pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool ownNativeHandle, pi_event *event)
Creates a PI event object from a CUDA event handle.
Definition: pi_cuda.cpp:3924
_pi_queue::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:579
PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF
@ PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF
Definition: pi.h:198
piextUSMEnqueueMemAdvise
pi_result piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event)
USM Memadvise API.
Definition: pi_esimd_emulator.cpp:1926
piQueueCreate
pi_result piQueueCreate(pi_context context, pi_device device, pi_queue_properties properties, pi_queue *queue)
Definition: pi_esimd_emulator.cpp:923
_pi_queue::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:577
PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE
@ PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE
Definition: pi.h:210
_pi_program::buildStatus_
pi_program_build_status buildStatus_
Definition: pi_cuda.hpp:734
PI_DEVICE_INFO_LOCAL_MEM_SIZE
@ PI_DEVICE_INFO_LOCAL_MEM_SIZE
Definition: pi.h:222
PI_MEMORY_ORDER_RELAXED
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_RELAXED
Definition: pi.h:500
piextProgramCreateWithNativeHandle
pi_result piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, pi_context context, bool pluginOwnsNativeHandle, pi_program *program)
Creates a PI program object from a native handle.
Definition: pi_esimd_emulator.cpp:1338
cl::sycl::length
float length(T p) __NOEXC
Definition: builtins.hpp:1032
cuda_piextUSMGetMemAllocInfo
pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, pi_mem_alloc_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)