DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
pi_cuda.hpp
Go to the documentation of this file.
1 //===-- pi_cuda.hpp - CUDA Plugin -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
11 
17 
18 #ifndef PI_CUDA_HPP
19 #define PI_CUDA_HPP
20 
21 #include "CL/sycl/detail/pi.h"
22 #include <array>
23 #include <atomic>
24 #include <cassert>
25 #include <cstring>
26 #include <cuda.h>
27 #include <limits>
28 #include <numeric>
29 #include <stdint.h>
30 #include <string>
31 #include <vector>
32 #include <functional>
33 #include <mutex>
34 
35 extern "C" {
36 
51  pi_kernel_group_info param_name,
52  size_t param_value_size, void *param_value,
53  size_t *param_value_size_ret);
55 }
56 
62 struct _pi_platform {
63  std::vector<std::unique_ptr<_pi_device>> devices_;
64 };
65 
71 struct _pi_device {
72 private:
73  using native_type = CUdevice;
74 
75  native_type cuDevice_;
76  std::atomic_uint32_t refCount_;
77  pi_platform platform_;
78 
79  static constexpr pi_uint32 max_work_item_dimensions = 3u;
80  size_t max_work_item_sizes[max_work_item_dimensions];
81  int max_work_group_size;
82 
83 public:
84  _pi_device(native_type cuDevice, pi_platform platform)
85  : cuDevice_(cuDevice), refCount_{1}, platform_(platform) {}
86 
87  native_type get() const noexcept { return cuDevice_; };
88 
89  pi_uint32 get_reference_count() const noexcept { return refCount_; }
90 
91  pi_platform get_platform() const noexcept { return platform_; };
92 
93  void save_max_work_item_sizes(size_t size,
94  size_t *save_max_work_item_sizes) noexcept {
95  memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
96  };
97 
98  void save_max_work_group_size(int value) noexcept {
99  max_work_group_size = value;
100  };
101 
102  void get_max_work_item_sizes(size_t ret_size,
103  size_t *ret_max_work_item_sizes) const noexcept {
104  memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
105  };
106 
107  int get_max_work_group_size() const noexcept { return max_work_group_size; };
108 };
109 
148 struct _pi_context {
149 
150  struct deleter_data {
152  void *user_data;
153 
154  void operator()() { function(user_data); }
155  };
156 
158 
159  enum class kind { primary, user_defined } kind_;
162  std::atomic_uint32_t refCount_;
163 
164  CUevent evBase_; // CUDA event used as base counter
165 
167  : kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1},
168  evBase_(nullptr) {
170  };
171 
173 
175  std::lock_guard<std::mutex> guard(mutex_);
176  for (auto &deleter : extended_deleters_) {
177  deleter();
178  }
179  }
180 
182  void *user_data) {
183  std::lock_guard<std::mutex> guard(mutex_);
184  extended_deleters_.emplace_back(deleter_data{function, user_data});
185  }
186 
187  pi_device get_device() const noexcept { return deviceId_; }
188 
189  native_type get() const noexcept { return cuContext_; }
190 
191  bool is_primary() const noexcept { return kind_ == kind::primary; }
192 
194 
196 
197  pi_uint32 get_reference_count() const noexcept { return refCount_; }
198 
199 private:
200  std::mutex mutex_;
201  std::vector<deleter_data> extended_deleters_;
202 };
203 
208 struct _pi_mem {
209 
210  // TODO: Move as much shared data up as possible
212 
213  // Context where the memory object is accessibles
215 
217  std::atomic_uint32_t refCount_;
218  enum class mem_type { buffer, surface } mem_type_;
219 
225  union mem_ {
226  // Handler for plain, pointer-based CUDA allocations
227  struct buffer_mem_ {
229 
230  // If this allocation is a sub-buffer (i.e., a view on an existing
231  // allocation), this is the pointer to the parent handler structure
233  // CUDA handler for the pointer
235 
237  void *hostPtr_;
239  size_t size_;
241  size_t mapOffset_;
243  void *mapPtr_;
246 
254  enum class alloc_mode {
255  classic,
256  use_host_ptr,
257  copy_in,
259  } allocMode_;
260 
261  native_type get() const noexcept { return ptr_; }
262 
263  size_t get_size() const noexcept { return size_; }
264 
265  void *get_map_ptr() const noexcept { return mapPtr_; }
266 
267  size_t get_map_offset(void *) const noexcept { return mapOffset_; }
268 
273  void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept {
274  assert(mapPtr_ == nullptr);
275  mapOffset_ = offset;
276  mapFlags_ = flags;
277  if (hostPtr_) {
278  mapPtr_ = static_cast<char *>(hostPtr_) + offset;
279  } else {
280  // TODO: Allocate only what is needed based on the offset
281  mapPtr_ = static_cast<void *>(malloc(this->get_size()));
282  }
283  return mapPtr_;
284  }
285 
287  void unmap(void *) noexcept {
288  assert(mapPtr_ != nullptr);
289 
290  if (mapPtr_ != hostPtr_) {
291  free(mapPtr_);
292  }
293  mapPtr_ = nullptr;
294  mapOffset_ = 0;
295  }
296 
297  pi_map_flags get_map_flags() const noexcept {
298  assert(mapPtr_ != nullptr);
299  return mapFlags_;
300  }
301  } buffer_mem_;
302 
303  // Handler data for surface object (i.e. Images)
304  struct surface_mem_ {
305  CUarray array_;
306  CUsurfObject surfObj_;
308 
309  CUarray get_array() const noexcept { return array_; }
310 
311  CUsurfObject get_surface() const noexcept { return surfObj_; }
312 
313  pi_mem_type get_image_type() const noexcept { return imageType_; }
314  } surface_mem_;
315  } mem_;
316 
319  CUdeviceptr ptr, void *host_ptr, size_t size)
321  mem_.buffer_mem_.ptr_ = ptr;
322  mem_.buffer_mem_.parent_ = parent;
324  mem_.buffer_mem_.size_ = size;
326  mem_.buffer_mem_.mapPtr_ = nullptr;
329  if (is_sub_buffer()) {
331  } else {
333  }
334  };
335 
337  _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf,
338  pi_mem_type image_type, void *host_ptr)
340  // Ignore unused parameter
341  (void)host_ptr;
342 
343  mem_.surface_mem_.array_ = array;
344  mem_.surface_mem_.surfObj_ = surf;
345  mem_.surface_mem_.imageType_ = image_type;
347  }
348 
350  if (mem_type_ == mem_type::buffer) {
351  if (is_sub_buffer()) {
353  return;
354  }
355  }
357  }
358 
359  // TODO: Move as many shared funcs up as possible
360  bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; }
361 
362  bool is_sub_buffer() const noexcept {
363  return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr));
364  }
365 
366  bool is_image() const noexcept { return mem_type_ == mem_type::surface; }
367 
368  pi_context get_context() const noexcept { return context_; }
369 
371 
373 
374  pi_uint32 get_reference_count() const noexcept { return refCount_; }
375 };
376 
379 struct _pi_queue {
381 
386  std::atomic_uint32_t refCount_;
387  std::atomic_uint32_t eventCount_;
388 
390  pi_queue_properties properties)
391  : stream_{stream}, context_{context}, device_{device},
392  properties_{properties}, refCount_{1}, eventCount_{0} {
395  }
396 
400  }
401 
402  native_type get() const noexcept { return stream_; };
403 
404  _pi_context *get_context() const { return context_; };
405 
407 
409 
410  pi_uint32 get_reference_count() const noexcept { return refCount_; }
411 
412  pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
413 };
414 
415 typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
416  void *userData);
419 struct _pi_event {
420 public:
422 
423  pi_result record();
424 
425  pi_result wait();
426 
427  pi_result start();
428 
429  native_type get() const noexcept { return evEnd_; };
430 
431  pi_queue get_queue() const noexcept { return queue_; }
432 
433  pi_command_type get_command_type() const noexcept { return commandType_; }
434 
435  pi_uint32 get_reference_count() const noexcept { return refCount_; }
436 
437  bool is_recorded() const noexcept { return isRecorded_; }
438 
439  bool is_started() const noexcept { return isStarted_; }
440 
441  bool is_completed() const noexcept;
442 
443  pi_int32 get_execution_status() const noexcept {
444 
445  if (!is_recorded()) {
446  return PI_EVENT_SUBMITTED;
447  }
448 
449  if (!is_completed()) {
450  return PI_EVENT_RUNNING;
451  }
452  return PI_EVENT_COMPLETE;
453  }
454 
455  pi_context get_context() const noexcept { return context_; };
456 
457  pi_uint32 increment_reference_count() { return ++refCount_; }
458 
459  pi_uint32 decrement_reference_count() { return --refCount_; }
460 
461  pi_uint32 get_event_id() const noexcept { return eventId_; }
462 
463  // Returns the counter time when the associated command(s) were enqueued
464  //
465  pi_uint64 get_queued_time() const;
466 
467  // Returns the counter time when the associated command(s) started execution
468  //
469  pi_uint64 get_start_time() const;
470 
471  // Returns the counter time when the associated command(s) completed
472  //
473  pi_uint64 get_end_time() const;
474 
475  // construct a native CUDA. This maps closely to the underlying CUDA event.
477  return new _pi_event(type, queue->get_context(), queue);
478  }
479 
480  pi_result release();
481 
482  ~_pi_event();
483 
484 private:
485  // This constructor is private to force programmers to use the make_native /
486  // make_user static members in order to create a pi_event for CUDA.
488 
489  pi_command_type commandType_; // The type of command associated with event.
490 
491  std::atomic_uint32_t refCount_; // Event reference count.
492 
493  bool hasBeenWaitedOn_; // Signifies whether the event has been waited
494  // on through a call to wait(), which implies
495  // that it has completed.
496 
497  bool isRecorded_; // Signifies wether a native CUDA event has been recorded
498  // yet.
499  bool isStarted_; // Signifies wether the operation associated with the
500  // PI event has started or not
501  //
502 
503  pi_uint32 eventId_; // Queue identifier of the event.
504 
505  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
506  // event, this will be nullptr.
507 
508  native_type evStart_; // CUDA event handle associated with the start
509 
510  native_type evQueued_; // CUDA event handle associated with the time
511  // the command was enqueued
512 
513  pi_queue queue_; // pi_queue associated with the event. If this is a user
514  // event, this will be nullptr.
515 
516  pi_context context_; // pi_context associated with the event. If this is a
517  // native event, this will be the same context associated
518  // with the queue_ member.
519 };
520 
523 struct _pi_program {
526  const char *binary_;
528  std::atomic_uint32_t refCount_;
530 
531  // Metadata
532  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
534 
535  constexpr static size_t MAX_LOG_SIZE = 8192u;
536 
538  std::string buildOptions_;
540 
541  _pi_program(pi_context ctxt);
542  ~_pi_program();
543 
545  size_t length);
546 
547  pi_result set_binary(const char *binary, size_t binarySizeInBytes);
548 
549  pi_result build_program(const char* build_options);
550 
551  pi_context get_context() const { return context_; };
552 
553  native_type get() const noexcept { return module_; };
554 
556 
558 
559  pi_uint32 get_reference_count() const noexcept { return refCount_; }
560 };
561 
578 struct _pi_kernel {
579  using native_type = CUfunction;
580 
583  std::string name_;
586  std::atomic_uint32_t refCount_;
587 
590 
598  struct arguments {
599  static constexpr size_t MAX_PARAM_BYTES = 4000u;
600  using args_t = std::array<char, MAX_PARAM_BYTES>;
601  using args_size_t = std::vector<size_t>;
602  using args_index_t = std::vector<void *>;
607 
608  std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0};
609 
611  // Place the implicit offset index at the end of the indicies collection
612  indices_.emplace_back(&implicitOffsetArgs_);
613  }
614 
620  void add_arg(size_t index, size_t size, const void *arg,
621  size_t localSize = 0) {
622  if (index + 2 > indices_.size()) {
623  // Move implicit offset argument index with the end
624  indices_.resize(index + 2, indices_.back());
625  // Ensure enough space for the new argument
626  paramSizes_.resize(index + 1);
627  offsetPerIndex_.resize(index + 1);
628  }
629  paramSizes_[index] = size;
630  // calculate the insertion point on the array
631  size_t insertPos = std::accumulate(std::begin(paramSizes_),
632  std::begin(paramSizes_) + index, 0);
633  // Update the stored value for the argument
634  std::memcpy(&storage_[insertPos], arg, size);
635  indices_[index] = &storage_[insertPos];
636  offsetPerIndex_[index] = localSize;
637  }
638 
639  void add_local_arg(size_t index, size_t size) {
640  size_t localOffset = this->get_local_size();
641  add_arg(index, sizeof(size_t), (const void *)&(localOffset), size);
642  }
643 
644  void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {
645  assert(size == sizeof(std::uint32_t) * 3);
646  std::memcpy(implicitOffsetArgs_, implicitOffset, size);
647  }
648 
650  std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0);
651  }
652 
653  const args_index_t &get_indices() const noexcept { return indices_; }
654 
656  return std::accumulate(std::begin(offsetPerIndex_),
657  std::end(offsetPerIndex_), 0);
658  }
659  } args_;
660 
661  _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name,
662  pi_program program, pi_context ctxt)
663  : function_{func}, functionWithOffsetParam_{funcWithOffsetParam},
664  name_{name}, context_{ctxt}, program_{program}, refCount_{1} {
669  this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
670  sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr);
671  assert(retError == PI_SUCCESS);
672  }
673 
674  _pi_kernel(CUfunction func, const char *name, pi_program program,
675  pi_context ctxt)
676  : _pi_kernel{func, nullptr, name, program, ctxt} {
679  this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
680  sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr);
681  assert(retError == PI_SUCCESS);
682  }
683 
685  {
688  }
689 
690  pi_program get_program() const noexcept { return program_; }
691 
693 
695 
696  pi_uint32 get_reference_count() const noexcept { return refCount_; }
697 
698  native_type get() const noexcept { return function_; };
699 
702  };
703 
704  bool has_with_offset_parameter() const noexcept {
705  return functionWithOffsetParam_ != nullptr;
706  }
707 
708  pi_context get_context() const noexcept { return context_; };
709 
710  const char *get_name() const noexcept { return name_.c_str(); }
711 
716  pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; }
717 
718  void set_kernel_arg(int index, size_t size, const void *arg) {
719  args_.add_arg(index, size, arg);
720  }
721 
722  void set_kernel_local_arg(int index, size_t size) {
723  args_.add_local_arg(index, size);
724  }
725 
726  void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) {
727  args_.set_implicit_offset(size, implicitOffset);
728  }
729 
731  return args_.get_indices();
732  }
733 
734  pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); }
735 
737 };
738 
744 struct _pi_sampler {
745  std::atomic_uint32_t refCount_;
748 
750  : refCount_(1), props_(0), context_(context) {}
751 
753 
755 
756  pi_uint32 get_reference_count() const noexcept { return refCount_; }
757 };
758 
759 // -------------------------------------------------------------
760 // Helper types and functions
761 //
762 
763 #endif // PI_CUDA_HPP
_pi_queue::_pi_queue
_pi_queue(CUstream stream, _pi_context *context, _pi_device *device, pi_queue_properties properties)
Definition: pi_cuda.hpp:389
_pi_sampler::context_
pi_context context_
Definition: pi_cuda.hpp:747
_pi_kernel::has_with_offset_parameter
bool has_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:704
_pi_mem::_pi_mem
_pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, size_t size)
Constructs the PI MEM handler for a non-typed allocation ("buffer")
Definition: pi_cuda.hpp:318
_pi_kernel::arguments::MAX_PARAM_BYTES
static constexpr size_t MAX_PARAM_BYTES
Definition: pi_cuda.hpp:599
_pi_kernel::native_type
CUfunction native_type
Definition: pi_cuda.hpp:579
_pi_event::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:435
_pi_mem::mem_
A PI Memory object represents either plain memory allocations ("Buffers" in OpenCL) or typed allocati...
Definition: pi_cuda.hpp:225
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:208
cl::sycl::malloc
void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind)
Definition: usm_impl.cpp:270
PI_MAP_WRITE
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:559
_pi_mem_type
_pi_mem_type
Definition: pi.h:431
_pi_mem::mem_::surface_mem_::get_surface
CUsurfObject get_surface() const noexcept
Definition: pi_cuda.hpp:311
PI_SUCCESS
@ PI_SUCCESS
Definition: pi.h:82
_pi_kernel::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:694
CUcontext
struct CUctx_st * CUcontext
Definition: backend_traits_cuda.hpp:26
_pi_device::_pi_device
_pi_device(native_type cuDevice, pi_platform platform)
Definition: pi_cuda.hpp:84
_pi_kernel::arguments::get_local_size
pi_uint32 get_local_size() const
Definition: pi_cuda.hpp:655
_pi_context::kind::primary
@ primary
pi.h
type
_pi_context::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:195
CUstream
struct CUstream_st * CUstream
Definition: backend_traits_cuda.hpp:27
_pi_sampler::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:745
_pi_program::context_
_pi_context * context_
Definition: pi_cuda.hpp:529
_pi_event::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:455
_pi_mem::mem_::buffer_mem_::hostPtr_
void * hostPtr_
Pointer associated with this device on the host.
Definition: pi_cuda.hpp:237
_pi_context::deviceId_
_pi_device * deviceId_
Definition: pi_cuda.hpp:161
_pi_kernel::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:696
_pi_program::kernelReqdWorkGroupSizeMD_
std::unordered_map< std::string, std::tuple< uint32_t, uint32_t, uint32_t > > kernelReqdWorkGroupSizeMD_
Definition: pi_cuda.hpp:533
_pi_queue::context_
_pi_context * context_
Definition: pi_cuda.hpp:383
_pi_mem::is_image
bool is_image() const noexcept
Definition: pi_cuda.hpp:366
cl::sycl::info::device
device
Definition: info_desc.hpp:50
_pi_kernel::arguments::args_index_t
std::vector< void * > args_index_t
Definition: pi_cuda.hpp:602
_pi_event::is_started
bool is_started() const noexcept
Definition: pi_cuda.hpp:439
_pi_sampler::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:752
_pi_device::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:89
_pi_context::_pi_context
_pi_context(kind k, CUcontext ctxt, _pi_device *devId)
Definition: pi_cuda.hpp:166
_pi_context::deleter_data
Definition: pi_cuda.hpp:150
_pi_sampler::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:754
_pi_kernel::arguments::implicitOffsetArgs_
std::uint32_t implicitOffsetArgs_[3]
Definition: pi_cuda.hpp:608
_pi_kernel::arguments::add_local_arg
void add_local_arg(size_t index, size_t size)
Definition: pi_cuda.hpp:639
_pi_kernel::get_arg_indices
const arguments::args_index_t & get_arg_indices() const
Definition: pi_cuda.hpp:730
_pi_kernel::name_
std::string name_
Definition: pi_cuda.hpp:583
_pi_queue::native_type
CUstream native_type
Definition: pi_cuda.hpp:380
_pi_context::get
native_type get() const noexcept
Definition: pi_cuda.hpp:189
_pi_mem::mem_::buffer_mem_::size_
size_t size_
Size of the allocation in bytes.
Definition: pi_cuda.hpp:239
_pi_program::get_context
pi_context get_context() const
Definition: pi_cuda.hpp:551
_pi_context::cuContext_
native_type cuContext_
Definition: pi_cuda.hpp:160
_pi_program::MAX_LOG_SIZE
constexpr static size_t MAX_LOG_SIZE
Definition: pi_cuda.hpp:535
_pi_sampler::_pi_sampler
_pi_sampler(pi_context context)
Definition: pi_cuda.hpp:749
_pi_device::get_platform
pi_platform get_platform() const noexcept
Definition: pi_cuda.hpp:91
_pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr
@ alloc_host_ptr
_pi_result
_pi_result
Definition: pi.h:81
_pi_kernel::~_pi_kernel
~_pi_kernel()
Definition: pi_cuda.hpp:684
_pi_queue::device_
_pi_device * device_
Definition: pi_cuda.hpp:384
_pi_context::native_type
CUcontext native_type
Definition: pi_cuda.hpp:157
_pi_event::get_queued_time
pi_uint64 get_queued_time() const
Definition: pi_cuda.cpp:390
_pi_mem::mem_::buffer_mem_::get_map_offset
size_t get_map_offset(void *) const noexcept
Definition: pi_cuda.hpp:267
cl::sycl::info::kernel
kernel
Definition: info_desc.hpp:229
_pi_mem::context_
pi_context context_
Definition: pi_cuda.hpp:214
_pi_device::get_max_work_group_size
int get_max_work_group_size() const noexcept
Definition: pi_cuda.hpp:107
_pi_mem::is_buffer
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:360
CUdeviceptr
unsigned int CUdeviceptr
Definition: backend_traits_cuda.hpp:35
_pi_program::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:528
_pi_mem::mem_::surface_mem_::get_array
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:309
PI_EVENT_RUNNING
@ PI_EVENT_RUNNING
Definition: pi.h:124
_pi_platform
A PI platform stores all known PI devices, in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:62
_pi_kernel::arguments::paramSizes_
args_size_t paramSizes_
Definition: pi_cuda.hpp:604
_pi_context::kind
kind
Definition: pi_cuda.hpp:159
_pi_context::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:197
_pi_mem::mem_::buffer_mem_::mapOffset_
size_t mapOffset_
Offset of the active mapped region.
Definition: pi_cuda.hpp:241
_pi_platform::devices_
std::vector< std::unique_ptr< _pi_device > > devices_
Definition: pi_cuda.hpp:63
cl::sycl::info::queue
queue
Definition: info_desc.hpp:222
_pi_program::build_program
pi_result build_program(const char *build_options)
Definition: pi_cuda.cpp:535
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:578
_pi_sampler::props_
pi_uint32 props_
Definition: pi_cuda.hpp:746
_pi_queue::eventCount_
std::atomic_uint32_t eventCount_
Definition: pi_cuda.hpp:387
_pi_mem::mem_::buffer_mem_::alloc_mode
alloc_mode
alloc_mode classic: Just a normal buffer allocated on the device via cuda malloc use_host_ptr: Use an...
Definition: pi_cuda.hpp:254
_pi_context::set_extended_deleter
void set_extended_deleter(pi_context_extended_deleter function, void *user_data)
Definition: pi_cuda.hpp:181
cl::sycl::detail::memcpy
void memcpy(void *Dst, const void *Src, std::size_t Size)
cuda_piKernelRetain
pi_result cuda_piKernelRetain(pi_kernel kernel)
Definition: pi_cuda.cpp:3311
_pi_mem::~_pi_mem
~_pi_mem()
Definition: pi_cuda.hpp:349
_pi_queue::get
native_type get() const noexcept
Definition: pi_cuda.hpp:402
_pi_queue::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:406
cuda_piDeviceRetain
pi_result cuda_piDeviceRetain(pi_device)
Definition: pi_cuda.cpp:837
_pi_program::buildOptions_
std::string buildOptions_
Definition: pi_cuda.hpp:538
_pi_mem::mem_type_
enum _pi_mem::mem_type mem_type_
_pi_queue::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:410
_pi_queue::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:408
_pi_kernel::arguments::args_t
std::array< char, MAX_PARAM_BYTES > args_t
Definition: pi_cuda.hpp:600
_pi_program::buildStatus_
pi_program_build_status buildStatus_
Definition: pi_cuda.hpp:539
cl::sycl::length
float length(T p) __NOEXC
Definition: builtins.hpp:1017
_pi_kernel::functionWithOffsetParam_
native_type functionWithOffsetParam_
Definition: pi_cuda.hpp:582
_pi_program::module_
native_type module_
Definition: pi_cuda.hpp:525
_pi_queue
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:379
_pi_context::~_pi_context
~_pi_context()
Definition: pi_cuda.hpp:172
_pi_kernel::get_num_args
pi_uint32 get_num_args() const noexcept
Returns the number of arguments, excluding the implicit global offset.
Definition: pi_cuda.hpp:716
_pi_queue::get_next_event_id
pi_uint32 get_next_event_id() noexcept
Definition: pi_cuda.hpp:412
pi_uint32
uint32_t pi_uint32
Definition: pi.h:68
_pi_mem::mem_::buffer_mem_::ptr_
native_type ptr_
Definition: pi_cuda.hpp:234
_pi_mem::mem_::buffer_mem_::get_map_flags
pi_map_flags get_map_flags() const noexcept
Definition: pi_cuda.hpp:297
_pi_kernel::arguments::offsetPerIndex_
args_size_t offsetPerIndex_
Definition: pi_cuda.hpp:606
_pi_mem::mem_::surface_mem_
struct _pi_mem::mem_::surface_mem_ surface_mem_
_pi_event::decrement_reference_count
pi_uint32 decrement_reference_count()
Definition: pi_cuda.hpp:459
_pi_kernel::function_
native_type function_
Definition: pi_cuda.hpp:581
_pi_queue::properties_
pi_queue_properties properties_
Definition: pi_cuda.hpp:385
_pi_program::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:559
_pi_kernel::arguments::storage_
args_t storage_
Definition: pi_cuda.hpp:603
_pi_kernel::arguments::add_arg
void add_arg(size_t index, size_t size, const void *arg, size_t localSize=0)
Adds an argument to the kernel.
Definition: pi_cuda.hpp:620
_pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr
@ use_host_ptr
_pi_mem::mem_::buffer_mem_::get
native_type get() const noexcept
Definition: pi_cuda.hpp:261
cl::sycl::detail::usm::free
void free(void *Ptr, const context &Ctxt)
Definition: usm_impl.cpp:132
_pi_kernel::context_
pi_context context_
Definition: pi_cuda.hpp:584
pi_context_extended_deleter
void(* pi_context_extended_deleter)(void *user_data)
Definition: pi.h:1028
_pi_mem::mem_::surface_mem_::surfObj_
CUsurfObject surfObj_
Definition: pi_cuda.hpp:306
_pi_event::get_execution_status
pi_int32 get_execution_status() const noexcept
Definition: pi_cuda.hpp:443
cl::sycl::host_ptr
multi_ptr< ElementType, access::address_space::global_host_space > host_ptr
Definition: pointers.hpp:28
_pi_context::deleter_data::user_data
void * user_data
Definition: pi_cuda.hpp:152
_pi_context::kind::user_defined
@ user_defined
_pi_kernel::get_program
pi_program get_program() const noexcept
Definition: pi_cuda.hpp:690
_pi_mem::mem_type
mem_type
Definition: pi_cuda.hpp:218
cuda_piProgramRelease
pi_result cuda_piProgramRelease(pi_program program)
Decreases the reference count of a pi_program object.
Definition: pi_cuda.cpp:3165
_pi_kernel::_pi_kernel
_pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt)
Definition: pi_cuda.hpp:661
_pi_event::get_event_id
pi_uint32 get_event_id() const noexcept
Definition: pi_cuda.hpp:461
_pi_queue::~_pi_queue
~_pi_queue()
Definition: pi_cuda.hpp:397
_pi_event::get_command_type
pi_command_type get_command_type() const noexcept
Definition: pi_cuda.hpp:433
_pi_event::native_type
CUevent native_type
Definition: pi_cuda.hpp:421
_pi_mem::mem_::buffer_mem_::get_size
size_t get_size() const noexcept
Definition: pi_cuda.hpp:263
_pi_kernel::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:692
_pi_mem::mem_::buffer_mem_::mapPtr_
void * mapPtr_
Pointer to the active mapped region, if any.
Definition: pi_cuda.hpp:243
pi_uint64
uint64_t pi_uint64
Definition: pi.h:69
cuda_piContextRetain
pi_result cuda_piContextRetain(pi_context context)
Definition: pi_cuda.cpp:859
_pi_mem::mem_::buffer_mem_::parent_
pi_mem parent_
Definition: pi_cuda.hpp:232
cl::sycl::info::platform::name
@ name
_pi_device_binary_property_struct
Definition: pi.h:648
_pi_device::save_max_work_group_size
void save_max_work_group_size(int value) noexcept
Definition: pi_cuda.hpp:98
_pi_event::wait
pi_result wait()
Definition: pi_cuda.cpp:447
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:523
_pi_sampler
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:744
_pi_sampler::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:756
_pi_device::get
native_type get() const noexcept
Definition: pi_cuda.hpp:87
_pi_kernel::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:708
_pi_queue::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:386
_pi_program::infoLog_
char infoLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:537
_pi_event::is_completed
bool is_completed() const noexcept
Definition: pi_cuda.cpp:373
cuda_piMemRetain
pi_result cuda_piMemRetain(pi_mem mem)
Definition: pi_cuda.cpp:2901
_pi_mem::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:374
_pi_event::release
pi_result release()
Definition: pi_cuda.cpp:459
_pi_mem::mem_::buffer_mem_
Definition: pi_cuda.hpp:227
CUmodule
struct CUmod_st * CUmodule
Definition: backend_traits_cuda.hpp:29
PI_EVENT_COMPLETE
@ PI_EVENT_COMPLETE
Definition: pi.h:123
_pi_mem::mem_type::buffer
@ buffer
_pi_kernel::get_local_size
pi_uint32 get_local_size() const noexcept
Definition: pi_cuda.hpp:734
_pi_mem::mem_::buffer_mem_::native_type
CUdeviceptr native_type
Definition: pi_cuda.hpp:228
_pi_device::save_max_work_item_sizes
void save_max_work_item_sizes(size_t size, size_t *save_max_work_item_sizes) noexcept
Definition: pi_cuda.hpp:93
_pi_program::~_pi_program
~_pi_program()
Definition: pi_cuda.cpp:485
_pi_event::get
native_type get() const noexcept
Definition: pi_cuda.hpp:429
_pi_kernel::_pi_kernel
_pi_kernel(CUfunction func, const char *name, pi_program program, pi_context ctxt)
Definition: pi_cuda.hpp:674
_pi_kernel::arguments::set_implicit_offset
void set_implicit_offset(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:644
_pi_mem::mem_::buffer_mem_::alloc_mode::copy_in
@ copy_in
_pi_program::binary_
const char * binary_
Definition: pi_cuda.hpp:526
CUdevice
int CUdevice
Definition: backend_traits_cuda.hpp:25
_pi_mem::mem_::buffer_mem_::get_map_ptr
void * get_map_ptr() const noexcept
Definition: pi_cuda.hpp:265
_pi_kernel::get_with_offset_parameter
native_type get_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:700
_pi_context::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:193
_pi_event::~_pi_event
~_pi_event()
Definition: pi_cuda.cpp:348
_pi_event::increment_reference_count
pi_uint32 increment_reference_count()
Definition: pi_cuda.hpp:457
_pi_context::deleter_data::operator()
void operator()()
Definition: pi_cuda.hpp:154
cuda_piDeviceRelease
pi_result cuda_piDeviceRelease(pi_device)
Definition: pi_cuda.cpp:937
_pi_kernel::arguments::clear_local_size
void clear_local_size()
Definition: pi_cuda.hpp:649
_pi_mem::mem_::surface_mem_::imageType_
pi_mem_type imageType_
Definition: pi_cuda.hpp:307
_pi_event::_pi_event
_pi_event()
Definition: pi_esimd_emulator.hpp:144
cl::sycl::info::event
event
Definition: info_desc.hpp:282
pi_queue_properties
pi_bitfield pi_queue_properties
Definition: pi.h:570
_pi_kernel::program_
pi_program program_
Definition: pi_cuda.hpp:585
_pi_program::_pi_program
_pi_program()
Definition: pi_esimd_emulator.hpp:153
_pi_kernel::arguments
Structure that holds the arguments to the kernel.
Definition: pi_cuda.hpp:598
_pi_mem::mem_type::surface
@ surface
_pi_mem::mem_::surface_mem_
Definition: pi_cuda.hpp:304
_pi_program::native_type
CUmodule native_type
Definition: pi_cuda.hpp:524
PI_PROGRAM_BUILD_STATUS_NONE
@ PI_PROGRAM_BUILD_STATUS_NONE
Definition: pi.h:145
_pi_event::get_start_time
pi_uint64 get_start_time() const
Definition: pi_cuda.cpp:399
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:419
_pi_program::errorLog_
char errorLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:537
_pi_context::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:162
_pi_kernel::reqdThreadsPerBlock_
size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]
Definition: pi_cuda.hpp:589
_pi_mem::mem_::buffer_mem_
struct _pi_mem::mem_::buffer_mem_ buffer_mem_
_pi_queue::stream_
native_type stream_
Definition: pi_cuda.hpp:382
_pi_mem::mem_::buffer_mem_::alloc_mode::classic
@ classic
_pi_mem::refCount_
std::atomic_uint32_t refCount_
Reference counting of the handler.
Definition: pi_cuda.hpp:217
_pi_mem::mem_
union _pi_mem::mem_ mem_
_pi_kernel::set_implicit_offset_arg
void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:726
_pi_mem::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:370
_pi_kernel::get_name
const char * get_name() const noexcept
Definition: pi_cuda.hpp:710
cuda_piProgramRetain
pi_result cuda_piProgramRetain(pi_program program)
Definition: pi_cuda.cpp:3155
_pi_event::is_recorded
bool is_recorded() const noexcept
Definition: pi_cuda.hpp:437
_pi_kernel::arguments::arguments
arguments()
Definition: pi_cuda.hpp:610
CUevent
struct CUevent_st * CUevent
Definition: backend_traits_cuda.hpp:28
_pi_device::get_max_work_item_sizes
void get_max_work_item_sizes(size_t ret_size, size_t *ret_max_work_item_sizes) const noexcept
Definition: pi_cuda.hpp:102
_pi_program::set_binary
pi_result set_binary(const char *binary, size_t binarySizeInBytes)
Definition: pi_cuda.cpp:527
_pi_kernel::get
native_type get() const noexcept
Definition: pi_cuda.hpp:698
cuda_piQueueRetain
pi_result cuda_piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:2199
_pi_context::evBase_
CUevent evBase_
Definition: pi_cuda.hpp:164
_pi_mem::_pi_mem
_pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr)
Constructs the PI allocation for an Image object (surface in CUDA)
Definition: pi_cuda.hpp:337
_pi_program_build_status
_pi_program_build_status
Definition: pi.h:144
_pi_kernel::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:586
cl::sycl::info::platform
platform
Definition: info_desc.hpp:30
_pi_kernel::arguments::indices_
args_index_t indices_
Definition: pi_cuda.hpp:605
_pi_kernel::args_
struct _pi_kernel::arguments args_
cl::sycl::access::mode
mode
Definition: access.hpp:28
cl::sycl::info::context
context
Definition: info_desc.hpp:41
_pi_kernel::set_kernel_local_arg
void set_kernel_local_arg(int index, size_t size)
Definition: pi_cuda.hpp:722
_pi_event::start
pi_result start()
Definition: pi_cuda.cpp:355
_pi_mem::mem_::buffer_mem_::allocMode_
enum _pi_mem::mem_::buffer_mem_::alloc_mode allocMode_
_pi_mem::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:368
cuda_piContextRelease
pi_result cuda_piContextRelease(pi_context ctxt)
Definition: pi_cuda.cpp:1822
_pi_kernel_group_info
_pi_kernel_group_info
Definition: pi.h:350
_pi_program::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:557
pfn_notify
void(* pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData)
Definition: pi_cuda.hpp:415
_pi_mem::is_sub_buffer
bool is_sub_buffer() const noexcept
Definition: pi_cuda.hpp:362
_pi_queue::get_context
_pi_context * get_context() const
Definition: pi_cuda.hpp:404
_pi_context::is_primary
bool is_primary() const noexcept
Definition: pi_cuda.hpp:191
_pi_mem::mem_::buffer_mem_::mapFlags_
pi_map_flags mapFlags_
Original flags for the mapped region.
Definition: pi_cuda.hpp:245
_pi_event::get_end_time
pi_uint64 get_end_time() const
Definition: pi_cuda.cpp:407
_pi_program::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:555
_pi_command_type
_pi_command_type
Definition: pi.h:399
_pi_kernel::set_kernel_arg
void set_kernel_arg(int index, size_t size, const void *arg)
Definition: pi_cuda.hpp:718
_pi_kernel::arguments::args_size_t
std::vector< size_t > args_size_t
Definition: pi_cuda.hpp:601
_pi_mem::mem_::buffer_mem_::unmap
void unmap(void *) noexcept
Detach the allocation from the host memory.
Definition: pi_cuda.hpp:287
pi_map_flags
pi_bitfield pi_map_flags
Definition: pi.h:557
cuda_piMemRelease
pi_result cuda_piMemRelease(pi_mem memObj)
Decreases the reference count of the Mem object.
Definition: pi_cuda.cpp:1964
_pi_program::get
native_type get() const noexcept
Definition: pi_cuda.hpp:553
_pi_mem::mem_::surface_mem_::array_
CUarray array_
Definition: pi_cuda.hpp:305
_pi_mem::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:372
_pi_program::set_metadata
pi_result set_metadata(const pi_device_binary_property *metadata, size_t length)
Definition: pi_cuda.cpp:500
_pi_event::make_native
static pi_event make_native(pi_command_type type, pi_queue queue)
Definition: pi_cuda.hpp:476
_pi_context::kind_
enum _pi_context::kind kind_
_pi_mem::mem_::surface_mem_::get_image_type
pi_mem_type get_image_type() const noexcept
Definition: pi_cuda.hpp:313
_pi_event::get_queue
pi_queue get_queue() const noexcept
Definition: pi_cuda.hpp:431
_pi_kernel::REQD_THREADS_PER_BLOCK_DIMENSIONS
static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS
Definition: pi_cuda.hpp:588
cuda_piKernelGetGroupInfo
pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:2521
PI_EVENT_SUBMITTED
@ PI_EVENT_SUBMITTED
Definition: pi.h:125
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:353
_pi_mem::mem_::buffer_mem_::map_to_ptr
void * map_to_ptr(size_t offset, pi_map_flags flags) noexcept
Returns a pointer to data visible on the host that contains the data on the device associated with th...
Definition: pi_cuda.hpp:273
pi_int32
int32_t pi_int32
Definition: pi.h:67
_pi_context
PI context mapping to a CUDA context object.
Definition: pi_cuda.hpp:148
cuda_piKernelRelease
pi_result cuda_piKernelRelease(pi_kernel kernel)
Definition: pi_cuda.cpp:3319
_pi_program::binarySizeInBytes_
size_t binarySizeInBytes_
Definition: pi_cuda.hpp:527
cuda_piQueueRelease
pi_result cuda_piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:2207
_pi_device
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:71
_pi_event::record
pi_result record()
Definition: pi_cuda.cpp:415
_pi_context::get_device
pi_device get_device() const noexcept
Definition: pi_cuda.hpp:187
_pi_context::invoke_extended_deleters
void invoke_extended_deleters()
Definition: pi_cuda.hpp:174
_pi_kernel::arguments::get_indices
const args_index_t & get_indices() const noexcept
Definition: pi_cuda.hpp:653
_pi_kernel::clear_local_size
void clear_local_size()
Definition: pi_cuda.hpp:736