DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_cuda.hpp
Go to the documentation of this file.
1 //===-- pi_cuda.hpp - CUDA Plugin -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
11 
17 
18 #ifndef PI_CUDA_HPP
19 #define PI_CUDA_HPP
20 
21 // This version should be incremented for any change made to this file or its
22 // corresponding .cpp file.
23 #define _PI_CUDA_PLUGIN_VERSION 1
24 
25 #define _PI_CUDA_PLUGIN_VERSION_STRING \
26  _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION)
27 
28 #include "sycl/detail/pi.h"
29 #include <algorithm>
30 #include <array>
31 #include <atomic>
32 #include <cassert>
33 #include <cstring>
34 #include <cuda.h>
35 #include <functional>
36 #include <limits>
37 #include <memory>
38 #include <mutex>
39 #include <numeric>
40 #include <stdint.h>
41 #include <string>
42 #include <unordered_map>
43 #include <vector>
44 
45 extern "C" {
46 
61  pi_kernel_group_info param_name,
62  size_t param_value_size, void *param_value,
63  size_t *param_value_size_ret);
65 }
66 
67 using _pi_stream_guard = std::unique_lock<std::mutex>;
68 
74 struct _pi_platform {
75  std::vector<std::unique_ptr<_pi_device>> devices_;
76 };
77 
83 struct _pi_device {
84 private:
85  using native_type = CUdevice;
86 
87  native_type cuDevice_;
88  CUcontext cuContext_;
89  CUevent evBase_; // CUDA event used as base counter
90  std::atomic_uint32_t refCount_;
91  pi_platform platform_;
92 
93  static constexpr pi_uint32 max_work_item_dimensions = 3u;
94  size_t max_work_item_sizes[max_work_item_dimensions];
95  int max_work_group_size;
96 
97 public:
98  _pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase,
99  pi_platform platform)
100  : cuDevice_(cuDevice), cuContext_(cuContext),
101  evBase_(evBase), refCount_{1}, platform_(platform) {}
102 
103  ~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); }
104 
105  native_type get() const noexcept { return cuDevice_; };
106 
107  CUcontext get_context() const noexcept { return cuContext_; };
108 
109  pi_uint32 get_reference_count() const noexcept { return refCount_; }
110 
111  pi_platform get_platform() const noexcept { return platform_; };
112 
114 
115  void save_max_work_item_sizes(size_t size,
116  size_t *save_max_work_item_sizes) noexcept {
117  memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
118  };
119 
120  void save_max_work_group_size(int value) noexcept {
121  max_work_group_size = value;
122  };
123 
124  void get_max_work_item_sizes(size_t ret_size,
125  size_t *ret_max_work_item_sizes) const noexcept {
126  memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
127  };
128 
129  int get_max_work_group_size() const noexcept { return max_work_group_size; };
130 };
131 
170 struct _pi_context {
171 
172  struct deleter_data {
174  void *user_data;
175 
176  void operator()() { function(user_data); }
177  };
178 
180 
183  std::atomic_uint32_t refCount_;
184 
186  : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} {
188  };
189 
191 
193  std::lock_guard<std::mutex> guard(mutex_);
194  for (auto &deleter : extended_deleters_) {
195  deleter();
196  }
197  }
198 
200  void *user_data) {
201  std::lock_guard<std::mutex> guard(mutex_);
202  extended_deleters_.emplace_back(deleter_data{function, user_data});
203  }
204 
205  pi_device get_device() const noexcept { return deviceId_; }
206 
207  native_type get() const noexcept { return cuContext_; }
208 
210 
212 
213  pi_uint32 get_reference_count() const noexcept { return refCount_; }
214 
215 private:
216  std::mutex mutex_;
217  std::vector<deleter_data> extended_deleters_;
218 };
219 
224 struct _pi_mem {
225 
226  // TODO: Move as much shared data up as possible
228 
229  // Context where the memory object is accessibles
231 
233  std::atomic_uint32_t refCount_;
234  enum class mem_type { buffer, surface } mem_type_;
235 
241  union mem_ {
242  // Handler for plain, pointer-based CUDA allocations
243  struct buffer_mem_ {
245 
246  // If this allocation is a sub-buffer (i.e., a view on an existing
247  // allocation), this is the pointer to the parent handler structure
249  // CUDA handler for the pointer
251 
253  void *hostPtr_;
255  size_t size_;
257  size_t mapOffset_;
259  void *mapPtr_;
262 
270  enum class alloc_mode {
271  classic,
272  use_host_ptr,
273  copy_in,
275  } allocMode_;
276 
277  native_type get() const noexcept { return ptr_; }
278 
279  size_t get_size() const noexcept { return size_; }
280 
281  void *get_map_ptr() const noexcept { return mapPtr_; }
282 
283  size_t get_map_offset(void *) const noexcept { return mapOffset_; }
284 
289  void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept {
290  assert(mapPtr_ == nullptr);
291  mapOffset_ = offset;
292  mapFlags_ = flags;
293  if (hostPtr_) {
294  mapPtr_ = static_cast<char *>(hostPtr_) + offset;
295  } else {
296  // TODO: Allocate only what is needed based on the offset
297  mapPtr_ = static_cast<void *>(malloc(this->get_size()));
298  }
299  return mapPtr_;
300  }
301 
303  void unmap(void *) noexcept {
304  assert(mapPtr_ != nullptr);
305 
306  if (mapPtr_ != hostPtr_) {
307  free(mapPtr_);
308  }
309  mapPtr_ = nullptr;
310  mapOffset_ = 0;
311  }
312 
313  pi_map_flags get_map_flags() const noexcept {
314  assert(mapPtr_ != nullptr);
315  return mapFlags_;
316  }
317  } buffer_mem_;
318 
319  // Handler data for surface object (i.e. Images)
320  struct surface_mem_ {
321  CUarray array_;
322  CUsurfObject surfObj_;
324 
325  CUarray get_array() const noexcept { return array_; }
326 
327  CUsurfObject get_surface() const noexcept { return surfObj_; }
328 
329  pi_mem_type get_image_type() const noexcept { return imageType_; }
330  } surface_mem_;
331  } mem_;
332 
335  CUdeviceptr ptr, void *host_ptr, size_t size)
337  mem_.buffer_mem_.ptr_ = ptr;
338  mem_.buffer_mem_.parent_ = parent;
340  mem_.buffer_mem_.size_ = size;
342  mem_.buffer_mem_.mapPtr_ = nullptr;
345  if (is_sub_buffer()) {
347  } else {
349  }
350  };
351 
353  _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf,
354  pi_mem_type image_type, void *host_ptr)
356  // Ignore unused parameter
357  (void)host_ptr;
358 
359  mem_.surface_mem_.array_ = array;
360  mem_.surface_mem_.surfObj_ = surf;
361  mem_.surface_mem_.imageType_ = image_type;
363  }
364 
366  if (mem_type_ == mem_type::buffer) {
367  if (is_sub_buffer()) {
369  return;
370  }
371  }
373  }
374 
375  // TODO: Move as many shared funcs up as possible
376  bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; }
377 
378  bool is_sub_buffer() const noexcept {
379  return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr));
380  }
381 
382  bool is_image() const noexcept { return mem_type_ == mem_type::surface; }
383 
384  pi_context get_context() const noexcept { return context_; }
385 
387 
389 
390  pi_uint32 get_reference_count() const noexcept { return refCount_; }
391 };
392 
395 struct _pi_queue {
397  static constexpr int default_num_compute_streams = 128;
398  static constexpr int default_num_transfer_streams = 64;
399 
400  std::vector<native_type> compute_streams_;
401  std::vector<native_type> transfer_streams_;
402  // delay_compute_ keeps track of which streams have been recently reused and
403  // their next use should be delayed. If a stream has been recently reused it
404  // will be skipped the next time it would be selected round-robin style. When
405  // skipped, its delay flag is cleared.
406  std::vector<bool> delay_compute_;
407  // keep track of which streams have applied barrier
408  std::vector<bool> compute_applied_barrier_;
409  std::vector<bool> transfer_applied_barrier_;
415  std::atomic_uint32_t refCount_;
416  std::atomic_uint32_t eventCount_;
417  std::atomic_uint32_t compute_stream_idx_;
418  std::atomic_uint32_t transfer_stream_idx_;
419  unsigned int num_compute_streams_;
420  unsigned int num_transfer_streams_;
423  unsigned int flags_;
424  // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be
425  // locked at the same time, compute_stream_sync_mutex_ should be locked first
426  // to avoid deadlocks
430  std::mutex barrier_mutex_;
432 
433  _pi_queue(std::vector<CUstream> &&compute_streams,
434  std::vector<CUstream> &&transfer_streams, _pi_context *context,
435  _pi_device *device, pi_queue_properties properties,
436  unsigned int flags, bool backend_owns = true)
437  : compute_streams_{std::move(compute_streams)},
438  transfer_streams_{std::move(transfer_streams)},
439  delay_compute_(compute_streams_.size(), false),
442  device_{device}, properties_{properties}, refCount_{1}, eventCount_{0},
446  flags_(flags), has_ownership_{backend_owns} {
449  }
450 
454  }
455 
457  pi_uint32 stream_i);
459  pi_uint32 stream_i);
460 
461  // get_next_compute/transfer_stream() functions return streams from
462  // appropriate pools in round-robin fashion
463  native_type get_next_compute_stream(pi_uint32 *stream_token = nullptr);
464  // this overload tries select a stream that was used by one of dependancies.
465  // If that is not possible returns a new stream. If a stream is reused it
466  // returns a lock that needs to remain locked as long as the stream is in use
467  native_type get_next_compute_stream(pi_uint32 num_events_in_wait_list,
468  const pi_event *event_wait_list,
469  _pi_stream_guard &guard,
470  pi_uint32 *stream_token = nullptr);
473 
474  bool has_been_synchronized(pi_uint32 stream_token) {
475  // stream token not associated with one of the compute streams
476  if (stream_token == std::numeric_limits<pi_uint32>::max()) {
477  return false;
478  }
479  return last_sync_compute_streams_ > stream_token;
480  }
481 
482  bool can_reuse_stream(pi_uint32 stream_token) {
483  // stream token not associated with one of the compute streams
484  if (stream_token == std::numeric_limits<pi_uint32>::max()) {
485  return false;
486  }
487  // If the command represented by the stream token was not the last command
488  // enqueued to the stream we can not reuse the stream - we need to allow for
489  // commands enqueued after it and the one we are about to enqueue to run
490  // concurrently
491  bool is_last_command =
492  (compute_stream_idx_ - stream_token) <= compute_streams_.size();
493  // If there was a barrier enqueued to the queue after the command
494  // represented by the stream token we should not reuse the stream, as we can
495  // not take that stream into account for the bookkeeping for the next
496  // barrier - such a stream would not be synchronized with. Performance-wise
497  // it does not matter that we do not reuse the stream, as the work
498  // represented by the stream token is guaranteed to be complete by the
499  // barrier before any work we are about to enqueue to the stream will start,
500  // so the event does not need to be synchronized with.
501  return is_last_command && !has_been_synchronized(stream_token);
502  }
503 
504  template <typename T> bool all_of(T &&f) {
505  {
506  std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
507  unsigned int end =
508  std::min(static_cast<unsigned int>(compute_streams_.size()),
510  if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end,
511  f))
512  return false;
513  }
514  {
515  std::lock_guard<std::mutex> transfer_guard(transfer_stream_mutex_);
516  unsigned int end =
517  std::min(static_cast<unsigned int>(transfer_streams_.size()),
519  if (!std::all_of(transfer_streams_.begin(),
520  transfer_streams_.begin() + end, f))
521  return false;
522  }
523  return true;
524  }
525 
526  template <typename T> void for_each_stream(T &&f) {
527  {
528  std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
529  unsigned int end =
530  std::min(static_cast<unsigned int>(compute_streams_.size()),
532  for (unsigned int i = 0; i < end; i++) {
533  f(compute_streams_[i]);
534  }
535  }
536  {
537  std::lock_guard<std::mutex> transfer_guard(transfer_stream_mutex_);
538  unsigned int end =
539  std::min(static_cast<unsigned int>(transfer_streams_.size()),
541  for (unsigned int i = 0; i < end; i++) {
542  f(transfer_streams_[i]);
543  }
544  }
545  }
546 
547  template <bool ResetUsed = false, typename T> void sync_streams(T &&f) {
548  auto sync_compute = [&f, &streams = compute_streams_,
549  &delay = delay_compute_](unsigned int start,
550  unsigned int stop) {
551  for (unsigned int i = start; i < stop; i++) {
552  f(streams[i]);
553  delay[i] = false;
554  }
555  };
556  auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start,
557  unsigned int stop) {
558  for (unsigned int i = start; i < stop; i++) {
559  f(streams[i]);
560  }
561  };
562  {
563  unsigned int size = static_cast<unsigned int>(compute_streams_.size());
564  std::lock_guard<std::mutex> compute_sync_guard(
566  std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
567  unsigned int start = last_sync_compute_streams_;
568  unsigned int end = num_compute_streams_ < size
570  : compute_stream_idx_.load();
571  if (end - start >= size) {
572  sync_compute(0, size);
573  } else {
574  start %= size;
575  end %= size;
576  if (start <= end) {
577  sync_compute(start, end);
578  } else {
579  sync_compute(start, size);
580  sync_compute(0, end);
581  }
582  }
583  if (ResetUsed) {
585  }
586  }
587  {
588  unsigned int size = static_cast<unsigned int>(transfer_streams_.size());
589  if (size > 0) {
590  std::lock_guard<std::mutex> transfer_guard(transfer_stream_mutex_);
591  unsigned int start = last_sync_transfer_streams_;
592  unsigned int end = num_transfer_streams_ < size
594  : transfer_stream_idx_.load();
595  if (end - start >= size) {
596  sync_transfer(0, size);
597  } else {
598  start %= size;
599  end %= size;
600  if (start <= end) {
601  sync_transfer(start, end);
602  } else {
603  sync_transfer(start, size);
604  sync_transfer(0, end);
605  }
606  }
607  if (ResetUsed) {
609  }
610  }
611  }
612  }
613 
614  _pi_context *get_context() const { return context_; };
615 
616  _pi_device *get_device() const { return device_; };
617 
619 
621 
622  pi_uint32 get_reference_count() const noexcept { return refCount_; }
623 
624  pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
625 
626  bool backend_has_ownership() const noexcept { return has_ownership_; }
627 };
628 
629 typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
630  void *userData);
633 struct _pi_event {
634 public:
636 
637  pi_result record();
638 
639  pi_result wait();
640 
641  pi_result start();
642 
643  native_type get() const noexcept { return evEnd_; };
644 
645  pi_queue get_queue() const noexcept { return queue_; }
646 
647  CUstream get_stream() const noexcept { return stream_; }
648 
649  pi_uint32 get_compute_stream_token() const noexcept { return streamToken_; }
650 
651  pi_command_type get_command_type() const noexcept { return commandType_; }
652 
653  pi_uint32 get_reference_count() const noexcept { return refCount_; }
654 
655  bool is_recorded() const noexcept { return isRecorded_; }
656 
657  bool is_started() const noexcept { return isStarted_; }
658 
659  bool is_completed() const noexcept;
660 
661  pi_int32 get_execution_status() const noexcept {
662 
663  if (!is_recorded()) {
664  return PI_EVENT_SUBMITTED;
665  }
666 
667  if (!is_completed()) {
668  return PI_EVENT_RUNNING;
669  }
670  return PI_EVENT_COMPLETE;
671  }
672 
673  pi_context get_context() const noexcept { return context_; };
674 
675  pi_uint32 increment_reference_count() { return ++refCount_; }
676 
677  pi_uint32 decrement_reference_count() { return --refCount_; }
678 
679  pi_uint32 get_event_id() const noexcept { return eventId_; }
680 
681  bool backend_has_ownership() const noexcept { return has_ownership_; }
682 
683  // Returns the counter time when the associated command(s) were enqueued
684  //
685  pi_uint64 get_queued_time() const;
686 
687  // Returns the counter time when the associated command(s) started execution
688  //
689  pi_uint64 get_start_time() const;
690 
691  // Returns the counter time when the associated command(s) completed
692  //
693  pi_uint64 get_end_time() const;
694 
695  // construct a native CUDA. This maps closely to the underlying CUDA event.
696  static pi_event
699  return new _pi_event(type, queue->get_context(), queue, stream,
700  stream_token);
701  }
702 
703  static pi_event make_with_native(pi_context context, CUevent eventNative) {
704  return new _pi_event(context, eventNative);
705  }
706 
707  pi_result release();
708 
709  ~_pi_event();
710 
711 private:
712  // This constructor is private to force programmers to use the make_native /
713  // make_user static members in order to create a pi_event for CUDA.
714  _pi_event(pi_command_type type, pi_context context, pi_queue queue,
715  CUstream stream, pi_uint32 stream_token);
716 
717  // This constructor is private to force programmers to use the
718  // make_with_native for event introp
719  _pi_event(pi_context context, CUevent eventNative);
720 
721  pi_command_type commandType_; // The type of command associated with event.
722 
723  std::atomic_uint32_t refCount_; // Event reference count.
724 
725  bool has_ownership_; // Signifies if event owns the native type.
726 
727  bool hasBeenWaitedOn_; // Signifies whether the event has been waited
728  // on through a call to wait(), which implies
729  // that it has completed.
730 
731  bool isRecorded_; // Signifies wether a native CUDA event has been recorded
732  // yet.
733  bool isStarted_; // Signifies wether the operation associated with the
734  // PI event has started or not
735  //
736 
737  pi_uint32 streamToken_;
738  pi_uint32 eventId_; // Queue identifier of the event.
739 
740  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
741  // event, this will be nullptr.
742 
743  native_type evStart_; // CUDA event handle associated with the start
744 
745  native_type evQueued_; // CUDA event handle associated with the time
746  // the command was enqueued
747 
748  pi_queue queue_; // pi_queue associated with the event. If this is a user
749  // event, this will be nullptr.
750 
751  CUstream stream_; // CUstream associated with the event. If this is a user
752  // event, this will be uninitialized.
753 
754  pi_context context_; // pi_context associated with the event. If this is a
755  // native event, this will be the same context associated
756  // with the queue_ member.
757 };
758 
761 struct _pi_program {
764  const char *binary_;
766  std::atomic_uint32_t refCount_;
768 
769  // Metadata
770  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
772  std::unordered_map<std::string, std::string> globalIDMD_;
773 
774  constexpr static size_t MAX_LOG_SIZE = 8192u;
775 
777  std::string buildOptions_;
779 
780  _pi_program(pi_context ctxt);
781  ~_pi_program();
782 
784  size_t length);
785 
786  pi_result set_binary(const char *binary, size_t binarySizeInBytes);
787 
788  pi_result build_program(const char *build_options);
789 
790  pi_context get_context() const { return context_; };
791 
792  native_type get() const noexcept { return module_; };
793 
795 
797 
798  pi_uint32 get_reference_count() const noexcept { return refCount_; }
799 };
800 
817 struct _pi_kernel {
818  using native_type = CUfunction;
819 
822  std::string name_;
825  std::atomic_uint32_t refCount_;
826 
829 
837  struct arguments {
838  static constexpr size_t MAX_PARAM_BYTES = 4000u;
839  using args_t = std::array<char, MAX_PARAM_BYTES>;
840  using args_size_t = std::vector<size_t>;
841  using args_index_t = std::vector<void *>;
846 
847  std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0};
848 
850  // Place the implicit offset index at the end of the indicies collection
851  indices_.emplace_back(&implicitOffsetArgs_);
852  }
853 
859  void add_arg(size_t index, size_t size, const void *arg,
860  size_t localSize = 0) {
861  if (index + 2 > indices_.size()) {
862  // Move implicit offset argument index with the end
863  indices_.resize(index + 2, indices_.back());
864  // Ensure enough space for the new argument
865  paramSizes_.resize(index + 1);
866  offsetPerIndex_.resize(index + 1);
867  }
868  paramSizes_[index] = size;
869  // calculate the insertion point on the array
870  size_t insertPos = std::accumulate(std::begin(paramSizes_),
871  std::begin(paramSizes_) + index, 0);
872  // Update the stored value for the argument
873  std::memcpy(&storage_[insertPos], arg, size);
874  indices_[index] = &storage_[insertPos];
875  offsetPerIndex_[index] = localSize;
876  }
877 
878  void add_local_arg(size_t index, size_t size) {
879  size_t localOffset = this->get_local_size();
880 
881  // maximum required alignment is the size of the largest vector type
882  const size_t max_alignment = sizeof(double) * 16;
883 
884  // for arguments smaller than the maximum alignment simply align to the
885  // size of the argument
886  const size_t alignment = std::min(max_alignment, size);
887 
888  // align the argument
889  size_t alignedLocalOffset = localOffset;
890  if (localOffset % alignment != 0) {
891  alignedLocalOffset += alignment - (localOffset % alignment);
892  }
893 
894  add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset),
895  size + (alignedLocalOffset - localOffset));
896  }
897 
898  void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {
899  assert(size == sizeof(std::uint32_t) * 3);
900  std::memcpy(implicitOffsetArgs_, implicitOffset, size);
901  }
902 
904  std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0);
905  }
906 
907  const args_index_t &get_indices() const noexcept { return indices_; }
908 
910  return std::accumulate(std::begin(offsetPerIndex_),
911  std::end(offsetPerIndex_), 0);
912  }
913  } args_;
914 
915  _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name,
916  pi_program program, pi_context ctxt)
917  : function_{func}, functionWithOffsetParam_{funcWithOffsetParam},
918  name_{name}, context_{ctxt}, program_{program}, refCount_{1} {
923  this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
924  sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr);
925  (void)retError;
926  assert(retError == PI_SUCCESS);
927  }
928 
932  }
933 
934  pi_program get_program() const noexcept { return program_; }
935 
937 
939 
940  pi_uint32 get_reference_count() const noexcept { return refCount_; }
941 
942  native_type get() const noexcept { return function_; };
943 
946  };
947 
948  bool has_with_offset_parameter() const noexcept {
949  return functionWithOffsetParam_ != nullptr;
950  }
951 
952  pi_context get_context() const noexcept { return context_; };
953 
954  const char *get_name() const noexcept { return name_.c_str(); }
955 
960  pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; }
961 
962  void set_kernel_arg(int index, size_t size, const void *arg) {
963  args_.add_arg(index, size, arg);
964  }
965 
966  void set_kernel_local_arg(int index, size_t size) {
967  args_.add_local_arg(index, size);
968  }
969 
970  void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) {
971  args_.set_implicit_offset(size, implicitOffset);
972  }
973 
975  return args_.get_indices();
976  }
977 
978  pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); }
979 
981 };
982 
988 struct _pi_sampler {
989  std::atomic_uint32_t refCount_;
992 
994  : refCount_(1), props_(0), context_(context) {}
995 
997 
999 
1000  pi_uint32 get_reference_count() const noexcept { return refCount_; }
1001 };
1002 
1003 // -------------------------------------------------------------
1004 // Helper types and functions
1005 //
1006 
1007 #endif // PI_CUDA_HPP
_pi_sampler::context_
pi_context context_
Definition: pi_cuda.hpp:991
_pi_kernel::has_with_offset_parameter
bool has_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:948
_pi_mem::_pi_mem
_pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, size_t size)
Constructs the PI MEM handler for a non-typed allocation ("buffer")
Definition: pi_cuda.hpp:334
_pi_kernel::arguments::MAX_PARAM_BYTES
static constexpr size_t MAX_PARAM_BYTES
Definition: pi_cuda.hpp:838
_pi_kernel::native_type
CUfunction native_type
Definition: pi_cuda.hpp:818
_pi_event::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:653
_pi_mem::mem_
A PI Memory object represents either plain memory allocations ("Buffers" in OpenCL) or typed allocati...
Definition: pi_cuda.hpp:241
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:224
CUevent
struct CUevent_st * CUevent
Definition: backend_traits_cuda.hpp:28
_pi_program::globalIDMD_
std::unordered_map< std::string, std::string > globalIDMD_
Definition: pi_cuda.hpp:772
PI_MAP_WRITE
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:623
_pi_mem_type
_pi_mem_type
Definition: pi.h:481
_pi_mem::mem_::surface_mem_::get_surface
CUsurfObject get_surface() const noexcept
Definition: pi_cuda.hpp:327
CUstream
struct CUstream_st * CUstream
Definition: backend_traits_cuda.hpp:27
_pi_kernel::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:938
sycl::_V1::access::mode
mode
Definition: access.hpp:30
_pi_kernel::arguments::get_local_size
pi_uint32 get_local_size() const
Definition: pi_cuda.hpp:909
pi.h
_pi_context::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:211
_pi_sampler::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:989
_pi_program::context_
_pi_context * context_
Definition: pi_cuda.hpp:767
_pi_event::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:673
_pi_mem::mem_::buffer_mem_::hostPtr_
void * hostPtr_
Pointer associated with this device on the host.
Definition: pi_cuda.hpp:253
_pi_context::deviceId_
_pi_device * deviceId_
Definition: pi_cuda.hpp:182
_pi_kernel::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:940
_pi_program::kernelReqdWorkGroupSizeMD_
std::unordered_map< std::string, std::tuple< uint32_t, uint32_t, uint32_t > > kernelReqdWorkGroupSizeMD_
Definition: pi_cuda.hpp:771
_pi_queue::context_
_pi_context * context_
Definition: pi_cuda.hpp:410
CUdeviceptr
unsigned int CUdeviceptr
Definition: backend_traits_cuda.hpp:35
_pi_mem::is_image
bool is_image() const noexcept
Definition: pi_cuda.hpp:382
_pi_event::get_compute_stream_token
pi_uint32 get_compute_stream_token() const noexcept
Definition: pi_cuda.hpp:649
_pi_kernel::arguments::args_index_t
std::vector< void * > args_index_t
Definition: pi_cuda.hpp:841
_pi_event::is_started
bool is_started() const noexcept
Definition: pi_cuda.hpp:657
_pi_queue::barrier_tmp_event_
CUevent barrier_tmp_event_
Definition: pi_cuda.hpp:414
_pi_sampler::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:996
_pi_queue::get
native_type get()
Definition: pi_cuda.hpp:472
_pi_device::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:109
_pi_queue::get_next_compute_stream
native_type get_next_compute_stream(pi_uint32 *stream_token=nullptr)
Definition: pi_cuda.cpp:525
_pi_context::_pi_context
_pi_context(_pi_device *devId)
Definition: pi_cuda.hpp:185
_pi_context::deleter_data
Definition: pi_cuda.hpp:172
_pi_sampler::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:998
_pi_kernel::arguments::implicitOffsetArgs_
std::uint32_t implicitOffsetArgs_[3]
Definition: pi_cuda.hpp:847
_pi_kernel::arguments::add_local_arg
void add_local_arg(size_t index, size_t size)
Definition: pi_cuda.hpp:878
_pi_kernel::get_arg_indices
const arguments::args_index_t & get_arg_indices() const
Definition: pi_cuda.hpp:974
_pi_kernel::name_
std::string name_
Definition: pi_cuda.hpp:822
_pi_queue::native_type
CUstream native_type
Definition: pi_cuda.hpp:396
_pi_context::get
native_type get() const noexcept
Definition: pi_cuda.hpp:207
_pi_mem::mem_::buffer_mem_::size_
size_t size_
Size of the allocation in bytes.
Definition: pi_cuda.hpp:255
_pi_program::get_context
pi_context get_context() const
Definition: pi_cuda.hpp:790
_pi_context::cuContext_
native_type cuContext_
Definition: pi_cuda.hpp:181
_pi_program::MAX_LOG_SIZE
constexpr static size_t MAX_LOG_SIZE
Definition: pi_cuda.hpp:774
_pi_sampler::_pi_sampler
_pi_sampler(pi_context context)
Definition: pi_cuda.hpp:993
_pi_device::get_platform
pi_platform get_platform() const noexcept
Definition: pi_cuda.hpp:111
_pi_stream_guard
std::unique_lock< std::mutex > _pi_stream_guard
Definition: pi_cuda.hpp:67
_pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr
@ alloc_host_ptr
_pi_result
_pi_result
Definition: pi.h:153
_pi_kernel::~_pi_kernel
~_pi_kernel()
Definition: pi_cuda.hpp:929
sycl::_V1::detail::memcpy
void memcpy(void *Dst, const void *Src, size_t Size)
Definition: memcpy.hpp:16
_pi_queue::transfer_applied_barrier_
std::vector< bool > transfer_applied_barrier_
Definition: pi_cuda.hpp:409
_pi_queue::for_each_stream
void for_each_stream(T &&f)
Definition: pi_cuda.hpp:526
sycl::_V1::ext::oneapi::experimental::alignment
constexpr alignment_key::value_t< K > alignment
Definition: properties.hpp:349
_pi_queue::device_
_pi_device * device_
Definition: pi_cuda.hpp:411
_pi_context::native_type
CUcontext native_type
Definition: pi_cuda.hpp:179
_pi_device::_pi_device
_pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase, pi_platform platform)
Definition: pi_cuda.hpp:98
_pi_event::get_queued_time
pi_uint64 get_queued_time() const
Definition: pi_cuda.cpp:688
_pi_queue::default_num_transfer_streams
static constexpr int default_num_transfer_streams
Definition: pi_cuda.hpp:398
_pi_mem::mem_::buffer_mem_::get_map_offset
size_t get_map_offset(void *) const noexcept
Definition: pi_cuda.hpp:283
_pi_mem::context_
pi_context context_
Definition: pi_cuda.hpp:230
_pi_device::get_max_work_group_size
int get_max_work_group_size() const noexcept
Definition: pi_cuda.hpp:129
_pi_mem::is_buffer
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:376
_pi_queue::backend_has_ownership
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:626
_pi_queue::get_next_transfer_stream
native_type get_next_transfer_stream()
Definition: pi_cuda.cpp:586
_pi_queue::sync_streams
void sync_streams(T &&f)
Definition: pi_cuda.hpp:547
_pi_event::get_stream
CUstream get_stream() const noexcept
Definition: pi_cuda.hpp:647
_pi_program::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:766
_pi_mem::mem_::surface_mem_::get_array
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:325
PI_EVENT_RUNNING
@ PI_EVENT_RUNNING
Definition: pi.h:163
max
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
_pi_platform
A PI platform stores all known PI devices, in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:74
_pi_kernel::arguments::paramSizes_
args_size_t paramSizes_
Definition: pi_cuda.hpp:843
_pi_queue::can_reuse_stream
bool can_reuse_stream(pi_uint32 stream_token)
Definition: pi_cuda.hpp:482
_pi_context::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:213
_pi_mem::mem_::buffer_mem_::mapOffset_
size_t mapOffset_
Offset of the active mapped region.
Definition: pi_cuda.hpp:257
_pi_platform::devices_
std::vector< std::unique_ptr< _pi_device > > devices_
Definition: pi_cuda.hpp:75
_pi_program::build_program
pi_result build_program(const char *build_options)
Definition: pi_cuda.cpp:837
sycl::_V1::ext::intel::host_ptr
multi_ptr< ElementType, access::address_space::ext_intel_global_host_space, IsDecorated > host_ptr
Definition: usm_pointers.hpp:32
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:817
CUcontext
struct CUctx_st * CUcontext
Definition: backend_traits_cuda.hpp:26
_pi_sampler::props_
pi_uint32 props_
Definition: pi_cuda.hpp:990
_pi_queue::eventCount_
std::atomic_uint32_t eventCount_
Definition: pi_cuda.hpp:416
_pi_mem::mem_::buffer_mem_::alloc_mode
alloc_mode
alloc_mode classic: Just a normal buffer allocated on the device via cuda malloc use_host_ptr: Use an...
Definition: pi_cuda.hpp:270
_pi_context::set_extended_deleter
void set_extended_deleter(pi_context_extended_deleter function, void *user_data)
Definition: pi_cuda.hpp:199
cuda_piKernelRetain
pi_result cuda_piKernelRetain(pi_kernel kernel)
Definition: pi_cuda.cpp:3967
_pi_mem::~_pi_mem
~_pi_mem()
Definition: pi_cuda.hpp:365
_pi_queue::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:618
cuda_piDeviceRetain
pi_result cuda_piDeviceRetain(pi_device)
Definition: pi_cuda.cpp:1088
_pi_program::buildOptions_
std::string buildOptions_
Definition: pi_cuda.hpp:777
_pi_mem::mem_type_
enum _pi_mem::mem_type mem_type_
_pi_queue::barrier_mutex_
std::mutex barrier_mutex_
Definition: pi_cuda.hpp:430
_pi_queue::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:622
_pi_queue::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:620
_pi_kernel::arguments::args_t
std::array< char, MAX_PARAM_BYTES > args_t
Definition: pi_cuda.hpp:839
_pi_program::buildStatus_
pi_program_build_status buildStatus_
Definition: pi_cuda.hpp:778
_pi_kernel::functionWithOffsetParam_
native_type functionWithOffsetParam_
Definition: pi_cuda.hpp:821
_pi_program::module_
native_type module_
Definition: pi_cuda.hpp:763
_pi_queue::has_been_synchronized
bool has_been_synchronized(pi_uint32 stream_token)
Definition: pi_cuda.hpp:474
_pi_queue
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:395
_pi_context::~_pi_context
~_pi_context()
Definition: pi_cuda.hpp:190
_pi_kernel::get_num_args
pi_uint32 get_num_args() const noexcept
Returns the number of arguments, excluding the implicit global offset.
Definition: pi_cuda.hpp:960
_pi_queue::get_next_event_id
pi_uint32 get_next_event_id() noexcept
Definition: pi_cuda.hpp:624
_pi_event::make_native
static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, pi_uint32 stream_token=std::numeric_limits< pi_uint32 >::max())
Definition: pi_cuda.hpp:697
pi_uint32
uint32_t pi_uint32
Definition: pi.h:142
_pi_mem::mem_::buffer_mem_::ptr_
native_type ptr_
Definition: pi_cuda.hpp:250
_pi_mem::mem_::buffer_mem_::get_map_flags
pi_map_flags get_map_flags() const noexcept
Definition: pi_cuda.hpp:313
_pi_queue::barrier_event_
CUevent barrier_event_
Definition: pi_cuda.hpp:413
_pi_kernel::arguments::offsetPerIndex_
args_size_t offsetPerIndex_
Definition: pi_cuda.hpp:845
_pi_mem::mem_::surface_mem_
struct _pi_mem::mem_::surface_mem_ surface_mem_
_pi_event::backend_has_ownership
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:681
_pi_event::decrement_reference_count
pi_uint32 decrement_reference_count()
Definition: pi_cuda.hpp:677
_pi_kernel::function_
native_type function_
Definition: pi_cuda.hpp:820
_pi_queue::properties_
pi_queue_properties properties_
Definition: pi_cuda.hpp:412
_pi_program::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:798
CUmodule
struct CUmod_st * CUmodule
Definition: backend_traits_cuda.hpp:29
_pi_kernel::arguments::storage_
args_t storage_
Definition: pi_cuda.hpp:842
_pi_queue::compute_stream_idx_
std::atomic_uint32_t compute_stream_idx_
Definition: pi_cuda.hpp:417
_pi_kernel::arguments::add_arg
void add_arg(size_t index, size_t size, const void *arg, size_t localSize=0)
Adds an argument to the kernel.
Definition: pi_cuda.hpp:859
_pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr
@ use_host_ptr
_pi_mem::mem_::buffer_mem_::get
native_type get() const noexcept
Definition: pi_cuda.hpp:277
_pi_queue::transfer_stream_wait_for_barrier_if_needed
void transfer_stream_wait_for_barrier_if_needed(CUstream stream, pi_uint32 stream_i)
Definition: pi_cuda.cpp:517
_pi_event::make_with_native
static pi_event make_with_native(pi_context context, CUevent eventNative)
Definition: pi_cuda.hpp:703
_pi_kernel::context_
pi_context context_
Definition: pi_cuda.hpp:823
pi_context_extended_deleter
void(* pi_context_extended_deleter)(void *user_data)
Definition: pi.h:1161
_pi_mem::mem_::surface_mem_::surfObj_
CUsurfObject surfObj_
Definition: pi_cuda.hpp:322
_pi_event::get_execution_status
pi_int32 get_execution_status() const noexcept
Definition: pi_cuda.hpp:661
_pi_device::get_elapsed_time
pi_uint64 get_elapsed_time(CUevent) const
Definition: pi_cuda.cpp:680
_pi_context::deleter_data::user_data
void * user_data
Definition: pi_cuda.hpp:174
_pi_kernel::get_program
pi_program get_program() const noexcept
Definition: pi_cuda.hpp:934
_pi_mem::mem_type
mem_type
Definition: pi_cuda.hpp:234
cuda_piProgramRelease
pi_result cuda_piProgramRelease(pi_program program)
Decreases the reference count of a pi_program object.
Definition: pi_cuda.cpp:3821
_pi_kernel::_pi_kernel
_pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt)
Definition: pi_cuda.hpp:915
_pi_event::get_event_id
pi_uint32 get_event_id() const noexcept
Definition: pi_cuda.hpp:679
_pi_queue::~_pi_queue
~_pi_queue()
Definition: pi_cuda.hpp:451
_pi_event::get_command_type
pi_command_type get_command_type() const noexcept
Definition: pi_cuda.hpp:651
_pi_event::native_type
CUevent native_type
Definition: pi_cuda.hpp:635
_pi_mem::mem_::buffer_mem_::get_size
size_t get_size() const noexcept
Definition: pi_cuda.hpp:279
_pi_kernel::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:936
_pi_mem::mem_::buffer_mem_::mapPtr_
void * mapPtr_
Pointer to the active mapped region, if any.
Definition: pi_cuda.hpp:259
pi_uint64
uint64_t pi_uint64
Definition: pi.h:143
cuda_piContextRetain
pi_result cuda_piContextRetain(pi_context context)
Definition: pi_cuda.cpp:1128
_pi_mem::mem_::buffer_mem_::parent_
pi_mem parent_
Definition: pi_cuda.hpp:248
_pi_queue::has_ownership_
bool has_ownership_
Definition: pi_cuda.hpp:431
_pi_queue::last_sync_transfer_streams_
unsigned int last_sync_transfer_streams_
Definition: pi_cuda.hpp:422
_pi_device_binary_property_struct
Definition: pi.h:768
_pi_device::save_max_work_group_size
void save_max_work_group_size(int value) noexcept
Definition: pi_cuda.hpp:120
_pi_event::wait
pi_result wait()
Definition: pi_cuda.cpp:733
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:761
_pi_queue::transfer_stream_idx_
std::atomic_uint32_t transfer_stream_idx_
Definition: pi_cuda.hpp:418
_pi_sampler
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:988
_pi_queue::delay_compute_
std::vector< bool > delay_compute_
Definition: pi_cuda.hpp:406
sycl::_V1::access::target::device
@ device
_pi_sampler::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:1000
_pi_device::get
native_type get() const noexcept
Definition: pi_cuda.hpp:105
_pi_kernel::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:952
_pi_queue::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:415
_pi_program::infoLog_
char infoLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:776
_pi_event::is_completed
bool is_completed() const noexcept
Definition: pi_cuda.cpp:663
_pi_queue::get_device
_pi_device * get_device() const
Definition: pi_cuda.hpp:616
cuda_piMemRetain
pi_result cuda_piMemRetain(pi_mem mem)
Definition: pi_cuda.cpp:3556
_pi_mem::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:390
_pi_event::release
pi_result release()
Definition: pi_cuda.cpp:745
_pi_mem::mem_::buffer_mem_
Definition: pi_cuda.hpp:243
PI_EVENT_COMPLETE
@ PI_EVENT_COMPLETE
Definition: pi.h:162
_pi_mem::mem_type::buffer
@ buffer
_pi_kernel::get_local_size
pi_uint32 get_local_size() const noexcept
Definition: pi_cuda.hpp:978
_pi_mem::mem_::buffer_mem_::native_type
CUdeviceptr native_type
Definition: pi_cuda.hpp:244
_pi_device::save_max_work_item_sizes
void save_max_work_item_sizes(size_t size, size_t *save_max_work_item_sizes) noexcept
Definition: pi_cuda.hpp:115
_pi_program::~_pi_program
~_pi_program()
Definition: pi_cuda.cpp:778
_pi_event::get
native_type get() const noexcept
Definition: pi_cuda.hpp:643
_pi_kernel::arguments::set_implicit_offset
void set_implicit_offset(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:898
_pi_mem::mem_::buffer_mem_::alloc_mode::copy_in
@ copy_in
_pi_program::binary_
const char * binary_
Definition: pi_cuda.hpp:764
_pi_mem::mem_::buffer_mem_::get_map_ptr
void * get_map_ptr() const noexcept
Definition: pi_cuda.hpp:281
all_of
bool all_of(const simd_mask< _Tp, _Abi > &) noexcept
_pi_kernel::get_with_offset_parameter
native_type get_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:944
_pi_context::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:209
_pi_event::~_pi_event
~_pi_event()
Definition: pi_cuda.cpp:638
_pi_event::increment_reference_count
pi_uint32 increment_reference_count()
Definition: pi_cuda.hpp:675
_pi_queue::num_compute_streams_
unsigned int num_compute_streams_
Definition: pi_cuda.hpp:419
_pi_context::deleter_data::operator()
void operator()()
Definition: pi_cuda.hpp:176
cuda_piDeviceRelease
pi_result cuda_piDeviceRelease(pi_device)
Definition: pi_cuda.cpp:1204
_pi_kernel::arguments::clear_local_size
void clear_local_size()
Definition: pi_cuda.hpp:903
_pi_mem::mem_::surface_mem_::imageType_
pi_mem_type imageType_
Definition: pi_cuda.hpp:323
_pi_event::_pi_event
_pi_event()
Definition: pi_esimd_emulator.hpp:202
pi_queue_properties
pi_bitfield pi_queue_properties
Definition: pi.h:646
_pi_kernel::program_
pi_program program_
Definition: pi_cuda.hpp:824
_pi_program::_pi_program
_pi_program()
Definition: pi_esimd_emulator.hpp:211
_pi_kernel::arguments
Structure that holds the arguments to the kernel.
Definition: pi_cuda.hpp:837
_pi_mem::mem_type::surface
@ surface
_pi_queue::compute_stream_wait_for_barrier_if_needed
void compute_stream_wait_for_barrier_if_needed(CUstream stream, pi_uint32 stream_i)
Definition: pi_cuda.cpp:509
_pi_mem::mem_::surface_mem_
Definition: pi_cuda.hpp:320
_pi_program::native_type
CUmodule native_type
Definition: pi_cuda.hpp:762
PI_PROGRAM_BUILD_STATUS_NONE
@ PI_PROGRAM_BUILD_STATUS_NONE
Definition: pi.h:185
_pi_event::get_start_time
pi_uint64 get_start_time() const
Definition: pi_cuda.cpp:693
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:633
_pi_queue::compute_stream_mutex_
std::mutex compute_stream_mutex_
Definition: pi_cuda.hpp:428
_pi_queue::compute_applied_barrier_
std::vector< bool > compute_applied_barrier_
Definition: pi_cuda.hpp:408
_pi_program::errorLog_
char errorLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:776
_pi_context::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:183
_pi_kernel::reqdThreadsPerBlock_
size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]
Definition: pi_cuda.hpp:828
_pi_mem::mem_::buffer_mem_
struct _pi_mem::mem_::buffer_mem_ buffer_mem_
_pi_mem::mem_::buffer_mem_::alloc_mode::classic
@ classic
_pi_mem::refCount_
std::atomic_uint32_t refCount_
Reference counting of the handler.
Definition: pi_cuda.hpp:233
_pi_mem::mem_
union _pi_mem::mem_ mem_
_pi_kernel::set_implicit_offset_arg
void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:970
_pi_queue::compute_streams_
std::vector< native_type > compute_streams_
Definition: pi_cuda.hpp:400
_pi_mem::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:386
_pi_queue::num_transfer_streams_
unsigned int num_transfer_streams_
Definition: pi_cuda.hpp:420
_pi_kernel::get_name
const char * get_name() const noexcept
Definition: pi_cuda.hpp:954
cuda_piProgramRetain
pi_result cuda_piProgramRetain(pi_program program)
Definition: pi_cuda.cpp:3811
_pi_event::is_recorded
bool is_recorded() const noexcept
Definition: pi_cuda.hpp:655
_pi_kernel::arguments::arguments
arguments()
Definition: pi_cuda.hpp:849
_pi_device::get_max_work_item_sizes
void get_max_work_item_sizes(size_t ret_size, size_t *ret_max_work_item_sizes) const noexcept
Definition: pi_cuda.hpp:124
_pi_program::set_binary
pi_result set_binary(const char *binary, size_t binarySizeInBytes)
Definition: pi_cuda.cpp:829
_pi_kernel::get
native_type get() const noexcept
Definition: pi_cuda.hpp:942
cuda_piQueueRetain
pi_result cuda_piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:2723
_pi_mem::_pi_mem
_pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr)
Constructs the PI allocation for an Image object (surface in CUDA)
Definition: pi_cuda.hpp:353
_pi_program_build_status
_pi_program_build_status
Definition: pi.h:184
_pi_kernel::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:825
_pi_kernel::arguments::indices_
args_index_t indices_
Definition: pi_cuda.hpp:844
_pi_queue::_pi_queue
_pi_queue(std::vector< CUstream > &&compute_streams, std::vector< CUstream > &&transfer_streams, _pi_context *context, _pi_device *device, pi_queue_properties properties, unsigned int flags, bool backend_owns=true)
Definition: pi_cuda.hpp:433
_pi_kernel::args_
struct _pi_kernel::arguments args_
_pi_queue::transfer_stream_mutex_
std::mutex transfer_stream_mutex_
Definition: pi_cuda.hpp:429
_pi_kernel::set_kernel_local_arg
void set_kernel_local_arg(int index, size_t size)
Definition: pi_cuda.hpp:966
_pi_queue::default_num_compute_streams
static constexpr int default_num_compute_streams
Definition: pi_cuda.hpp:397
sycl::_V1::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:277
_pi_event::start
pi_result start()
Definition: pi_cuda.cpp:645
_pi_mem::mem_::buffer_mem_::allocMode_
enum _pi_mem::mem_::buffer_mem_::alloc_mode allocMode_
_pi_mem::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:384
cuda_piContextRelease
pi_result cuda_piContextRelease(pi_context ctxt)
Definition: pi_cuda.cpp:2302
_pi_kernel_group_info
_pi_kernel_group_info
Definition: pi.h:411
_pi_device::get_context
CUcontext get_context() const noexcept
Definition: pi_cuda.hpp:107
_pi_program::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:796
pfn_notify
void(* pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData)
Definition: pi_cuda.hpp:629
_pi_mem::is_sub_buffer
bool is_sub_buffer() const noexcept
Definition: pi_cuda.hpp:378
_pi_queue::get_context
_pi_context * get_context() const
Definition: pi_cuda.hpp:614
_pi_queue::transfer_streams_
std::vector< native_type > transfer_streams_
Definition: pi_cuda.hpp:401
_pi_mem::mem_::buffer_mem_::mapFlags_
pi_map_flags mapFlags_
Original flags for the mapped region.
Definition: pi_cuda.hpp:261
_pi_event::get_end_time
pi_uint64 get_end_time() const
Definition: pi_cuda.cpp:698
_pi_program::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:794
_pi_command_type
_pi_command_type
Definition: pi.h:447
_pi_kernel::set_kernel_arg
void set_kernel_arg(int index, size_t size, const void *arg)
Definition: pi_cuda.hpp:962
_pi_kernel::arguments::args_size_t
std::vector< size_t > args_size_t
Definition: pi_cuda.hpp:840
_pi_device::~_pi_device
~_pi_device()
Definition: pi_cuda.hpp:103
_pi_mem::mem_::buffer_mem_::unmap
void unmap(void *) noexcept
Detach the allocation from the host memory.
Definition: pi_cuda.hpp:303
pi_map_flags
pi_bitfield pi_map_flags
Definition: pi.h:621
cuda_piMemRelease
pi_result cuda_piMemRelease(pi_mem memObj)
Decreases the reference count of the Mem object.
Definition: pi_cuda.cpp:2431
_pi_queue::compute_stream_sync_mutex_
std::mutex compute_stream_sync_mutex_
Definition: pi_cuda.hpp:427
_pi_program::get
native_type get() const noexcept
Definition: pi_cuda.hpp:792
_pi_queue::last_sync_compute_streams_
unsigned int last_sync_compute_streams_
Definition: pi_cuda.hpp:421
_pi_mem::mem_::surface_mem_::array_
CUarray array_
Definition: pi_cuda.hpp:321
_pi_mem::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:388
_pi_program::set_metadata
pi_result set_metadata(const pi_device_binary_property *metadata, size_t length)
Definition: pi_cuda.cpp:789
_pi_queue::flags_
unsigned int flags_
Definition: pi_cuda.hpp:423
_pi_mem::mem_::surface_mem_::get_image_type
pi_mem_type get_image_type() const noexcept
Definition: pi_cuda.hpp:329
_pi_event::get_queue
pi_queue get_queue() const noexcept
Definition: pi_cuda.hpp:645
_pi_kernel::REQD_THREADS_PER_BLOCK_DIMENSIONS
static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS
Definition: pi_cuda.hpp:827
CUdevice
int CUdevice
Definition: backend_traits_cuda.hpp:25
cuda_piKernelGetGroupInfo
pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:3102
PI_EVENT_SUBMITTED
@ PI_EVENT_SUBMITTED
Definition: pi.h:164
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:414
_pi_mem::mem_::buffer_mem_::map_to_ptr
void * map_to_ptr(size_t offset, pi_map_flags flags) noexcept
Returns a pointer to data visible on the host that contains the data on the device associated with th...
Definition: pi_cuda.hpp:289
pi_int32
int32_t pi_int32
Definition: pi.h:141
_pi_context
PI context mapping to a CUDA context object.
Definition: pi_cuda.hpp:170
cuda_piKernelRelease
pi_result cuda_piKernelRelease(pi_kernel kernel)
Definition: pi_cuda.cpp:3975
_pi_program::binarySizeInBytes_
size_t binarySizeInBytes_
Definition: pi_cuda.hpp:765
cuda_piQueueRelease
pi_result cuda_piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:2731
sycl::_V1::malloc
void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind _CODELOCPARAM(&CodeLoc))
_pi_queue::all_of
bool all_of(T &&f)
Definition: pi_cuda.hpp:504
_pi_device
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:83
_pi_event::record
pi_result record()
Definition: pi_cuda.cpp:703
_pi_context::get_device
pi_device get_device() const noexcept
Definition: pi_cuda.hpp:205
_pi_context::invoke_extended_deleters
void invoke_extended_deleters()
Definition: pi_cuda.hpp:192
_pi_kernel::arguments::get_indices
const args_index_t & get_indices() const noexcept
Definition: pi_cuda.hpp:907
_pi_kernel::clear_local_size
void clear_local_size()
Definition: pi_cuda.hpp:980