DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_cuda.hpp
Go to the documentation of this file.
1 //===-- pi_cuda.hpp - CUDA Plugin -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
11 
17 
18 #ifndef PI_CUDA_HPP
19 #define PI_CUDA_HPP
20 
21 // This version should be incremented for any change made to this file or its
22 // corresponding .cpp file.
23 #define _PI_CUDA_PLUGIN_VERSION 1
24 
25 #define _PI_CUDA_PLUGIN_VERSION_STRING \
26  _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION)
27 
28 #include "sycl/detail/pi.h"
29 #include <array>
30 #include <atomic>
31 #include <cassert>
32 #include <cstring>
33 #include <cuda.h>
34 #include <functional>
35 #include <limits>
36 #include <memory>
37 #include <mutex>
38 #include <numeric>
39 #include <stdint.h>
40 #include <string>
41 #include <unordered_map>
42 #include <vector>
43 
44 extern "C" {
45 
60  pi_kernel_group_info param_name,
61  size_t param_value_size, void *param_value,
62  size_t *param_value_size_ret);
64 }
65 
66 using _pi_stream_guard = std::unique_lock<std::mutex>;
67 
73 struct _pi_platform { // Platform record: one event shared by all instances plus a device list
74  static CUevent evBase_; // CUDA event used as base counter
75  std::vector<std::unique_ptr<_pi_device>> devices_; // Devices held through owning unique_ptr handles
76 };
77 
83 struct _pi_device {
84 private:
85  using native_type = CUdevice;
86 
87  native_type cuDevice_;
88  std::atomic_uint32_t refCount_;
89  pi_platform platform_;
90 
91  static constexpr pi_uint32 max_work_item_dimensions = 3u;
92  size_t max_work_item_sizes[max_work_item_dimensions];
93  int max_work_group_size;
94 
95 public:
96  _pi_device(native_type cuDevice, pi_platform platform)
97  : cuDevice_(cuDevice), refCount_{1}, platform_(platform) {}
98 
99  native_type get() const noexcept { return cuDevice_; };
100 
101  pi_uint32 get_reference_count() const noexcept { return refCount_; }
102 
103  pi_platform get_platform() const noexcept { return platform_; };
104 
105  void save_max_work_item_sizes(size_t size,
106  size_t *save_max_work_item_sizes) noexcept {
107  memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
108  };
109 
110  void save_max_work_group_size(int value) noexcept {
111  max_work_group_size = value;
112  };
113 
114  void get_max_work_item_sizes(size_t ret_size,
115  size_t *ret_max_work_item_sizes) const noexcept {
116  memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
117  };
118 
119  int get_max_work_group_size() const noexcept { return max_work_group_size; };
120 };
121 
160 struct _pi_context {
161 
162  struct deleter_data {
164  void *user_data;
165 
166  void operator()() { function(user_data); }
167  };
168 
170 
171  enum class kind { primary, user_defined } kind_;
174  std::atomic_uint32_t refCount_;
175 
177  bool backend_owns = true)
178  : kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1},
179  has_ownership{backend_owns} {
181  };
182 
184 
186  std::lock_guard<std::mutex> guard(mutex_);
187  for (auto &deleter : extended_deleters_) {
188  deleter();
189  }
190  }
191 
193  void *user_data) {
194  std::lock_guard<std::mutex> guard(mutex_);
195  extended_deleters_.emplace_back(deleter_data{function, user_data});
196  }
197 
198  pi_device get_device() const noexcept { return deviceId_; }
199 
200  native_type get() const noexcept { return cuContext_; }
201 
202  bool is_primary() const noexcept { return kind_ == kind::primary; }
203 
205 
207 
208  pi_uint32 get_reference_count() const noexcept { return refCount_; }
209 
210  bool backend_has_ownership() const noexcept { return has_ownership; }
211 
212 private:
213  std::mutex mutex_;
214  std::vector<deleter_data> extended_deleters_;
215  const bool has_ownership;
216 };
217 
222 struct _pi_mem {
223 
224  // TODO: Move as much shared data up as possible
226 
227  // Context where the memory object is accessible
229 
231  std::atomic_uint32_t refCount_;
232  enum class mem_type { buffer, surface } mem_type_;
233 
239  union mem_ {
240  // Handler for plain, pointer-based CUDA allocations
241  struct buffer_mem_ {
243 
244  // If this allocation is a sub-buffer (i.e., a view on an existing
245  // allocation), this is the pointer to the parent handler structure
247  // CUDA handler for the pointer
249 
251  void *hostPtr_;
253  size_t size_;
255  size_t mapOffset_;
257  void *mapPtr_;
260 
268  enum class alloc_mode {
269  classic,
270  use_host_ptr,
271  copy_in,
274 
275  native_type get() const noexcept { return ptr_; }
276 
277  size_t get_size() const noexcept { return size_; }
278 
279  void *get_map_ptr() const noexcept { return mapPtr_; }
280 
281  size_t get_map_offset(void *) const noexcept { return mapOffset_; }
282 
287  void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept {
288  assert(mapPtr_ == nullptr);
289  mapOffset_ = offset;
290  mapFlags_ = flags;
291  if (hostPtr_) {
292  mapPtr_ = static_cast<char *>(hostPtr_) + offset;
293  } else {
294  // TODO: Allocate only what is needed based on the offset
295  mapPtr_ = static_cast<void *>(malloc(this->get_size()));
296  }
297  return mapPtr_;
298  }
299 
301  void unmap(void *) noexcept {
302  assert(mapPtr_ != nullptr);
303 
304  if (mapPtr_ != hostPtr_) {
305  free(mapPtr_);
306  }
307  mapPtr_ = nullptr;
308  mapOffset_ = 0;
309  }
310 
311  pi_map_flags get_map_flags() const noexcept {
312  assert(mapPtr_ != nullptr);
313  return mapFlags_;
314  }
316 
317  // Handler data for surface object (i.e. Images)
318  struct surface_mem_ {
319  CUarray array_;
320  CUsurfObject surfObj_;
322 
323  CUarray get_array() const noexcept { return array_; }
324 
325  CUsurfObject get_surface() const noexcept { return surfObj_; }
326 
327  pi_mem_type get_image_type() const noexcept { return imageType_; }
329  } mem_;
330 
333  CUdeviceptr ptr, void *host_ptr, size_t size)
334  : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} {
335  mem_.buffer_mem_.ptr_ = ptr;
336  mem_.buffer_mem_.parent_ = parent;
338  mem_.buffer_mem_.size_ = size;
340  mem_.buffer_mem_.mapPtr_ = nullptr;
343  if (is_sub_buffer()) {
345  } else {
347  }
348  };
349 
351  _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf,
352  pi_mem_type image_type, void *host_ptr)
353  : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} {
354  // Ignore unused parameter
355  (void)host_ptr;
356 
357  mem_.surface_mem_.array_ = array;
358  mem_.surface_mem_.surfObj_ = surf;
359  mem_.surface_mem_.imageType_ = image_type;
361  }
362 
364  if (mem_type_ == mem_type::buffer) {
365  if (is_sub_buffer()) {
367  return;
368  }
369  }
371  }
372 
373  // TODO: Move as many shared funcs up as possible
374  bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; }
375 
376  bool is_sub_buffer() const noexcept {
377  return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr));
378  }
379 
380  bool is_image() const noexcept { return mem_type_ == mem_type::surface; }
381 
382  pi_context get_context() const noexcept { return context_; }
383 
385 
387 
388  pi_uint32 get_reference_count() const noexcept { return refCount_; }
389 };
390 
393 struct _pi_queue {
395  static constexpr int default_num_compute_streams = 128;
396  static constexpr int default_num_transfer_streams = 64;
397 
398  std::vector<native_type> compute_streams_;
399  std::vector<native_type> transfer_streams_;
400  // delay_compute_ keeps track of which streams have been recently reused and
401  // their next use should be delayed. If a stream has been recently reused it
402  // will be skipped the next time it would be selected round-robin style. When
403  // skipped, its delay flag is cleared.
404  std::vector<bool> delay_compute_;
405  // keep track of which streams have applied barrier
406  std::vector<bool> compute_applied_barrier_;
407  std::vector<bool> transfer_applied_barrier_;
413  std::atomic_uint32_t refCount_;
414  std::atomic_uint32_t eventCount_;
415  std::atomic_uint32_t compute_stream_idx_;
416  std::atomic_uint32_t transfer_stream_idx_;
417  unsigned int num_compute_streams_;
418  unsigned int num_transfer_streams_;
421  unsigned int flags_;
422  // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be
423  // locked at the same time, compute_stream_sync_mutex_ should be locked first
424  // to avoid deadlocks
428  std::mutex barrier_mutex_;
430 
431  _pi_queue(std::vector<CUstream> &&compute_streams,
432  std::vector<CUstream> &&transfer_streams, _pi_context *context,
433  _pi_device *device, pi_queue_properties properties,
434  unsigned int flags, bool backend_owns = true)
435  : compute_streams_{std::move(compute_streams)},
436  transfer_streams_{std::move(transfer_streams)},
437  delay_compute_(compute_streams_.size(), false),
440  device_{device}, properties_{properties}, refCount_{1}, eventCount_{0},
444  flags_(flags), has_ownership_{backend_owns} {
447  }
448 
452  }
453 
455  pi_uint32 stream_i);
457  pi_uint32 stream_i);
458 
459  // get_next_compute/transfer_stream() functions return streams from
460  // appropriate pools in round-robin fashion
461  native_type get_next_compute_stream(pi_uint32 *stream_token = nullptr);
462  // This overload tries to select a stream that was used by one of the
463  // dependencies. If that is not possible, it returns a new stream. If a stream
464  // is reused, it returns a lock that must stay locked while the stream is in use
465  native_type get_next_compute_stream(pi_uint32 num_events_in_wait_list,
466  const pi_event *event_wait_list,
467  _pi_stream_guard &guard,
468  pi_uint32 *stream_token = nullptr);
471 
472  bool has_been_synchronized(pi_uint32 stream_token) {
473  // stream token not associated with one of the compute streams
474  if (stream_token == std::numeric_limits<pi_uint32>::max()) {
475  return false;
476  }
477  return last_sync_compute_streams_ >= stream_token;
478  }
479 
480  bool can_reuse_stream(pi_uint32 stream_token) {
481  // stream token not associated with one of the compute streams
482  if (stream_token == std::numeric_limits<pi_uint32>::max()) {
483  return false;
484  }
485  // If the command represented by the stream token was not the last command
486  // enqueued to the stream we can not reuse the stream - we need to allow for
487  // commands enqueued after it and the one we are about to enqueue to run
488  // concurrently
489  bool is_last_command =
490  (compute_stream_idx_ - stream_token) <= compute_streams_.size();
491  // If there was a barrier enqueued to the queue after the command
492  // represented by the stream token we should not reuse the stream, as we can
493  // not take that stream into account for the bookkeeping for the next
494  // barrier - such a stream would not be synchronized with. Performance-wise
495  // it does not matter that we do not reuse the stream, as the work
496  // represented by the stream token is guaranteed to be complete by the
497  // barrier before any work we are about to enqueue to the stream will start,
498  // so the event does not need to be synchronized with.
499  return is_last_command && !has_been_synchronized(stream_token);
500  }
501 
502  template <typename T> void for_each_stream(T &&f) {
503  {
504  std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
505  unsigned int end =
506  std::min(static_cast<unsigned int>(compute_streams_.size()),
508  for (unsigned int i = 0; i < end; i++) {
509  f(compute_streams_[i]);
510  }
511  }
512  {
513  std::lock_guard<std::mutex> transfer_guard(transfer_stream_mutex_);
514  unsigned int end =
515  std::min(static_cast<unsigned int>(transfer_streams_.size()),
517  for (unsigned int i = 0; i < end; i++) {
518  f(transfer_streams_[i]);
519  }
520  }
521  }
522 
523  template <bool ResetUsed = false, typename T> void sync_streams(T &&f) {
524  auto sync_compute = [&f, &streams = compute_streams_,
525  &delay = delay_compute_](unsigned int start,
526  unsigned int stop) {
527  for (unsigned int i = start; i < stop; i++) {
528  f(streams[i]);
529  delay[i] = false;
530  }
531  };
532  auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start,
533  unsigned int stop) {
534  for (unsigned int i = start; i < stop; i++) {
535  f(streams[i]);
536  }
537  };
538  {
539  unsigned int size = static_cast<unsigned int>(compute_streams_.size());
540  std::lock_guard compute_sync_guard(compute_stream_sync_mutex_);
541  std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
542  unsigned int start = last_sync_compute_streams_;
543  unsigned int end = num_compute_streams_ < size
545  : compute_stream_idx_.load();
546  if (ResetUsed) {
548  }
549  if (end - start >= size) {
550  sync_compute(0, size);
551  } else {
552  start %= size;
553  end %= size;
554  if (start <= end) {
555  sync_compute(start, end);
556  } else {
557  sync_compute(start, size);
558  sync_compute(0, end);
559  }
560  }
561  }
562  {
563  unsigned int size = static_cast<unsigned int>(transfer_streams_.size());
564  if (size > 0) {
565  std::lock_guard<std::mutex> transfer_guard(transfer_stream_mutex_);
566  unsigned int start = last_sync_transfer_streams_;
567  unsigned int end = num_transfer_streams_ < size
569  : transfer_stream_idx_.load();
570  if (ResetUsed) {
572  }
573  if (end - start >= size) {
574  sync_transfer(0, size);
575  } else {
576  start %= size;
577  end %= size;
578  if (start <= end) {
579  sync_transfer(start, end);
580  } else {
581  sync_transfer(start, size);
582  sync_transfer(0, end);
583  }
584  }
585  }
586  }
587  }
588 
589  _pi_context *get_context() const { return context_; };
590 
591  _pi_device *get_device() const { return device_; };
592 
594 
596 
597  pi_uint32 get_reference_count() const noexcept { return refCount_; }
598 
599  pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
600 
601  bool backend_has_ownership() const noexcept { return has_ownership_; }
602 };
603 
604 typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
605  void *userData);
608 struct _pi_event {
609 public:
611 
612  pi_result record();
613 
614  pi_result wait();
615 
616  pi_result start();
617 
618  native_type get() const noexcept { return evEnd_; };
619 
620  pi_queue get_queue() const noexcept { return queue_; }
621 
622  CUstream get_stream() const noexcept { return stream_; }
623 
624  pi_uint32 get_compute_stream_token() const noexcept { return streamToken_; }
625 
626  pi_command_type get_command_type() const noexcept { return commandType_; }
627 
628  pi_uint32 get_reference_count() const noexcept { return refCount_; }
629 
630  bool is_recorded() const noexcept { return isRecorded_; }
631 
632  bool is_started() const noexcept { return isStarted_; }
633 
634  bool is_completed() const noexcept;
635 
636  pi_int32 get_execution_status() const noexcept {
637 
638  if (!is_recorded()) {
639  return PI_EVENT_SUBMITTED;
640  }
641 
642  if (!is_completed()) {
643  return PI_EVENT_RUNNING;
644  }
645  return PI_EVENT_COMPLETE;
646  }
647 
648  pi_context get_context() const noexcept { return context_; };
649 
650  pi_uint32 increment_reference_count() { return ++refCount_; }
651 
652  pi_uint32 decrement_reference_count() { return --refCount_; }
653 
654  pi_uint32 get_event_id() const noexcept { return eventId_; }
655 
656  bool backend_has_ownership() const noexcept { return has_ownership_; }
657 
658  // Returns the counter time when the associated command(s) were enqueued
659  //
660  pi_uint64 get_queued_time() const;
661 
662  // Returns the counter time when the associated command(s) started execution
663  //
664  pi_uint64 get_start_time() const;
665 
666  // Returns the counter time when the associated command(s) completed
667  //
668  pi_uint64 get_end_time() const;
669 
670  // Constructs a native CUDA event. This maps closely to the underlying CUDA event.
671  static pi_event
674  return new _pi_event(type, queue->get_context(), queue, stream,
675  stream_token);
676  }
677 
678  static pi_event make_with_native(pi_context context, CUevent eventNative) {
679  return new _pi_event(context, eventNative);
680  }
681 
682  pi_result release();
683 
684  ~_pi_event();
685 
686 private:
687  // This constructor is private to force programmers to use the make_native /
688  // make_user static members in order to create a pi_event for CUDA.
689  _pi_event(pi_command_type type, pi_context context, pi_queue queue,
690  CUstream stream, pi_uint32 stream_token);
691 
692  // This constructor is private to force programmers to use the
693  // make_with_native static member for event interop
694  _pi_event(pi_context context, CUevent eventNative);
695 
696  pi_command_type commandType_; // The type of command associated with event.
697 
698  std::atomic_uint32_t refCount_; // Event reference count.
699 
700  bool has_ownership_; // Signifies if event owns the native type.
701 
702  bool hasBeenWaitedOn_; // Signifies whether the event has been waited
703  // on through a call to wait(), which implies
704  // that it has completed.
705 
706  bool isRecorded_; // Signifies whether a native CUDA event has been recorded
707  // yet.
708  bool isStarted_; // Signifies whether the operation associated with the
709  // PI event has started or not
710  //
711 
712  pi_uint32 streamToken_;
713  pi_uint32 eventId_; // Queue identifier of the event.
714 
715  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
716  // event, this will be nullptr.
717 
718  native_type evStart_; // CUDA event handle associated with the start
719 
720  native_type evQueued_; // CUDA event handle associated with the time
721  // the command was enqueued
722 
723  pi_queue queue_; // pi_queue associated with the event. If this is a user
724  // event, this will be nullptr.
725 
726  CUstream stream_; // CUstream associated with the event. If this is a user
727  // event, this will be uninitialized.
728 
729  pi_context context_; // pi_context associated with the event. If this is a
730  // native event, this will be the same context associated
731  // with the queue_ member.
732 };
733 
736 struct _pi_program {
739  const char *binary_;
741  std::atomic_uint32_t refCount_;
743 
744  // Metadata
745  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
747 
748  constexpr static size_t MAX_LOG_SIZE = 8192u;
749 
751  std::string buildOptions_;
753 
754  _pi_program(pi_context ctxt);
755  ~_pi_program();
756 
758  size_t length);
759 
760  pi_result set_binary(const char *binary, size_t binarySizeInBytes);
761 
762  pi_result build_program(const char *build_options);
763 
764  pi_context get_context() const { return context_; };
765 
766  native_type get() const noexcept { return module_; };
767 
769 
771 
772  pi_uint32 get_reference_count() const noexcept { return refCount_; }
773 };
774 
791 struct _pi_kernel {
792  using native_type = CUfunction;
793 
796  std::string name_;
799  std::atomic_uint32_t refCount_;
800 
803 
811  struct arguments {
812  static constexpr size_t MAX_PARAM_BYTES = 4000u;
813  using args_t = std::array<char, MAX_PARAM_BYTES>;
814  using args_size_t = std::vector<size_t>;
815  using args_index_t = std::vector<void *>;
820 
821  std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0};
822 
824  // Place the implicit offset index at the end of the indices collection
825  indices_.emplace_back(&implicitOffsetArgs_);
826  }
827 
833  void add_arg(size_t index, size_t size, const void *arg,
834  size_t localSize = 0) {
835  if (index + 2 > indices_.size()) {
836  // Move implicit offset argument index with the end
837  indices_.resize(index + 2, indices_.back());
838  // Ensure enough space for the new argument
839  paramSizes_.resize(index + 1);
840  offsetPerIndex_.resize(index + 1);
841  }
842  paramSizes_[index] = size;
843  // calculate the insertion point on the array
844  size_t insertPos = std::accumulate(std::begin(paramSizes_),
845  std::begin(paramSizes_) + index, 0);
846  // Update the stored value for the argument
847  std::memcpy(&storage_[insertPos], arg, size);
848  indices_[index] = &storage_[insertPos];
849  offsetPerIndex_[index] = localSize;
850  }
851 
852  void add_local_arg(size_t index, size_t size) {
853  size_t localOffset = this->get_local_size();
854 
855  // maximum required alignment is the size of the largest vector type
856  const size_t max_alignment = sizeof(double) * 16;
857 
858  // for arguments smaller than the maximum alignment simply align to the
859  // size of the argument
860  const size_t alignment = std::min(max_alignment, size);
861 
862  // align the argument
863  size_t alignedLocalOffset = localOffset;
864  if (localOffset % alignment != 0) {
865  alignedLocalOffset += alignment - (localOffset % alignment);
866  }
867 
868  add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset),
869  size + (alignedLocalOffset - localOffset));
870  }
871 
872  void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {
873  assert(size == sizeof(std::uint32_t) * 3);
874  std::memcpy(implicitOffsetArgs_, implicitOffset, size);
875  }
876 
878  std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0);
879  }
880 
881  const args_index_t &get_indices() const noexcept { return indices_; }
882 
884  return std::accumulate(std::begin(offsetPerIndex_),
885  std::end(offsetPerIndex_), 0);
886  }
887  } args_;
888 
889  _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name,
890  pi_program program, pi_context ctxt)
891  : function_{func}, functionWithOffsetParam_{funcWithOffsetParam},
892  name_{name}, context_{ctxt}, program_{program}, refCount_{1} {
898  sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr);
899  (void)retError;
900  assert(retError == PI_SUCCESS);
901  }
902 
906  }
907 
908  pi_program get_program() const noexcept { return program_; }
909 
911 
913 
914  pi_uint32 get_reference_count() const noexcept { return refCount_; }
915 
916  native_type get() const noexcept { return function_; };
917 
920  };
921 
922  bool has_with_offset_parameter() const noexcept {
923  return functionWithOffsetParam_ != nullptr;
924  }
925 
926  pi_context get_context() const noexcept { return context_; };
927 
928  const char *get_name() const noexcept { return name_.c_str(); }
929 
934  pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; }
935 
936  void set_kernel_arg(int index, size_t size, const void *arg) {
937  args_.add_arg(index, size, arg);
938  }
939 
940  void set_kernel_local_arg(int index, size_t size) {
941  args_.add_local_arg(index, size);
942  }
943 
944  void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) {
945  args_.set_implicit_offset(size, implicitOffset);
946  }
947 
949  return args_.get_indices();
950  }
951 
952  pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); }
953 
955 };
956 
962 struct _pi_sampler {
963  std::atomic_uint32_t refCount_;
966 
968  : refCount_(1), props_(0), context_(context) {}
969 
971 
973 
974  pi_uint32 get_reference_count() const noexcept { return refCount_; }
975 };
976 
977 // -------------------------------------------------------------
978 // Helper types and functions
979 //
980 
981 #endif // PI_CUDA_HPP
unsigned int CUdeviceptr
struct CUevent_st * CUevent
struct CUmod_st * CUmodule
struct CUstream_st * CUstream
int CUdevice
struct CUctx_st * CUcontext
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:221
void memcpy(void *Dst, const void *Src, std::size_t Size)
float length(T p) __NOEXC
Definition: builtins.hpp:1032
multi_ptr< ElementType, access::address_space::ext_intel_global_host_space, IsDecorated > host_ptr
Definition: pointers.hpp:40
void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind _CODELOCPARAM(&CodeLoc))
int32_t pi_int32
Definition: pi.h:102
pi_bitfield pi_map_flags
Definition: pi.h:552
_pi_result
Definition: pi.h:114
_pi_program_build_status
Definition: pi.h:144
@ PI_PROGRAM_BUILD_STATUS_NONE
Definition: pi.h:145
_pi_kernel_group_info
Definition: pi.h:344
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:347
uint64_t pi_uint64
Definition: pi.h:104
pi_bitfield pi_queue_properties
Definition: pi.h:577
uint32_t pi_uint32
Definition: pi.h:103
void(* pi_context_extended_deleter)(void *user_data)
Definition: pi.h:1070
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:554
_pi_mem_type
Definition: pi.h:412
_pi_command_type
Definition: pi.h:380
@ PI_EVENT_SUBMITTED
Definition: pi.h:125
@ PI_EVENT_COMPLETE
Definition: pi.h:123
@ PI_EVENT_RUNNING
Definition: pi.h:124
pi_result cuda_piMemRelease(pi_mem memObj)
Decreases the reference count of the Mem object.
Definition: pi_cuda.cpp:2314
pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:2920
pi_result cuda_piContextRetain(pi_context context)
Definition: pi_cuda.cpp:1057
pi_result cuda_piProgramRelease(pi_program program)
Decreases the reference count of a pi_program object.
Definition: pi_cuda.cpp:3591
pi_result cuda_piKernelRetain(pi_kernel kernel)
Definition: pi_cuda.cpp:3737
pi_result cuda_piContextRelease(pi_context ctxt)
Definition: pi_cuda.cpp:2149
pi_result cuda_piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:2561
pi_result cuda_piProgramRetain(pi_program program)
Definition: pi_cuda.cpp:3581
pi_result cuda_piKernelRelease(pi_kernel kernel)
Definition: pi_cuda.cpp:3745
pi_result cuda_piDeviceRetain(pi_device)
Definition: pi_cuda.cpp:1013
pi_result cuda_piMemRetain(pi_mem mem)
Definition: pi_cuda.cpp:3328
pi_result cuda_piDeviceRelease(pi_device)
Definition: pi_cuda.cpp:1133
pi_result cuda_piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:2553
void(* pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData)
Definition: pi_cuda.hpp:604
std::unique_lock< std::mutex > _pi_stream_guard
Definition: pi_cuda.hpp:66
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
PI context mapping to a CUDA context object.
Definition: pi_cuda.hpp:160
CUcontext native_type
Definition: pi_cuda.hpp:169
native_type get() const noexcept
Definition: pi_cuda.hpp:200
_pi_context(kind k, CUcontext ctxt, _pi_device *devId, bool backend_owns=true)
Definition: pi_cuda.hpp:176
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:206
void set_extended_deleter(pi_context_extended_deleter function, void *user_data)
Definition: pi_cuda.hpp:192
native_type cuContext_
Definition: pi_cuda.hpp:172
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:204
void invoke_extended_deleters()
Definition: pi_cuda.hpp:185
bool is_primary() const noexcept
Definition: pi_cuda.hpp:202
_pi_device * deviceId_
Definition: pi_cuda.hpp:173
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:210
enum _pi_context::kind kind_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:174
pi_device get_device() const noexcept
Definition: pi_cuda.hpp:198
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:208
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:83
pi_platform get_platform() const noexcept
Definition: pi_cuda.hpp:103
void get_max_work_item_sizes(size_t ret_size, size_t *ret_max_work_item_sizes) const noexcept
Definition: pi_cuda.hpp:114
void save_max_work_group_size(int value) noexcept
Definition: pi_cuda.hpp:110
_pi_device(native_type cuDevice, pi_platform platform)
Definition: pi_cuda.hpp:96
native_type get() const noexcept
Definition: pi_cuda.hpp:99
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:101
int get_max_work_group_size() const noexcept
Definition: pi_cuda.hpp:119
void save_max_work_item_sizes(size_t size, size_t *save_max_work_item_sizes) noexcept
Definition: pi_cuda.hpp:105
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:608
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:628
static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, pi_uint32 stream_token=std::numeric_limits< pi_uint32 >::max())
Definition: pi_cuda.hpp:672
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:656
CUstream get_stream() const noexcept
Definition: pi_cuda.hpp:622
bool is_recorded() const noexcept
Definition: pi_cuda.hpp:630
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:648
pi_result wait()
Definition: pi_cuda.cpp:616
pi_command_type get_command_type() const noexcept
Definition: pi_cuda.hpp:626
pi_uint32 get_compute_stream_token() const noexcept
Definition: pi_cuda.hpp:624
pi_result start()
Definition: pi_cuda.cpp:524
bool is_started() const noexcept
Definition: pi_cuda.hpp:632
pi_uint64 get_start_time() const
Definition: pi_cuda.cpp:568
~_pi_event()
Definition: pi_cuda.cpp:517
pi_uint64 get_queued_time() const
Definition: pi_cuda.cpp:559
pi_uint64 get_end_time() const
Definition: pi_cuda.cpp:577
pi_result release()
Definition: pi_cuda.cpp:628
pi_int32 get_execution_status() const noexcept
Definition: pi_cuda.hpp:636
pi_uint32 increment_reference_count()
Definition: pi_cuda.hpp:650
CUevent native_type
Definition: pi_cuda.hpp:610
pi_queue get_queue() const noexcept
Definition: pi_cuda.hpp:620
bool is_completed() const noexcept
Definition: pi_cuda.cpp:542
static pi_event make_with_native(pi_context context, CUevent eventNative)
Definition: pi_cuda.hpp:678
native_type get() const noexcept
Definition: pi_cuda.hpp:618
pi_result record()
Definition: pi_cuda.cpp:586
pi_uint32 decrement_reference_count()
Definition: pi_cuda.hpp:652
pi_uint32 get_event_id() const noexcept
Definition: pi_cuda.hpp:654
Structure that holds the arguments to the kernel.
Definition: pi_cuda.hpp:811
void add_arg(size_t index, size_t size, const void *arg, size_t localSize=0)
Adds an argument to the kernel.
Definition: pi_cuda.hpp:833
const args_index_t & get_indices() const noexcept
Definition: pi_cuda.hpp:881
std::vector< void * > args_index_t
Definition: pi_cuda.hpp:815
args_size_t paramSizes_
Definition: pi_cuda.hpp:817
void set_implicit_offset(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:872
std::uint32_t implicitOffsetArgs_[3]
Definition: pi_cuda.hpp:821
pi_uint32 get_local_size() const
Definition: pi_cuda.hpp:883
std::array< char, MAX_PARAM_BYTES > args_t
Definition: pi_cuda.hpp:813
std::vector< size_t > args_size_t
Definition: pi_cuda.hpp:814
args_size_t offsetPerIndex_
Definition: pi_cuda.hpp:819
void add_local_arg(size_t index, size_t size)
Definition: pi_cuda.hpp:852
args_index_t indices_
Definition: pi_cuda.hpp:818
static constexpr size_t MAX_PARAM_BYTES
Definition: pi_cuda.hpp:812
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:791
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:926
native_type get_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:918
struct _pi_kernel::arguments args_
pi_uint32 get_num_args() const noexcept
Returns the number of arguments, excluding the implicit global offset.
Definition: pi_cuda.hpp:934
pi_context context_
Definition: pi_cuda.hpp:797
CUfunction native_type
Definition: pi_cuda.hpp:792
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:914
bool has_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:922
_pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt)
Definition: pi_cuda.hpp:889
const char * get_name() const noexcept
Definition: pi_cuda.hpp:928
native_type functionWithOffsetParam_
Definition: pi_cuda.hpp:795
size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]
Definition: pi_cuda.hpp:802
void clear_local_size()
Definition: pi_cuda.hpp:954
void set_kernel_arg(int index, size_t size, const void *arg)
Definition: pi_cuda.hpp:936
const arguments::args_index_t & get_arg_indices() const
Definition: pi_cuda.hpp:948
void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:944
pi_uint32 get_local_size() const noexcept
Definition: pi_cuda.hpp:952
std::string name_
Definition: pi_cuda.hpp:796
static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS
Definition: pi_cuda.hpp:801
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:912
pi_program program_
Definition: pi_cuda.hpp:798
native_type function_
Definition: pi_cuda.hpp:794
pi_program get_program() const noexcept
Definition: pi_cuda.hpp:908
native_type get() const noexcept
Definition: pi_cuda.hpp:916
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:799
void set_kernel_local_arg(int index, size_t size)
Definition: pi_cuda.hpp:940
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:910
native_type get() const noexcept
Definition: pi_cuda.hpp:275
void * hostPtr_
Pointer associated with this device on the host.
Definition: pi_cuda.hpp:251
size_t mapOffset_
Offset of the active mapped region.
Definition: pi_cuda.hpp:255
pi_map_flags get_map_flags() const noexcept
Definition: pi_cuda.hpp:311
size_t get_map_offset(void *) const noexcept
Definition: pi_cuda.hpp:281
void unmap(void *) noexcept
Detach the allocation from the host memory.
Definition: pi_cuda.hpp:301
size_t size_
Size of the allocation in bytes.
Definition: pi_cuda.hpp:253
void * get_map_ptr() const noexcept
Definition: pi_cuda.hpp:279
alloc_mode
alloc_mode — classic: just a normal buffer allocated on the device via cuda malloc; use_host_ptr: use an...
Definition: pi_cuda.hpp:268
pi_map_flags mapFlags_
Original flags for the mapped region.
Definition: pi_cuda.hpp:259
enum _pi_mem::mem_::buffer_mem_::alloc_mode allocMode_
size_t get_size() const noexcept
Definition: pi_cuda.hpp:277
void * map_to_ptr(size_t offset, pi_map_flags flags) noexcept
Returns a pointer to data visible on the host that contains the data on the device associated with th...
Definition: pi_cuda.hpp:287
void * mapPtr_
Pointer to the active mapped region, if any.
Definition: pi_cuda.hpp:257
pi_mem_type get_image_type() const noexcept
Definition: pi_cuda.hpp:327
CUsurfObject get_surface() const noexcept
Definition: pi_cuda.hpp:325
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:323
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:222
std::atomic_uint32_t refCount_
Reference counting of the handler.
Definition: pi_cuda.hpp:231
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:382
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:388
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:384
_pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, size_t size)
Constructs the PI MEM handler for a non-typed allocation ("buffer")
Definition: pi_cuda.hpp:332
union _pi_mem::mem_ mem_
bool is_sub_buffer() const noexcept
Definition: pi_cuda.hpp:376
_pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr)
Constructs the PI allocation for an Image object (surface in CUDA)
Definition: pi_cuda.hpp:351
bool is_image() const noexcept
Definition: pi_cuda.hpp:380
enum _pi_mem::mem_type mem_type_
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:386
~_pi_mem()
Definition: pi_cuda.hpp:363
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:374
pi_context context_
Definition: pi_cuda.hpp:228
A PI platform stores all known PI devices; in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:73
std::vector< std::unique_ptr< _pi_device > > devices_
Definition: pi_cuda.hpp:75
static CUevent evBase_
Definition: pi_cuda.hpp:74
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:736
pi_program_build_status buildStatus_
Definition: pi_cuda.hpp:752
pi_result build_program(const char *build_options)
Definition: pi_cuda.cpp:711
CUmodule native_type
Definition: pi_cuda.hpp:737
native_type module_
Definition: pi_cuda.hpp:738
char errorLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:750
pi_context get_context() const
Definition: pi_cuda.hpp:764
size_t binarySizeInBytes_
Definition: pi_cuda.hpp:740
native_type get() const noexcept
Definition: pi_cuda.hpp:766
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:772
constexpr static size_t MAX_LOG_SIZE
Definition: pi_cuda.hpp:748
char infoLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:750
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:768
_pi_context * context_
Definition: pi_cuda.hpp:742
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:741
std::unordered_map< std::string, std::tuple< uint32_t, uint32_t, uint32_t > > kernelReqdWorkGroupSizeMD_
Definition: pi_cuda.hpp:746
pi_result set_binary(const char *binary, size_t binarySizeInBytes)
Definition: pi_cuda.cpp:703
pi_result set_metadata(const pi_device_binary_property *metadata, size_t length)
Definition: pi_cuda.cpp:676
std::string buildOptions_
Definition: pi_cuda.hpp:751
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:770
const char * binary_
Definition: pi_cuda.hpp:739
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:393
unsigned int num_compute_streams_
Definition: pi_cuda.hpp:417
_pi_context * get_context() const
Definition: pi_cuda.hpp:589
std::vector< bool > transfer_applied_barrier_
Definition: pi_cuda.hpp:407
native_type get()
Definition: pi_cuda.hpp:470
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:593
_pi_device * get_device() const
Definition: pi_cuda.hpp:591
_pi_queue(std::vector< CUstream > &&compute_streams, std::vector< CUstream > &&transfer_streams, _pi_context *context, _pi_device *device, pi_queue_properties properties, unsigned int flags, bool backend_owns=true)
Definition: pi_cuda.hpp:431
void transfer_stream_wait_for_barrier_if_needed(CUstream stream, pi_uint32 stream_i)
Definition: pi_cuda.cpp:396
std::mutex transfer_stream_mutex_
Definition: pi_cuda.hpp:427
std::mutex barrier_mutex_
Definition: pi_cuda.hpp:428
bool has_ownership_
Definition: pi_cuda.hpp:429
unsigned int last_sync_compute_streams_
Definition: pi_cuda.hpp:419
std::atomic_uint32_t compute_stream_idx_
Definition: pi_cuda.hpp:415
static constexpr int default_num_compute_streams
Definition: pi_cuda.hpp:395
CUstream native_type
Definition: pi_cuda.hpp:394
unsigned int num_transfer_streams_
Definition: pi_cuda.hpp:418
void compute_stream_wait_for_barrier_if_needed(CUstream stream, pi_uint32 stream_i)
Definition: pi_cuda.cpp:388
std::vector< bool > delay_compute_
Definition: pi_cuda.hpp:404
bool has_been_synchronized(pi_uint32 stream_token)
Definition: pi_cuda.hpp:472
unsigned int flags_
Definition: pi_cuda.hpp:421
CUevent barrier_tmp_event_
Definition: pi_cuda.hpp:412
std::mutex compute_stream_mutex_
Definition: pi_cuda.hpp:426
pi_uint32 get_next_event_id() noexcept
Definition: pi_cuda.hpp:599
std::atomic_uint32_t transfer_stream_idx_
Definition: pi_cuda.hpp:416
_pi_context * context_
Definition: pi_cuda.hpp:408
unsigned int last_sync_transfer_streams_
Definition: pi_cuda.hpp:420
bool can_reuse_stream(pi_uint32 stream_token)
Definition: pi_cuda.hpp:480
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:595
std::mutex compute_stream_sync_mutex_
Definition: pi_cuda.hpp:425
void sync_streams(T &&f)
Definition: pi_cuda.hpp:523
static constexpr int default_num_transfer_streams
Definition: pi_cuda.hpp:396
std::atomic_uint32_t eventCount_
Definition: pi_cuda.hpp:414
std::vector< native_type > compute_streams_
Definition: pi_cuda.hpp:398
native_type get_next_transfer_stream()
Definition: pi_cuda.cpp:465
bool backend_has_ownership() const noexcept
Definition: pi_cuda.hpp:601
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:413
_pi_device * device_
Definition: pi_cuda.hpp:409
~_pi_queue()
Definition: pi_cuda.hpp:449
CUevent barrier_event_
Definition: pi_cuda.hpp:411
std::vector< bool > compute_applied_barrier_
Definition: pi_cuda.hpp:406
std::vector< native_type > transfer_streams_
Definition: pi_cuda.hpp:399
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:597
void for_each_stream(T &&f)
Definition: pi_cuda.hpp:502
native_type get_next_compute_stream(pi_uint32 *stream_token=nullptr)
Definition: pi_cuda.cpp:404
pi_queue_properties properties_
Definition: pi_cuda.hpp:410
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:962
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:963
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:970
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:972
pi_uint32 props_
Definition: pi_cuda.hpp:964
pi_context context_
Definition: pi_cuda.hpp:965
_pi_sampler(pi_context context)
Definition: pi_cuda.hpp:967
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:974
A PI Memory object represents either plain memory allocations ("Buffers" in OpenCL) or typed allocati...
Definition: pi_cuda.hpp:239
struct _pi_mem::mem_::buffer_mem_ buffer_mem_
struct _pi_mem::mem_::surface_mem_ surface_mem_