DPC++ Runtime
Runtime libraries for oneAPI DPC++
pi_cuda.hpp
Go to the documentation of this file.
1 //===-- pi_cuda.hpp - CUDA Plugin -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
11 
17 
18 #ifndef PI_CUDA_HPP
19 #define PI_CUDA_HPP
20 
21 #include "CL/sycl/detail/pi.h"
22 #include <array>
23 #include <atomic>
24 #include <cassert>
25 #include <cstring>
26 #include <cuda.h>
27 #include <functional>
28 #include <limits>
29 #include <mutex>
30 #include <numeric>
31 #include <stdint.h>
32 #include <string>
33 #include <unordered_map>
34 #include <vector>
35 
36 extern "C" {
37 
52  pi_kernel_group_info param_name,
53  size_t param_value_size, void *param_value,
54  size_t *param_value_size_ret);
56 }
57 
63 struct _pi_platform {
64  static CUevent evBase_; // CUDA event used as base counter
65  std::vector<std::unique_ptr<_pi_device>> devices_;
66 };
67 
73 struct _pi_device {
74 private:
75  using native_type = CUdevice;
76 
77  native_type cuDevice_;
78  std::atomic_uint32_t refCount_;
79  pi_platform platform_;
80 
81  static constexpr pi_uint32 max_work_item_dimensions = 3u;
82  size_t max_work_item_sizes[max_work_item_dimensions];
83  int max_work_group_size;
84 
85 public:
86  _pi_device(native_type cuDevice, pi_platform platform)
87  : cuDevice_(cuDevice), refCount_{1}, platform_(platform) {}
88 
89  native_type get() const noexcept { return cuDevice_; };
90 
91  pi_uint32 get_reference_count() const noexcept { return refCount_; }
92 
93  pi_platform get_platform() const noexcept { return platform_; };
94 
95  void save_max_work_item_sizes(size_t size,
96  size_t *save_max_work_item_sizes) noexcept {
97  memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
98  };
99 
100  void save_max_work_group_size(int value) noexcept {
101  max_work_group_size = value;
102  };
103 
104  void get_max_work_item_sizes(size_t ret_size,
105  size_t *ret_max_work_item_sizes) const noexcept {
106  memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
107  };
108 
109  int get_max_work_group_size() const noexcept { return max_work_group_size; };
110 };
111 
150 struct _pi_context {
151 
152  struct deleter_data {
154  void *user_data;
155 
156  void operator()() { function(user_data); }
157  };
158 
160 
161  enum class kind { primary, user_defined } kind_;
164  std::atomic_uint32_t refCount_;
165 
167  : kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1} {
169  };
170 
172 
174  std::lock_guard<std::mutex> guard(mutex_);
175  for (auto &deleter : extended_deleters_) {
176  deleter();
177  }
178  }
179 
181  void *user_data) {
182  std::lock_guard<std::mutex> guard(mutex_);
183  extended_deleters_.emplace_back(deleter_data{function, user_data});
184  }
185 
186  pi_device get_device() const noexcept { return deviceId_; }
187 
188  native_type get() const noexcept { return cuContext_; }
189 
190  bool is_primary() const noexcept { return kind_ == kind::primary; }
191 
193 
195 
196  pi_uint32 get_reference_count() const noexcept { return refCount_; }
197 
198 private:
199  std::mutex mutex_;
200  std::vector<deleter_data> extended_deleters_;
201 };
202 
207 struct _pi_mem {
208 
209  // TODO: Move as much shared data up as possible
211 
212  // Context where the memory object is accessibles
214 
216  std::atomic_uint32_t refCount_;
217  enum class mem_type { buffer, surface } mem_type_;
218 
224  union mem_ {
225  // Handler for plain, pointer-based CUDA allocations
226  struct buffer_mem_ {
228 
229  // If this allocation is a sub-buffer (i.e., a view on an existing
230  // allocation), this is the pointer to the parent handler structure
232  // CUDA handler for the pointer
234 
236  void *hostPtr_;
238  size_t size_;
240  size_t mapOffset_;
242  void *mapPtr_;
245 
253  enum class alloc_mode {
254  classic,
255  use_host_ptr,
256  copy_in,
258  } allocMode_;
259 
260  native_type get() const noexcept { return ptr_; }
261 
262  size_t get_size() const noexcept { return size_; }
263 
264  void *get_map_ptr() const noexcept { return mapPtr_; }
265 
266  size_t get_map_offset(void *) const noexcept { return mapOffset_; }
267 
272  void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept {
273  assert(mapPtr_ == nullptr);
274  mapOffset_ = offset;
275  mapFlags_ = flags;
276  if (hostPtr_) {
277  mapPtr_ = static_cast<char *>(hostPtr_) + offset;
278  } else {
279  // TODO: Allocate only what is needed based on the offset
280  mapPtr_ = static_cast<void *>(malloc(this->get_size()));
281  }
282  return mapPtr_;
283  }
284 
286  void unmap(void *) noexcept {
287  assert(mapPtr_ != nullptr);
288 
289  if (mapPtr_ != hostPtr_) {
290  free(mapPtr_);
291  }
292  mapPtr_ = nullptr;
293  mapOffset_ = 0;
294  }
295 
296  pi_map_flags get_map_flags() const noexcept {
297  assert(mapPtr_ != nullptr);
298  return mapFlags_;
299  }
300  } buffer_mem_;
301 
302  // Handler data for surface object (i.e. Images)
303  struct surface_mem_ {
304  CUarray array_;
305  CUsurfObject surfObj_;
307 
308  CUarray get_array() const noexcept { return array_; }
309 
310  CUsurfObject get_surface() const noexcept { return surfObj_; }
311 
312  pi_mem_type get_image_type() const noexcept { return imageType_; }
313  } surface_mem_;
314  } mem_;
315 
318  CUdeviceptr ptr, void *host_ptr, size_t size)
320  mem_.buffer_mem_.ptr_ = ptr;
321  mem_.buffer_mem_.parent_ = parent;
323  mem_.buffer_mem_.size_ = size;
325  mem_.buffer_mem_.mapPtr_ = nullptr;
328  if (is_sub_buffer()) {
330  } else {
332  }
333  };
334 
336  _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf,
337  pi_mem_type image_type, void *host_ptr)
339  // Ignore unused parameter
340  (void)host_ptr;
341 
342  mem_.surface_mem_.array_ = array;
343  mem_.surface_mem_.surfObj_ = surf;
344  mem_.surface_mem_.imageType_ = image_type;
346  }
347 
349  if (mem_type_ == mem_type::buffer) {
350  if (is_sub_buffer()) {
352  return;
353  }
354  }
356  }
357 
358  // TODO: Move as many shared funcs up as possible
359  bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; }
360 
361  bool is_sub_buffer() const noexcept {
362  return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr));
363  }
364 
365  bool is_image() const noexcept { return mem_type_ == mem_type::surface; }
366 
367  pi_context get_context() const noexcept { return context_; }
368 
370 
372 
373  pi_uint32 get_reference_count() const noexcept { return refCount_; }
374 };
375 
378 struct _pi_queue {
380  static constexpr int default_num_compute_streams = 128;
381  static constexpr int default_num_transfer_streams = 64;
382 
383  std::vector<native_type> compute_streams_;
384  std::vector<native_type> transfer_streams_;
388  std::atomic_uint32_t refCount_;
389  std::atomic_uint32_t eventCount_;
390  std::atomic_uint32_t compute_stream_idx_;
391  std::atomic_uint32_t transfer_stream_idx_;
392  unsigned int num_compute_streams_;
393  unsigned int num_transfer_streams_;
394  unsigned int flags_;
397 
398  _pi_queue(std::vector<CUstream> &&compute_streams,
399  std::vector<CUstream> &&transfer_streams, _pi_context *context,
401  unsigned int flags)
402  : compute_streams_{std::move(compute_streams)},
403  transfer_streams_{std::move(transfer_streams)}, context_{context},
404  device_{device}, properties_{properties}, refCount_{1}, eventCount_{0},
409  }
410 
414  }
415 
416  // get_next_compute/transfer_stream() functions return streams from
417  // appropriate pools in round-robin fashion
421 
422  template <typename T> void for_each_stream(T &&f) {
423  {
424  std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
425  unsigned int end =
426  std::min(static_cast<unsigned int>(compute_streams_.size()),
428  for (unsigned int i = 0; i < end; i++) {
429  f(compute_streams_[i]);
430  }
431  }
432  {
433  std::lock_guard<std::mutex> transfer_guard(transfer_stream_mutex_);
434  unsigned int end =
435  std::min(static_cast<unsigned int>(transfer_streams_.size()),
437  for (unsigned int i = 0; i < end; i++) {
438  f(transfer_streams_[i]);
439  }
440  }
441  }
442 
443  _pi_context *get_context() const { return context_; };
444 
446 
448 
449  pi_uint32 get_reference_count() const noexcept { return refCount_; }
450 
451  pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
452 };
453 
454 typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
455  void *userData);
458 struct _pi_event {
459 public:
461 
462  pi_result record();
463 
464  pi_result wait();
465 
466  pi_result start();
467 
468  native_type get() const noexcept { return evEnd_; };
469 
470  pi_queue get_queue() const noexcept { return queue_; }
471 
472  CUstream get_stream() const noexcept { return stream_; }
473 
474  pi_command_type get_command_type() const noexcept { return commandType_; }
475 
476  pi_uint32 get_reference_count() const noexcept { return refCount_; }
477 
478  bool is_recorded() const noexcept { return isRecorded_; }
479 
480  bool is_started() const noexcept { return isStarted_; }
481 
482  bool is_completed() const noexcept;
483 
484  pi_int32 get_execution_status() const noexcept {
485 
486  if (!is_recorded()) {
487  return PI_EVENT_SUBMITTED;
488  }
489 
490  if (!is_completed()) {
491  return PI_EVENT_RUNNING;
492  }
493  return PI_EVENT_COMPLETE;
494  }
495 
496  pi_context get_context() const noexcept { return context_; };
497 
498  pi_uint32 increment_reference_count() { return ++refCount_; }
499 
500  pi_uint32 decrement_reference_count() { return --refCount_; }
501 
502  pi_uint32 get_event_id() const noexcept { return eventId_; }
503 
504  // Returns the counter time when the associated command(s) were enqueued
505  //
506  pi_uint64 get_queued_time() const;
507 
508  // Returns the counter time when the associated command(s) started execution
509  //
510  pi_uint64 get_start_time() const;
511 
512  // Returns the counter time when the associated command(s) completed
513  //
514  pi_uint64 get_end_time() const;
515 
516  // construct a native CUDA. This maps closely to the underlying CUDA event.
518  CUstream stream) {
519  return new _pi_event(type, queue->get_context(), queue, stream);
520  }
521 
522  pi_result release();
523 
524  ~_pi_event();
525 
526 private:
527  // This constructor is private to force programmers to use the make_native /
528  // make_user static members in order to create a pi_event for CUDA.
530  CUstream stream);
531 
532  pi_command_type commandType_; // The type of command associated with event.
533 
534  std::atomic_uint32_t refCount_; // Event reference count.
535 
536  bool hasBeenWaitedOn_; // Signifies whether the event has been waited
537  // on through a call to wait(), which implies
538  // that it has completed.
539 
540  bool isRecorded_; // Signifies wether a native CUDA event has been recorded
541  // yet.
542  bool isStarted_; // Signifies wether the operation associated with the
543  // PI event has started or not
544  //
545 
546  pi_uint32 eventId_; // Queue identifier of the event.
547 
548  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
549  // event, this will be nullptr.
550 
551  native_type evStart_; // CUDA event handle associated with the start
552 
553  native_type evQueued_; // CUDA event handle associated with the time
554  // the command was enqueued
555 
556  pi_queue queue_; // pi_queue associated with the event. If this is a user
557  // event, this will be nullptr.
558 
559  CUstream stream_; // CUstream associated with the event. If this is a user
560  // event, this will be uninitialized.
561 
562  pi_context context_; // pi_context associated with the event. If this is a
563  // native event, this will be the same context associated
564  // with the queue_ member.
565 };
566 
569 struct _pi_program {
572  const char *binary_;
574  std::atomic_uint32_t refCount_;
576 
577  // Metadata
578  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
580 
581  constexpr static size_t MAX_LOG_SIZE = 8192u;
582 
584  std::string buildOptions_;
586 
587  _pi_program(pi_context ctxt);
588  ~_pi_program();
589 
591  size_t length);
592 
593  pi_result set_binary(const char *binary, size_t binarySizeInBytes);
594 
595  pi_result build_program(const char *build_options);
596 
597  pi_context get_context() const { return context_; };
598 
599  native_type get() const noexcept { return module_; };
600 
602 
604 
605  pi_uint32 get_reference_count() const noexcept { return refCount_; }
606 };
607 
624 struct _pi_kernel {
625  using native_type = CUfunction;
626 
629  std::string name_;
632  std::atomic_uint32_t refCount_;
633 
636 
644  struct arguments {
645  static constexpr size_t MAX_PARAM_BYTES = 4000u;
646  using args_t = std::array<char, MAX_PARAM_BYTES>;
647  using args_size_t = std::vector<size_t>;
648  using args_index_t = std::vector<void *>;
653 
654  std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0};
655 
657  // Place the implicit offset index at the end of the indicies collection
658  indices_.emplace_back(&implicitOffsetArgs_);
659  }
660 
666  void add_arg(size_t index, size_t size, const void *arg,
667  size_t localSize = 0) {
668  if (index + 2 > indices_.size()) {
669  // Move implicit offset argument index with the end
670  indices_.resize(index + 2, indices_.back());
671  // Ensure enough space for the new argument
672  paramSizes_.resize(index + 1);
673  offsetPerIndex_.resize(index + 1);
674  }
675  paramSizes_[index] = size;
676  // calculate the insertion point on the array
677  size_t insertPos = std::accumulate(std::begin(paramSizes_),
678  std::begin(paramSizes_) + index, 0);
679  // Update the stored value for the argument
680  std::memcpy(&storage_[insertPos], arg, size);
681  indices_[index] = &storage_[insertPos];
682  offsetPerIndex_[index] = localSize;
683  }
684 
685  void add_local_arg(size_t index, size_t size) {
686  size_t localOffset = this->get_local_size();
687 
688  // maximum required alignment is the size of the largest vector type
689  const size_t max_alignment = sizeof(double) * 16;
690 
691  // for arguments smaller than the maximum alignment simply align to the
692  // size of the argument
693  const size_t alignment = std::min(max_alignment, size);
694 
695  // align the argument
696  size_t alignedLocalOffset = localOffset;
697  if (localOffset % alignment != 0) {
698  alignedLocalOffset += alignment - (localOffset % alignment);
699  }
700 
701  add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset),
702  size + (alignedLocalOffset - localOffset));
703  }
704 
705  void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {
706  assert(size == sizeof(std::uint32_t) * 3);
707  std::memcpy(implicitOffsetArgs_, implicitOffset, size);
708  }
709 
711  std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0);
712  }
713 
714  const args_index_t &get_indices() const noexcept { return indices_; }
715 
717  return std::accumulate(std::begin(offsetPerIndex_),
718  std::end(offsetPerIndex_), 0);
719  }
720  } args_;
721 
722  _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name,
723  pi_program program, pi_context ctxt)
724  : function_{func}, functionWithOffsetParam_{funcWithOffsetParam},
725  name_{name}, context_{ctxt}, program_{program}, refCount_{1} {
730  this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
731  sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr);
732  (void)retError;
733  assert(retError == PI_SUCCESS);
734  }
735 
739  }
740 
741  pi_program get_program() const noexcept { return program_; }
742 
744 
746 
747  pi_uint32 get_reference_count() const noexcept { return refCount_; }
748 
749  native_type get() const noexcept { return function_; };
750 
753  };
754 
755  bool has_with_offset_parameter() const noexcept {
756  return functionWithOffsetParam_ != nullptr;
757  }
758 
759  pi_context get_context() const noexcept { return context_; };
760 
761  const char *get_name() const noexcept { return name_.c_str(); }
762 
767  pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; }
768 
769  void set_kernel_arg(int index, size_t size, const void *arg) {
770  args_.add_arg(index, size, arg);
771  }
772 
773  void set_kernel_local_arg(int index, size_t size) {
774  args_.add_local_arg(index, size);
775  }
776 
777  void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) {
778  args_.set_implicit_offset(size, implicitOffset);
779  }
780 
782  return args_.get_indices();
783  }
784 
785  pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); }
786 
788 };
789 
795 struct _pi_sampler {
796  std::atomic_uint32_t refCount_;
799 
801  : refCount_(1), props_(0), context_(context) {}
802 
804 
806 
807  pi_uint32 get_reference_count() const noexcept { return refCount_; }
808 };
809 
810 // -------------------------------------------------------------
811 // Helper types and functions
812 //
813 
814 #endif // PI_CUDA_HPP
_pi_sampler::context_
pi_context context_
Definition: pi_cuda.hpp:798
_pi_kernel::has_with_offset_parameter
bool has_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:755
_pi_mem::_pi_mem
_pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, size_t size)
Constructs the PI MEM handler for a non-typed allocation ("buffer")
Definition: pi_cuda.hpp:317
_pi_kernel::arguments::MAX_PARAM_BYTES
static constexpr size_t MAX_PARAM_BYTES
Definition: pi_cuda.hpp:645
_pi_kernel::native_type
CUfunction native_type
Definition: pi_cuda.hpp:625
_pi_event::make_native
static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream)
Definition: pi_cuda.hpp:517
_pi_event::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:476
_pi_mem::mem_
A PI Memory object represents either plain memory allocations ("Buffers" in OpenCL) or typed allocati...
Definition: pi_cuda.hpp:224
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:207
PI_MAP_WRITE
constexpr pi_map_flags PI_MAP_WRITE
Definition: pi.h:594
_pi_mem_type
_pi_mem_type
Definition: pi.h:449
_pi_mem::mem_::surface_mem_::get_surface
CUsurfObject get_surface() const noexcept
Definition: pi_cuda.hpp:310
PI_SUCCESS
@ PI_SUCCESS
Definition: pi.h:86
_pi_kernel::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:745
CUcontext
struct CUctx_st * CUcontext
Definition: backend_traits_cuda.hpp:26
_pi_device::_pi_device
_pi_device(native_type cuDevice, pi_platform platform)
Definition: pi_cuda.hpp:86
_pi_kernel::arguments::get_local_size
pi_uint32 get_local_size() const
Definition: pi_cuda.hpp:716
_pi_queue::compute_streams_
std::vector< native_type > compute_streams_
Definition: pi_cuda.hpp:383
_pi_context::kind::primary
@ primary
pi.h
_pi_context::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:194
CUstream
struct CUstream_st * CUstream
Definition: backend_traits_cuda.hpp:27
_pi_sampler::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:796
_pi_program::context_
_pi_context * context_
Definition: pi_cuda.hpp:575
_pi_event::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:496
_pi_queue::_pi_queue
_pi_queue(std::vector< CUstream > &&compute_streams, std::vector< CUstream > &&transfer_streams, _pi_context *context, _pi_device *device, pi_queue_properties properties, unsigned int flags)
Definition: pi_cuda.hpp:398
_pi_mem::mem_::buffer_mem_::hostPtr_
void * hostPtr_
Pointer associated with this device on the host.
Definition: pi_cuda.hpp:236
_pi_context::deviceId_
_pi_device * deviceId_
Definition: pi_cuda.hpp:163
_pi_kernel::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:747
_pi_program::kernelReqdWorkGroupSizeMD_
std::unordered_map< std::string, std::tuple< uint32_t, uint32_t, uint32_t > > kernelReqdWorkGroupSizeMD_
Definition: pi_cuda.hpp:579
_pi_queue::context_
_pi_context * context_
Definition: pi_cuda.hpp:385
_pi_mem::is_image
bool is_image() const noexcept
Definition: pi_cuda.hpp:365
cl::sycl::info::device
device
Definition: info_desc.hpp:53
_pi_kernel::arguments::args_index_t
std::vector< void * > args_index_t
Definition: pi_cuda.hpp:648
_pi_event::is_started
bool is_started() const noexcept
Definition: pi_cuda.hpp:480
_pi_sampler::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:803
_pi_queue::get
native_type get()
Definition: pi_cuda.hpp:420
_pi_device::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:91
_pi_context::_pi_context
_pi_context(kind k, CUcontext ctxt, _pi_device *devId)
Definition: pi_cuda.hpp:166
_pi_context::deleter_data
Definition: pi_cuda.hpp:152
_pi_sampler::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:805
_pi_kernel::arguments::implicitOffsetArgs_
std::uint32_t implicitOffsetArgs_[3]
Definition: pi_cuda.hpp:654
_pi_kernel::arguments::add_local_arg
void add_local_arg(size_t index, size_t size)
Definition: pi_cuda.hpp:685
_pi_kernel::get_arg_indices
const arguments::args_index_t & get_arg_indices() const
Definition: pi_cuda.hpp:781
_pi_kernel::name_
std::string name_
Definition: pi_cuda.hpp:629
_pi_queue::native_type
CUstream native_type
Definition: pi_cuda.hpp:379
_pi_queue::get_next_compute_stream
native_type get_next_compute_stream()
Definition: pi_cuda.cpp:370
_pi_context::get
native_type get() const noexcept
Definition: pi_cuda.hpp:188
_pi_mem::mem_::buffer_mem_::size_
size_t size_
Size of the allocation in bytes.
Definition: pi_cuda.hpp:238
_pi_program::get_context
pi_context get_context() const
Definition: pi_cuda.hpp:597
_pi_context::cuContext_
native_type cuContext_
Definition: pi_cuda.hpp:162
_pi_program::MAX_LOG_SIZE
constexpr static size_t MAX_LOG_SIZE
Definition: pi_cuda.hpp:581
_pi_sampler::_pi_sampler
_pi_sampler(pi_context context)
Definition: pi_cuda.hpp:800
_pi_device::get_platform
pi_platform get_platform() const noexcept
Definition: pi_cuda.hpp:93
_pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr
@ alloc_host_ptr
_pi_result
_pi_result
Definition: pi.h:85
_pi_kernel::~_pi_kernel
~_pi_kernel()
Definition: pi_cuda.hpp:736
_pi_queue::for_each_stream
void for_each_stream(T &&f)
Definition: pi_cuda.hpp:422
_pi_queue::device_
_pi_device * device_
Definition: pi_cuda.hpp:386
_pi_context::native_type
CUcontext native_type
Definition: pi_cuda.hpp:159
_pi_event::get_queued_time
pi_uint64 get_queued_time() const
Definition: pi_cuda.cpp:465
_pi_mem::mem_::buffer_mem_::get_map_offset
size_t get_map_offset(void *) const noexcept
Definition: pi_cuda.hpp:266
cl::sycl::info::kernel
kernel
Definition: info_desc.hpp:236
_pi_mem::context_
pi_context context_
Definition: pi_cuda.hpp:213
_pi_device::get_max_work_group_size
int get_max_work_group_size() const noexcept
Definition: pi_cuda.hpp:109
_pi_mem::is_buffer
bool is_buffer() const noexcept
Definition: pi_cuda.hpp:359
_pi_event::get_stream
CUstream get_stream() const noexcept
Definition: pi_cuda.hpp:472
CUdeviceptr
unsigned int CUdeviceptr
Definition: backend_traits_cuda.hpp:35
_pi_program::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:574
_pi_mem::mem_::surface_mem_::get_array
CUarray get_array() const noexcept
Definition: pi_cuda.hpp:308
PI_EVENT_RUNNING
@ PI_EVENT_RUNNING
Definition: pi.h:136
_pi_platform
A PI platform stores all known PI devices, in the CUDA plugin this is just a vector of available devi...
Definition: pi_cuda.hpp:63
_pi_kernel::arguments::paramSizes_
args_size_t paramSizes_
Definition: pi_cuda.hpp:650
_pi_context::kind
kind
Definition: pi_cuda.hpp:161
_pi_context::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:196
cl::sycl::malloc
void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind, const detail::code_location CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:368
_pi_mem::mem_::buffer_mem_::mapOffset_
size_t mapOffset_
Offset of the active mapped region.
Definition: pi_cuda.hpp:240
_pi_platform::devices_
std::vector< std::unique_ptr< _pi_device > > devices_
Definition: pi_cuda.hpp:65
cl::sycl::info::queue
queue
Definition: info_desc.hpp:229
_pi_program::build_program
pi_result build_program(const char *build_options)
Definition: pi_cuda.cpp:613
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:624
_pi_sampler::props_
pi_uint32 props_
Definition: pi_cuda.hpp:797
_pi_queue::eventCount_
std::atomic_uint32_t eventCount_
Definition: pi_cuda.hpp:389
_pi_mem::mem_::buffer_mem_::alloc_mode
alloc_mode
alloc_mode classic: Just a normal buffer allocated on the device via cuda malloc use_host_ptr: Use an...
Definition: pi_cuda.hpp:253
_pi_context::set_extended_deleter
void set_extended_deleter(pi_context_extended_deleter function, void *user_data)
Definition: pi_cuda.hpp:180
cl::sycl::detail::memcpy
void memcpy(void *Dst, const void *Src, std::size_t Size)
cuda_piKernelRetain
pi_result cuda_piKernelRetain(pi_kernel kernel)
Definition: pi_cuda.cpp:3475
_pi_mem::~_pi_mem
~_pi_mem()
Definition: pi_cuda.hpp:348
_pi_queue::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:445
cuda_piDeviceRetain
pi_result cuda_piDeviceRetain(pi_device)
Definition: pi_cuda.cpp:915
_pi_program::buildOptions_
std::string buildOptions_
Definition: pi_cuda.hpp:584
_pi_mem::mem_type_
enum _pi_mem::mem_type mem_type_
_pi_queue::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:449
_pi_queue::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:447
_pi_kernel::arguments::args_t
std::array< char, MAX_PARAM_BYTES > args_t
Definition: pi_cuda.hpp:646
_pi_program::buildStatus_
pi_program_build_status buildStatus_
Definition: pi_cuda.hpp:585
cl::sycl::length
float length(T p) __NOEXC
Definition: builtins.hpp:1032
_pi_kernel::functionWithOffsetParam_
native_type functionWithOffsetParam_
Definition: pi_cuda.hpp:628
_pi_program::module_
native_type module_
Definition: pi_cuda.hpp:571
_pi_queue
PI queue mapping on to CUstream objects.
Definition: pi_cuda.hpp:378
_pi_context::~_pi_context
~_pi_context()
Definition: pi_cuda.hpp:171
_pi_kernel::get_num_args
pi_uint32 get_num_args() const noexcept
Returns the number of arguments, excluding the implicit global offset.
Definition: pi_cuda.hpp:767
_pi_queue::get_next_event_id
pi_uint32 get_next_event_id() noexcept
Definition: pi_cuda.hpp:451
pi_uint32
uint32_t pi_uint32
Definition: pi.h:72
_pi_mem::mem_::buffer_mem_::ptr_
native_type ptr_
Definition: pi_cuda.hpp:233
_pi_mem::mem_::buffer_mem_::get_map_flags
pi_map_flags get_map_flags() const noexcept
Definition: pi_cuda.hpp:296
_pi_kernel::arguments::offsetPerIndex_
args_size_t offsetPerIndex_
Definition: pi_cuda.hpp:652
_pi_mem::mem_::surface_mem_
struct _pi_mem::mem_::surface_mem_ surface_mem_
_pi_event::decrement_reference_count
pi_uint32 decrement_reference_count()
Definition: pi_cuda.hpp:500
_pi_kernel::function_
native_type function_
Definition: pi_cuda.hpp:627
_pi_queue::properties_
pi_queue_properties properties_
Definition: pi_cuda.hpp:387
_pi_program::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:605
_pi_kernel::arguments::storage_
args_t storage_
Definition: pi_cuda.hpp:649
_pi_queue::compute_stream_idx_
std::atomic_uint32_t compute_stream_idx_
Definition: pi_cuda.hpp:390
_pi_kernel::arguments::add_arg
void add_arg(size_t index, size_t size, const void *arg, size_t localSize=0)
Adds an argument to the kernel.
Definition: pi_cuda.hpp:666
_pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr
@ use_host_ptr
_pi_mem::mem_::buffer_mem_::get
native_type get() const noexcept
Definition: pi_cuda.hpp:260
_pi_kernel::context_
pi_context context_
Definition: pi_cuda.hpp:630
pi_context_extended_deleter
void(* pi_context_extended_deleter)(void *user_data)
Definition: pi.h:1091
_pi_mem::mem_::surface_mem_::surfObj_
CUsurfObject surfObj_
Definition: pi_cuda.hpp:305
_pi_event::get_execution_status
pi_int32 get_execution_status() const noexcept
Definition: pi_cuda.hpp:484
cl::sycl::host_ptr
multi_ptr< ElementType, access::address_space::global_host_space > host_ptr
Definition: pointers.hpp:32
_pi_context::deleter_data::user_data
void * user_data
Definition: pi_cuda.hpp:154
_pi_context::kind::user_defined
@ user_defined
_pi_kernel::get_program
pi_program get_program() const noexcept
Definition: pi_cuda.hpp:741
_pi_mem::mem_type
mem_type
Definition: pi_cuda.hpp:217
cuda_piProgramRelease
pi_result cuda_piProgramRelease(pi_program program)
Decreases the reference count of a pi_program object.
Definition: pi_cuda.cpp:3329
_pi_kernel::_pi_kernel
_pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt)
Definition: pi_cuda.hpp:722
_pi_event::get_event_id
pi_uint32 get_event_id() const noexcept
Definition: pi_cuda.hpp:502
_pi_queue::~_pi_queue
~_pi_queue()
Definition: pi_cuda.hpp:411
_pi_event::get_command_type
pi_command_type get_command_type() const noexcept
Definition: pi_cuda.hpp:474
_pi_event::native_type
CUevent native_type
Definition: pi_cuda.hpp:460
_pi_queue::default_num_transfer_streams
static constexpr int default_num_transfer_streams
Definition: pi_cuda.hpp:381
_pi_mem::mem_::buffer_mem_::get_size
size_t get_size() const noexcept
Definition: pi_cuda.hpp:262
_pi_kernel::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:743
_pi_mem::mem_::buffer_mem_::mapPtr_
void * mapPtr_
Pointer to the active mapped region, if any.
Definition: pi_cuda.hpp:242
pi_uint64
uint64_t pi_uint64
Definition: pi.h:73
cuda_piContextRetain
pi_result cuda_piContextRetain(pi_context context)
Definition: pi_cuda.cpp:959
_pi_mem::mem_::buffer_mem_::parent_
pi_mem parent_
Definition: pi_cuda.hpp:231
cl::sycl::info::platform::name
@ name
_pi_device_binary_property_struct
Definition: pi.h:701
_pi_device::save_max_work_group_size
void save_max_work_group_size(int value) noexcept
Definition: pi_cuda.hpp:100
_pi_event::wait
pi_result wait()
Definition: pi_cuda.cpp:522
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:569
_pi_queue::transfer_stream_idx_
std::atomic_uint32_t transfer_stream_idx_
Definition: pi_cuda.hpp:391
_pi_sampler
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:795
_pi_sampler::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:807
_pi_device::get
native_type get() const noexcept
Definition: pi_cuda.hpp:89
_pi_kernel::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:759
_pi_queue::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:388
_pi_program::infoLog_
char infoLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:583
_pi_event::is_completed
bool is_completed() const noexcept
Definition: pi_cuda.cpp:448
cuda_piMemRetain
pi_result cuda_piMemRetain(pi_mem mem)
Definition: pi_cuda.cpp:3065
_pi_mem::get_reference_count
pi_uint32 get_reference_count() const noexcept
Definition: pi_cuda.hpp:373
_pi_event::release
pi_result release()
Definition: pi_cuda.cpp:534
_pi_mem::mem_::buffer_mem_
Definition: pi_cuda.hpp:226
CUmodule
struct CUmod_st * CUmodule
Definition: backend_traits_cuda.hpp:29
PI_EVENT_COMPLETE
@ PI_EVENT_COMPLETE
Definition: pi.h:135
_pi_mem::mem_type::buffer
@ buffer
_pi_kernel::get_local_size
pi_uint32 get_local_size() const noexcept
Definition: pi_cuda.hpp:785
_pi_mem::mem_::buffer_mem_::native_type
CUdeviceptr native_type
Definition: pi_cuda.hpp:227
_pi_device::save_max_work_item_sizes
void save_max_work_item_sizes(size_t size, size_t *save_max_work_item_sizes) noexcept
Definition: pi_cuda.hpp:95
_pi_program::~_pi_program
~_pi_program()
Definition: pi_cuda.cpp:563
_pi_event::get
native_type get() const noexcept
Definition: pi_cuda.hpp:468
_pi_kernel::arguments::set_implicit_offset
void set_implicit_offset(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:705
_pi_mem::mem_::buffer_mem_::alloc_mode::copy_in
@ copy_in
_pi_program::binary_
const char * binary_
Definition: pi_cuda.hpp:572
CUdevice
int CUdevice
Definition: backend_traits_cuda.hpp:25
_pi_mem::mem_::buffer_mem_::get_map_ptr
void * get_map_ptr() const noexcept
Definition: pi_cuda.hpp:264
_pi_kernel::get_with_offset_parameter
native_type get_with_offset_parameter() const noexcept
Definition: pi_cuda.hpp:751
_pi_context::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:192
_pi_event::~_pi_event
~_pi_event()
Definition: pi_cuda.cpp:423
_pi_event::increment_reference_count
pi_uint32 increment_reference_count()
Definition: pi_cuda.hpp:498
_pi_queue::num_compute_streams_
unsigned int num_compute_streams_
Definition: pi_cuda.hpp:392
_pi_context::deleter_data::operator()
void operator()()
Definition: pi_cuda.hpp:156
cuda_piDeviceRelease
pi_result cuda_piDeviceRelease(pi_device)
Definition: pi_cuda.cpp:1037
_pi_kernel::arguments::clear_local_size
void clear_local_size()
Definition: pi_cuda.hpp:710
_pi_mem::mem_::surface_mem_::imageType_
pi_mem_type imageType_
Definition: pi_cuda.hpp:306
cl::sycl::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:181
_pi_event::_pi_event
_pi_event()
Definition: pi_esimd_emulator.hpp:194
cl::sycl::info::event
event
Definition: info_desc.hpp:289
pi_queue_properties
pi_bitfield pi_queue_properties
Definition: pi.h:623
_pi_kernel::program_
pi_program program_
Definition: pi_cuda.hpp:631
_pi_program::_pi_program
_pi_program()
Definition: pi_esimd_emulator.hpp:203
_pi_kernel::arguments
Structure that holds the arguments to the kernel.
Definition: pi_cuda.hpp:644
_pi_mem::mem_type::surface
@ surface
_pi_mem::mem_::surface_mem_
Definition: pi_cuda.hpp:303
_pi_program::native_type
CUmodule native_type
Definition: pi_cuda.hpp:570
_pi_queue::get_next_transfer_stream
native_type get_next_transfer_stream()
Definition: pi_cuda.cpp:384
PI_PROGRAM_BUILD_STATUS_NONE
@ PI_PROGRAM_BUILD_STATUS_NONE
Definition: pi.h:157
_pi_event::get_start_time
pi_uint64 get_start_time() const
Definition: pi_cuda.cpp:474
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:458
_pi_queue::compute_stream_mutex_
std::mutex compute_stream_mutex_
Definition: pi_cuda.hpp:395
_pi_program::errorLog_
char errorLog_[MAX_LOG_SIZE]
Definition: pi_cuda.hpp:583
_pi_context::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:164
_pi_queue::transfer_streams_
std::vector< native_type > transfer_streams_
Definition: pi_cuda.hpp:384
_pi_kernel::reqdThreadsPerBlock_
size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]
Definition: pi_cuda.hpp:635
_pi_mem::mem_::buffer_mem_
struct _pi_mem::mem_::buffer_mem_ buffer_mem_
_pi_mem::mem_::buffer_mem_::alloc_mode::classic
@ classic
_pi_mem::refCount_
std::atomic_uint32_t refCount_
Reference counting of the handler.
Definition: pi_cuda.hpp:216
_pi_mem::mem_
union _pi_mem::mem_ mem_
_pi_kernel::set_implicit_offset_arg
void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset)
Definition: pi_cuda.hpp:777
_pi_queue::default_num_compute_streams
static constexpr int default_num_compute_streams
Definition: pi_cuda.hpp:380
_pi_mem::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:369
_pi_queue::num_transfer_streams_
unsigned int num_transfer_streams_
Definition: pi_cuda.hpp:393
_pi_kernel::get_name
const char * get_name() const noexcept
Definition: pi_cuda.hpp:761
cuda_piProgramRetain
pi_result cuda_piProgramRetain(pi_program program)
Definition: pi_cuda.cpp:3319
_pi_event::is_recorded
bool is_recorded() const noexcept
Definition: pi_cuda.hpp:478
_pi_kernel::arguments::arguments
arguments()
Definition: pi_cuda.hpp:656
CUevent
struct CUevent_st * CUevent
Definition: backend_traits_cuda.hpp:28
_pi_device::get_max_work_item_sizes
void get_max_work_item_sizes(size_t ret_size, size_t *ret_max_work_item_sizes) const noexcept
Definition: pi_cuda.hpp:104
_pi_program::set_binary
pi_result set_binary(const char *binary, size_t binarySizeInBytes)
Definition: pi_cuda.cpp:605
_pi_kernel::get
native_type get() const noexcept
Definition: pi_cuda.hpp:749
cuda_piQueueRetain
pi_result cuda_piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:2354
_pi_mem::_pi_mem
_pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr)
Constructs the PI allocation for an Image object (surface in CUDA)
Definition: pi_cuda.hpp:336
_pi_program_build_status
_pi_program_build_status
Definition: pi.h:156
_pi_kernel::refCount_
std::atomic_uint32_t refCount_
Definition: pi_cuda.hpp:632
cl::sycl::info::platform
platform
Definition: info_desc.hpp:31
_pi_kernel::arguments::indices_
args_index_t indices_
Definition: pi_cuda.hpp:651
_pi_kernel::args_
struct _pi_kernel::arguments args_
_pi_queue::transfer_stream_mutex_
std::mutex transfer_stream_mutex_
Definition: pi_cuda.hpp:396
cl::sycl::access::mode
mode
Definition: access.hpp:28
cl::sycl::info::context
context
Definition: info_desc.hpp:42
_pi_kernel::set_kernel_local_arg
void set_kernel_local_arg(int index, size_t size)
Definition: pi_cuda.hpp:773
_pi_event::start
pi_result start()
Definition: pi_cuda.cpp:430
_pi_mem::mem_::buffer_mem_::allocMode_
enum _pi_mem::mem_::buffer_mem_::alloc_mode allocMode_
_pi_mem::get_context
pi_context get_context() const noexcept
Definition: pi_cuda.hpp:367
cuda_piContextRelease
pi_result cuda_piContextRelease(pi_context ctxt)
Definition: pi_cuda.cpp:1975
_pi_kernel_group_info
_pi_kernel_group_info
Definition: pi.h:368
_pi_program::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:603
pfn_notify
void(* pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData)
Definition: pi_cuda.hpp:454
_pi_mem::is_sub_buffer
bool is_sub_buffer() const noexcept
Definition: pi_cuda.hpp:361
_pi_queue::get_context
_pi_context * get_context() const
Definition: pi_cuda.hpp:443
_pi_context::is_primary
bool is_primary() const noexcept
Definition: pi_cuda.hpp:190
_pi_mem::mem_::buffer_mem_::mapFlags_
pi_map_flags mapFlags_
Original flags for the mapped region.
Definition: pi_cuda.hpp:244
_pi_event::get_end_time
pi_uint64 get_end_time() const
Definition: pi_cuda.cpp:483
_pi_program::increment_reference_count
pi_uint32 increment_reference_count() noexcept
Definition: pi_cuda.hpp:601
_pi_command_type
_pi_command_type
Definition: pi.h:417
_pi_kernel::set_kernel_arg
void set_kernel_arg(int index, size_t size, const void *arg)
Definition: pi_cuda.hpp:769
_pi_kernel::arguments::args_size_t
std::vector< size_t > args_size_t
Definition: pi_cuda.hpp:647
_pi_mem::mem_::buffer_mem_::unmap
void unmap(void *) noexcept
Detach the allocation from the host memory.
Definition: pi_cuda.hpp:286
pi_map_flags
pi_bitfield pi_map_flags
Definition: pi.h:592
cuda_piMemRelease
pi_result cuda_piMemRelease(pi_mem memObj)
Decreases the reference count of the Mem object.
Definition: pi_cuda.cpp:2115
_pi_program::get
native_type get() const noexcept
Definition: pi_cuda.hpp:599
_pi_mem::mem_::surface_mem_::array_
CUarray array_
Definition: pi_cuda.hpp:304
_pi_mem::decrement_reference_count
pi_uint32 decrement_reference_count() noexcept
Definition: pi_cuda.hpp:371
_pi_program::set_metadata
pi_result set_metadata(const pi_device_binary_property *metadata, size_t length)
Definition: pi_cuda.cpp:578
_pi_queue::flags_
unsigned int flags_
Definition: pi_cuda.hpp:394
_pi_context::kind_
enum _pi_context::kind kind_
_pi_mem::mem_::surface_mem_::get_image_type
pi_mem_type get_image_type() const noexcept
Definition: pi_cuda.hpp:312
_pi_event::get_queue
pi_queue get_queue() const noexcept
Definition: pi_cuda.hpp:470
_pi_kernel::REQD_THREADS_PER_BLOCK_DIMENSIONS
static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS
Definition: pi_cuda.hpp:634
cuda_piKernelGetGroupInfo
pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:2688
PI_EVENT_SUBMITTED
@ PI_EVENT_SUBMITTED
Definition: pi.h:137
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:371
_pi_mem::mem_::buffer_mem_::map_to_ptr
void * map_to_ptr(size_t offset, pi_map_flags flags) noexcept
Returns a pointer to data visible on the host that contains the data on the device associated with th...
Definition: pi_cuda.hpp:272
pi_int32
int32_t pi_int32
Definition: pi.h:71
_pi_context
PI context mapping to a CUDA context object.
Definition: pi_cuda.hpp:150
cuda_piKernelRelease
pi_result cuda_piKernelRelease(pi_kernel kernel)
Definition: pi_cuda.cpp:3483
_pi_program::binarySizeInBytes_
size_t binarySizeInBytes_
Definition: pi_cuda.hpp:573
cuda_piQueueRelease
pi_result cuda_piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:2362
_pi_platform::evBase_
static CUevent evBase_
Definition: pi_cuda.hpp:64
_pi_device
PI device mapping to a CUdevice.
Definition: pi_cuda.hpp:73
_pi_event::record
pi_result record()
Definition: pi_cuda.cpp:492
_pi_context::get_device
pi_device get_device() const noexcept
Definition: pi_cuda.hpp:186
_pi_context::invoke_extended_deleters
void invoke_extended_deleters()
Definition: pi_cuda.hpp:173
_pi_kernel::arguments::get_indices
const args_index_t & get_indices() const noexcept
Definition: pi_cuda.hpp:714
_pi_kernel::clear_local_size
void clear_local_size()
Definition: pi_cuda.hpp:787