DPC++ Runtime
Runtime libraries for oneAPI DPC++
Source listing excerpts from pi_cuda.hpp:
#define _PI_CUDA_PLUGIN_VERSION 1
#define _PI_CUDA_PLUGIN_VERSION_STRING \
  _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION)

#include <unordered_map>
                              size_t param_value_size, void *param_value,
                              size_t *param_value_size_ret);

std::vector<std::unique_ptr<_pi_device>> devices_;
native_type cuDevice_;
std::atomic_uint32_t refCount_;
static constexpr pi_uint32 max_work_item_dimensions = 3u;
size_t max_work_item_sizes[max_work_item_dimensions];
int max_work_group_size;

_pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase,
           pi_platform platform)
    : cuDevice_(cuDevice), cuContext_(cuContext),
      evBase_(evBase), refCount_{1}, platform_(platform) {}

native_type get() const noexcept { return cuDevice_; }

void save_max_work_group_size(int value) noexcept { max_work_group_size = value; }

void get_max_work_item_sizes(size_t ret_size,
                             size_t *ret_max_work_item_sizes) const noexcept {
  memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
}
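The device wrapper above caches the maximum work-group size and per-dimension work-item limits so later info queries do not have to go back to the driver. Below is a minimal sketch, not the plugin's actual code, of how such cached limits can be filled from the CUDA driver API; device_limits and populate_limits are hypothetical names.

// Minimal sketch (assumed helper, not the plugin's code): fill cached limits
// from the CUDA driver API.
#include <cuda.h>
#include <cstddef>

struct device_limits {
  size_t max_work_item_sizes[3];
  int max_work_group_size;
};

inline void populate_limits(CUdevice dev, device_limits &out) {
  int dims[3] = {0, 0, 0};
  cuDeviceGetAttribute(&dims[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
  cuDeviceGetAttribute(&dims[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
  cuDeviceGetAttribute(&dims[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
  for (int i = 0; i < 3; ++i)
    out.max_work_item_sizes[i] = static_cast<size_t>(dims[i]);
  cuDeviceGetAttribute(&out.max_work_group_size,
                       CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
}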
void invoke_extended_deleters() {
  std::lock_guard<std::mutex> guard(mutex_);
  for (auto &deleter : extended_deleters_) {
    deleter();
  }
}

void set_extended_deleter(pi_context_extended_deleter function, void *user_data) {
  std::lock_guard<std::mutex> guard(mutex_);
  extended_deleters_.emplace_back(deleter_data{function, user_data});
}

std::vector<deleter_data> extended_deleters_;
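The two members above store and later invoke user-supplied teardown callbacks under a mutex. A self-contained sketch of the deleter_data record, assuming it simply forwards user_data to the registered callback; the callback type matches the pi_context_extended_deleter typedef listed further below.

// Sketch: one registered teardown callback plus its user data.
#include <cstdlib>

using pi_context_extended_deleter = void (*)(void *user_data);

struct deleter_data {
  pi_context_extended_deleter function;
  void *user_data;
  void operator()() { function(user_data); }
};

// Hypothetical usage: free plugin-side state when the context is released.
//   static void delete_state(void *p) { std::free(p); }
//   ctx->set_extended_deleter(delete_state, state);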
_pi_queue(std::vector<CUstream> &&compute_streams,
          std::vector<CUstream> &&transfer_streams, _pi_context *context,
          _pi_device *device, pi_queue_properties properties,
          unsigned int flags, bool backend_owns = true)
bool is_last_command = /* ... */;

template <typename T> bool all_of(T &&f) {
  // ...
  for (unsigned int i = 0; i < end; i++) { /* ... */ }
  for (unsigned int i = 0; i < end; i++) { /* ... */ }
  // ...
}

template <bool ResetUsed = false, typename T> void sync_streams(T &&f) {
  // ...
  for (unsigned int i = start; i < stop; i++) { /* ... */ }
  for (unsigned int i = start; i < stop; i++) { /* ... */ }
  // ...
  std::lock_guard<std::mutex> compute_sync_guard(compute_stream_sync_mutex_);
  // ...
  if (end - start >= size) {
    sync_compute(0, size);
  } else {
    // ...
    sync_compute(start, end);
    // ...
    sync_compute(start, size);
    sync_compute(0, end);
  }
  // ...
  if (end - start >= size) {
    sync_transfer(0, size);
  } else {
    // ...
    sync_transfer(start, end);
    // ...
    sync_transfer(start, size);
    sync_transfer(0, end);
  }
  // ...
}
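The sync_compute / sync_transfer call pattern above handles a wrap-around range over a fixed-size pool of streams: if the pending range spans the whole pool, everything is synchronized; otherwise either the contiguous slice or the two wrapped pieces are. A minimal sketch of that range split, assuming start and end are monotonically increasing stream counters (end >= start) and sync stands in for sync_compute or sync_transfer:

// Sketch of the wrap-around split over a circular pool of `size` streams.
#include <functional>

inline void sync_circular_range(
    unsigned int start, unsigned int end, unsigned int size,
    const std::function<void(unsigned int, unsigned int)> &sync) {
  if (end - start >= size) {
    sync(0, size); // every stream in the pool has pending work
  } else {
    start %= size;
    end %= size;
    if (start <= end) {
      sync(start, end); // contiguous slice of the pool
    } else {
      sync(start, size); // wrapped: tail of the pool...
      sync(0, end);      // ...then the head
    }
  }
}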
return new _pi_event(context, eventNative);

std::atomic_uint32_t refCount_;
bool hasBeenWaitedOn_;

std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
    kernelReqdWorkGroupSizeMD_;
using args_t = std::array<char, MAX_PARAM_BYTES>;

void add_arg(size_t index, size_t size, const void *arg, size_t localSize = 0) {
  // ...
  size_t insertPos = std::accumulate(std::begin(paramSizes_), /* ... */);
  // ...
}

// add_local_arg: align the local-memory offset before forwarding it to add_arg.
const size_t max_alignment = sizeof(double) * 16;
const size_t alignment = std::min(max_alignment, size);
size_t alignedLocalOffset = localOffset;
// ...
add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset),
        size + (alignedLocalOffset - localOffset));

// set_implicit_offset_arg: the implicit global offset is three 32-bit values.
assert(size == sizeof(std::uint32_t) * 3);
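The fragments above come from the kernel's argument store: raw argument bytes live in a fixed char array, per-index sizes let the insertion offset be computed with std::accumulate, and a vector of void* (one pointer per argument) is what ultimately feeds the launch call. A simplified sketch of that scheme, assuming arguments are added for indices 0..N-1 in order; MAX_PARAM_BYTES here is an assumed value, not necessarily the header's constant.

// Simplified sketch: flat byte storage plus a pointer-per-argument index list,
// the shape cuLaunchKernel's kernelParams expects.
#include <array>
#include <cstring>
#include <iterator>
#include <numeric>
#include <vector>

struct kernel_args {
  static constexpr size_t MAX_PARAM_BYTES = 4000u; // assumed value
  std::array<char, MAX_PARAM_BYTES> storage_{};
  std::vector<size_t> paramSizes_; // size of each argument
  std::vector<void *> indices_;    // pointer to each argument's bytes

  void add_arg(size_t index, size_t size, const void *arg) {
    if (index >= paramSizes_.size()) {
      paramSizes_.resize(index + 1, 0);
      indices_.resize(index + 1, nullptr);
    }
    // Offset of this argument = sum of the sizes of all preceding arguments.
    size_t insertPos = std::accumulate(std::begin(paramSizes_),
                                       std::begin(paramSizes_) + index, size_t{0});
    std::memcpy(storage_.data() + insertPos, arg, size);
    paramSizes_[index] = size;
    indices_[index] = storage_.data() + insertPos;
  }

  const std::vector<void *> &get_indices() const noexcept { return indices_; }
};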
_pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name,
           pi_program program, pi_context ctxt)
    // ... constructor body elided ...
    assert(retError == PI_SUCCESS);

#endif // PI_CUDA_HPP

Member declarations and brief descriptions:
bool has_with_offset_parameter() const noexcept
_pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, size_t size)
Constructs the PI MEM handler for a non-typed allocation ("buffer")
static constexpr size_t MAX_PARAM_BYTES
pi_uint32 get_reference_count() const noexcept
A PI Memory object represents either plain memory allocations ("Buffers" in OpenCL) or typed allocations ("Images" in OpenCL).
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
struct CUevent_st * CUevent
std::unordered_map< std::string, std::string > globalIDMD_
constexpr pi_map_flags PI_MAP_WRITE
CUsurfObject get_surface() const noexcept
struct CUstream_st * CUstream
pi_uint32 decrement_reference_count() noexcept
pi_uint32 get_local_size() const
pi_uint32 decrement_reference_count() noexcept
std::atomic_uint32_t refCount_
pi_context get_context() const noexcept
void * hostPtr_
Pointer associated with this device on the host.
pi_uint32 get_reference_count() const noexcept
std::unordered_map< std::string, std::tuple< uint32_t, uint32_t, uint32_t > > kernelReqdWorkGroupSizeMD_
bool is_image() const noexcept
pi_uint32 get_compute_stream_token() const noexcept
std::vector< void * > args_index_t
bool is_started() const noexcept
CUevent barrier_tmp_event_
pi_uint32 increment_reference_count() noexcept
pi_uint32 get_reference_count() const noexcept
native_type get_next_compute_stream(pi_uint32 *stream_token=nullptr)
_pi_context(_pi_device *devId)
pi_uint32 decrement_reference_count() noexcept
std::uint32_t implicitOffsetArgs_[3]
void add_local_arg(size_t index, size_t size)
const arguments::args_index_t & get_arg_indices() const
native_type get() const noexcept
size_t size_
Size of the allocation in bytes.
pi_context get_context() const
constexpr static size_t MAX_LOG_SIZE
_pi_sampler(pi_context context)
pi_platform get_platform() const noexcept
std::unique_lock< std::mutex > _pi_stream_guard
std::vector< bool > transfer_applied_barrier_
void for_each_stream(T &&f)
_pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase, pi_platform platform)
pi_uint64 get_queued_time() const
static constexpr int default_num_transfer_streams
size_t get_map_offset(void *) const noexcept
int get_max_work_group_size() const noexcept
bool is_buffer() const noexcept
bool backend_has_ownership() const noexcept
native_type get_next_transfer_stream()
CUstream get_stream() const noexcept
std::atomic_uint32_t refCount_
CUarray get_array() const noexcept
bool can_reuse_stream(pi_uint32 stream_token)
pi_uint32 get_reference_count() const noexcept
size_t mapOffset_
Offset of the active mapped region.
pi_result build_program(const char *build_options)
Implementation of a PI Kernel for CUDA.
struct CUctx_st * CUcontext
std::atomic_uint32_t eventCount_
alloc_mode
classic: just a normal buffer allocated on the device via CUDA malloc; use_host_ptr: use an address on the host for the device; ...
void set_extended_deleter(pi_context_extended_deleter function, void *user_data)
pi_result cuda_piKernelRetain(pi_kernel kernel)
pi_uint32 increment_reference_count() noexcept
pi_result cuda_piDeviceRetain(pi_device)
std::string buildOptions_
enum _pi_mem::mem_type mem_type_
std::mutex barrier_mutex_
pi_uint32 get_reference_count() const noexcept
pi_uint32 decrement_reference_count() noexcept
std::array< char, MAX_PARAM_BYTES > args_t
pi_program_build_status buildStatus_
native_type functionWithOffsetParam_
bool has_been_synchronized(pi_uint32 stream_token)
PI queue mapping on to CUstream objects.
pi_uint32 get_num_args() const noexcept
Returns the number of arguments, excluding the implicit global offset.
pi_uint32 get_next_event_id() noexcept
static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, pi_uint32 stream_token=std::numeric_limits< pi_uint32 >::max())
pi_map_flags get_map_flags() const noexcept
args_size_t offsetPerIndex_
struct _pi_mem::mem_::surface_mem_ surface_mem_
bool backend_has_ownership() const noexcept
pi_uint32 decrement_reference_count()
pi_queue_properties properties_
pi_uint32 get_reference_count() const noexcept
struct CUmod_st * CUmodule
std::atomic_uint32_t compute_stream_idx_
void add_arg(size_t index, size_t size, const void *arg, size_t localSize=0)
Adds an argument to the kernel.
native_type get() const noexcept
void transfer_stream_wait_for_barrier_if_needed(CUstream stream, pi_uint32 stream_i)
static pi_event make_with_native(pi_context context, CUevent eventNative)
void(* pi_context_extended_deleter)(void *user_data)
pi_int32 get_execution_status() const noexcept
pi_uint64 get_elapsed_time(CUevent) const
pi_program get_program() const noexcept
pi_result cuda_piProgramRelease(pi_program program)
Decreases the reference count of a pi_program object.
_pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt)
pi_uint32 get_event_id() const noexcept
pi_command_type get_command_type() const noexcept
size_t get_size() const noexcept
pi_uint32 increment_reference_count() noexcept
void * mapPtr_
Pointer to the active mapped region, if any.
pi_result cuda_piContextRetain(pi_context context)
unsigned int last_sync_transfer_streams_
void save_max_work_group_size(int value) noexcept
Implementation of PI Program on CUDA Module object.
std::atomic_uint32_t transfer_stream_idx_
Implementation of samplers for CUDA.
std::vector< bool > delay_compute_
pi_uint32 get_reference_count() const noexcept
native_type get() const noexcept
pi_context get_context() const noexcept
std::atomic_uint32_t refCount_
char infoLog_[MAX_LOG_SIZE]
bool is_completed() const noexcept
_pi_device * get_device() const
pi_result cuda_piMemRetain(pi_mem mem)
pi_uint32 get_reference_count() const noexcept
pi_uint32 get_local_size() const noexcept
void save_max_work_item_sizes(size_t size, size_t *save_max_work_item_sizes) noexcept
native_type get() const noexcept
void set_implicit_offset(size_t size, std::uint32_t *implicitOffset)
void * get_map_ptr() const noexcept
native_type get_with_offset_parameter() const noexcept
pi_uint32 increment_reference_count() noexcept
pi_uint32 increment_reference_count()
unsigned int num_compute_streams_
pi_result cuda_piDeviceRelease(pi_device)
pi_bitfield pi_queue_properties
Structure that holds the arguments to the kernel.
void compute_stream_wait_for_barrier_if_needed(CUstream stream, pi_uint32 stream_i)
PI_PROGRAM_BUILD_STATUS_NONE
pi_uint64 get_start_time() const
PI Event mapping to CUevent.
std::mutex compute_stream_mutex_
std::vector< bool > compute_applied_barrier_
char errorLog_[MAX_LOG_SIZE]
std::atomic_uint32_t refCount_
size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]
struct _pi_mem::mem_::buffer_mem_ buffer_mem_
std::atomic_uint32_t refCount_
Reference counting of the handler.
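Nearly every handle type in this header (platform, context, device, queue, event, program, kernel, mem, sampler) carries the same atomic reference count with increment/decrement/get helpers. A minimal sketch of that pattern; release_handle is a hypothetical helper that only illustrates the usual retain/release flow.

// Minimal sketch of the shared reference-counting pattern.
#include <atomic>
#include <cstdint>

struct ref_counted {
  std::atomic_uint32_t refCount_{1}; // a new handle starts owned by its creator

  uint32_t increment_reference_count() noexcept { return ++refCount_; }
  uint32_t decrement_reference_count() noexcept { return --refCount_; }
  uint32_t get_reference_count() const noexcept { return refCount_; }
};

// Hypothetical release flow: destroy the object once the count drops to zero.
template <typename Handle> void release_handle(Handle *h) {
  if (h->decrement_reference_count() == 0)
    delete h;
}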
void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset)
std::vector< native_type > compute_streams_
pi_uint32 increment_reference_count() noexcept
unsigned int num_transfer_streams_
const char * get_name() const noexcept
pi_result cuda_piProgramRetain(pi_program program)
bool is_recorded() const noexcept
void get_max_work_item_sizes(size_t ret_size, size_t *ret_max_work_item_sizes) const noexcept
pi_result set_binary(const char *binary, size_t binarySizeInBytes)
native_type get() const noexcept
pi_result cuda_piQueueRetain(pi_queue command_queue)
_pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr)
Constructs the PI allocation for an Image object (surface in CUDA)
std::atomic_uint32_t refCount_
_pi_queue(std::vector< CUstream > &&compute_streams, std::vector< CUstream > &&transfer_streams, _pi_context *context, _pi_device *device, pi_queue_properties properties, unsigned int flags, bool backend_owns=true)
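The _pi_queue constructor above takes ownership of pre-created vectors of compute and transfer CUstreams. A sketch of building such pools with the CUDA driver API before handing them over; the stream counts and the CU_STREAM_NON_BLOCKING flag are assumptions here, not the plugin's actual defaults (compare default_num_compute_streams / default_num_transfer_streams).

// Sketch: create a pool of CUDA streams to hand to the queue constructor.
#include <cuda.h>
#include <vector>

inline std::vector<CUstream> make_stream_pool(unsigned int count) {
  std::vector<CUstream> pool(count);
  for (auto &s : pool)
    cuStreamCreate(&s, CU_STREAM_NON_BLOCKING); // error handling omitted
  return pool;
}

// Hypothetical usage:
//   auto compute  = make_stream_pool(64);
//   auto transfer = make_stream_pool(16);
//   _pi_queue queue(std::move(compute), std::move(transfer), context, device,
//                   properties, flags);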
struct _pi_kernel::arguments args_
std::mutex transfer_stream_mutex_
void set_kernel_local_arg(int index, size_t size)
static constexpr int default_num_compute_streams
enum _pi_mem::mem_::buffer_mem_::alloc_mode allocMode_
pi_context get_context() const noexcept
pi_result cuda_piContextRelease(pi_context ctxt)
CUcontext get_context() const noexcept
pi_uint32 decrement_reference_count() noexcept
void(* pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData)
bool is_sub_buffer() const noexcept
_pi_context * get_context() const
std::vector< native_type > transfer_streams_
pi_map_flags mapFlags_
Original flags for the mapped region.
pi_uint64 get_end_time() const
pi_uint32 increment_reference_count() noexcept
void set_kernel_arg(int index, size_t size, const void *arg)
std::vector< size_t > args_size_t
void unmap(void *) noexcept
Detach the allocation from the host memory.
pi_result cuda_piMemRelease(pi_mem memObj)
Decreases the reference count of the Mem object.
std::mutex compute_stream_sync_mutex_
native_type get() const noexcept
unsigned int last_sync_compute_streams_
pi_uint32 decrement_reference_count() noexcept
pi_result set_metadata(const pi_device_binary_property *metadata, size_t length)
pi_mem_type get_image_type() const noexcept
pi_queue get_queue() const noexcept
static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS
pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
void * map_to_ptr(size_t offset, pi_map_flags flags) noexcept
Returns a pointer to data visible on the host that contains the data on the device associated with this allocation.
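map_to_ptr and unmap above maintain the mapped-region bookkeeping (mapPtr_, mapOffset_, mapFlags_). A simplified sketch of that bookkeeping, assuming the host-visible pointer is simply hostPtr_ plus the offset when a host pointer is available; the real plugin may stage the data through a separate allocation, and the flags type is a stand-in for pi_map_flags.

// Simplified sketch of map/unmap bookkeeping for a buffer allocation.
#include <cstddef>
#include <cstdint>

struct buffer_mapping {
  void *hostPtr_ = nullptr; // pointer associated with this allocation on the host
  void *mapPtr_ = nullptr;  // active mapped region, if any
  size_t mapOffset_ = 0;    // offset of the active mapped region
  uint64_t mapFlags_ = 0;   // flags the region was mapped with (pi_map_flags stand-in)

  void *map_to_ptr(size_t offset, uint64_t flags) noexcept {
    mapOffset_ = offset;
    mapFlags_ = flags;
    mapPtr_ = static_cast<char *>(hostPtr_) + offset;
    return mapPtr_;
  }

  void unmap(void * /*ptr*/) noexcept {
    mapPtr_ = nullptr;
    mapOffset_ = 0;
  }
};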
PI context mapping to a CUDA context object.
pi_result cuda_piKernelRelease(pi_kernel kernel)
size_t binarySizeInBytes_
pi_result cuda_piQueueRelease(pi_queue command_queue)
PI device mapping to a CUdevice.
pi_device get_device() const noexcept
void invoke_extended_deleters()
const args_index_t & get_indices() const noexcept