Implementation of a PI Kernel for CUDA. More...
#include <cuda/pi_cuda.hpp>
Classes | |
struct | arguments |
Structure that holds the arguments to the kernel. More... | |
Public Types | |
using | native_type = CUfunction |
using | native_type = hipFunction_t |
Public Member Functions | |
_pi_kernel (CUfunction func, CUfunction funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt) | |
~_pi_kernel () | |
pi_program | get_program () const noexcept |
pi_uint32 | increment_reference_count () noexcept |
pi_uint32 | decrement_reference_count () noexcept |
pi_uint32 | get_reference_count () const noexcept |
native_type | get () const noexcept |
native_type | get_with_offset_parameter () const noexcept |
bool | has_with_offset_parameter () const noexcept |
pi_context | get_context () const noexcept |
const char * | get_name () const noexcept |
pi_uint32 | get_num_args () const noexcept |
Returns the number of arguments, excluding the implicit global offset. More... | |
void | set_kernel_arg (int index, size_t size, const void *arg) |
void | set_kernel_local_arg (int index, size_t size) |
void | set_implicit_offset_arg (size_t size, std::uint32_t *implicitOffset) |
const arguments::args_index_t & | get_arg_indices () const |
pi_uint32 | get_local_size () const noexcept |
void | clear_local_size () |
_pi_kernel () | |
_pi_kernel (hipFunction_t func, hipFunction_t funcWithOffsetParam, const char *name, pi_program program, pi_context ctxt) | |
_pi_kernel (hipFunction_t func, const char *name, pi_program program, pi_context ctxt) | |
~_pi_kernel () | |
pi_program | get_program () const noexcept |
pi_uint32 | increment_reference_count () noexcept |
pi_uint32 | decrement_reference_count () noexcept |
pi_uint32 | get_reference_count () const noexcept |
native_type | get () const noexcept |
native_type | get_with_offset_parameter () const noexcept |
bool | has_with_offset_parameter () const noexcept |
pi_context | get_context () const noexcept |
const char * | get_name () const noexcept |
pi_uint32 | get_num_args () const noexcept |
Returns the number of arguments, excluding the implicit global offset. More... | |
void | set_kernel_arg (int index, size_t size, const void *arg) |
void | set_kernel_local_arg (int index, size_t size) |
void | set_implicit_offset_arg (size_t size, std::uint32_t *implicitOffset) |
arguments::args_index_t | get_arg_indices () const |
pi_uint32 | get_local_size () const noexcept |
void | clear_local_size () |
Public Attributes | |
native_type | function_ |
native_type | functionWithOffsetParam_ |
std::string | name_ |
pi_context | context_ |
pi_program | program_ |
std::atomic_uint32_t | refCount_ |
size_t | reqdThreadsPerBlock_ [REQD_THREADS_PER_BLOCK_DIMENSIONS] |
struct _pi_kernel::arguments | args_ |
Static Public Attributes | |
static constexpr pi_uint32 | REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u |
Implementation of a PI Kernel for CUDA.
Implementation of a PI Kernel for HIP.
PI Kernels are used to set kernel arguments, creating state on the kernel object for a given invocation. This is not the case for CUfunction objects, which are simply passed together with their arguments at invocation. The PI Kernel implementation for CUDA stores the list of arguments, argument sizes, and offsets to emulate the PI Kernel interface, saving the arguments for later dispatch. Note that in the PI API, local memory is specified as a size per individual argument, whereas CUDA requires only the total amount of shared memory, since it is not passed as a parameter. A compiler pass converts the PI API local memory model into the CUDA shared memory model. This object simply calculates the total amount of shared memory and the initial offset of each parameter.
PI Kernels are used to set kernel arguments, creating state on the kernel object for a given invocation. This is not the case for hipFunction_t objects, which are simply passed together with their arguments at invocation. The PI Kernel implementation for HIP stores the list of arguments, argument sizes, and offsets to emulate the PI Kernel interface, saving the arguments for later dispatch. Note that in the PI API, local memory is specified as a size per individual argument, whereas HIP requires only the total amount of shared memory, since it is not passed as a parameter. A compiler pass converts the PI API local memory model into the HIP shared memory model. This object simply calculates the total amount of shared memory and the initial offset of each parameter.
Definition at line 817 of file pi_cuda.hpp.
using _pi_kernel::native_type = hipFunction_t |
Definition at line 777 of file pi_hip.hpp.
using _pi_kernel::native_type = CUfunction |
Definition at line 818 of file pi_cuda.hpp.
|
inline |
Note: this code assumes that there is only one device per context.
Definition at line 915 of file pi_cuda.hpp.
|
inline |
Definition at line 929 of file pi_cuda.hpp.
References context_, cuda_piContextRelease(), cuda_piProgramRelease(), and program_.
|
inline |
Definition at line 218 of file pi_esimd_emulator.hpp.
|
inline |
Definition at line 871 of file pi_hip.hpp.
|
inline |
Definition at line 879 of file pi_hip.hpp.
|
inline |
Definition at line 883 of file pi_hip.hpp.
References context_, hip_piContextRelease(), hip_piProgramRelease(), and program_.
|
inline |
Definition at line 934 of file pi_hip.hpp.
References args_, and _pi_kernel::arguments::clear_local_size().
|
inline |
Definition at line 980 of file pi_cuda.hpp.
References args_, and _pi_kernel::arguments::clear_local_size().
|
inline noexcept |
Definition at line 892 of file pi_hip.hpp.
References refCount_.
|
inline noexcept |
Definition at line 938 of file pi_cuda.hpp.
References refCount_.
|
inline noexcept |
Definition at line 896 of file pi_hip.hpp.
References function_.
|
inline noexcept |
Definition at line 942 of file pi_cuda.hpp.
References function_.
|
inline |
Definition at line 928 of file pi_hip.hpp.
References args_, and _pi_kernel::arguments::get_indices().
|
inline |
Definition at line 974 of file pi_cuda.hpp.
References args_, and _pi_kernel::arguments::get_indices().
|
inline noexcept |
Definition at line 906 of file pi_hip.hpp.
References context_.
|
inline noexcept |
Definition at line 952 of file pi_cuda.hpp.
References context_.
|
inline noexcept |
Definition at line 932 of file pi_hip.hpp.
References args_, and _pi_kernel::arguments::get_local_size().
|
inline noexcept |
Definition at line 978 of file pi_cuda.hpp.
References args_, and _pi_kernel::arguments::get_local_size().
|
inline noexcept |
Definition at line 908 of file pi_hip.hpp.
References name_.
|
inline noexcept |
Definition at line 954 of file pi_cuda.hpp.
References name_.
|
inline noexcept |
Returns the number of arguments, excluding the implicit global offset.
Note that this returns only the number of arguments currently known, not the actual number required by the kernel, since the latter cannot be queried from the HIP Driver API.
Definition at line 914 of file pi_hip.hpp.
References args_, and _pi_kernel::arguments::indices_.
|
inline noexcept |
Returns the number of arguments, excluding the implicit global offset.
Note that this returns only the number of arguments currently known, not the actual number required by the kernel, since the latter cannot be queried from the CUDA Driver API.
Definition at line 960 of file pi_cuda.hpp.
References args_, and _pi_kernel::arguments::indices_.
|
inline noexcept |
Definition at line 888 of file pi_hip.hpp.
References program_.
|
inline noexcept |
Definition at line 934 of file pi_cuda.hpp.
References program_.
|
inline noexcept |
Definition at line 894 of file pi_hip.hpp.
References refCount_.
|
inline noexcept |
Definition at line 940 of file pi_cuda.hpp.
References refCount_.
|
inline noexcept |
Definition at line 898 of file pi_hip.hpp.
References functionWithOffsetParam_.
|
inline noexcept |
Definition at line 944 of file pi_cuda.hpp.
References functionWithOffsetParam_.
|
inline noexcept |
Definition at line 902 of file pi_hip.hpp.
References functionWithOffsetParam_.
|
inline noexcept |
Definition at line 948 of file pi_cuda.hpp.
References functionWithOffsetParam_.
|
inline noexcept |
Definition at line 890 of file pi_hip.hpp.
References refCount_.
|
inline noexcept |
Definition at line 936 of file pi_cuda.hpp.
References refCount_.
|
inline |
Definition at line 924 of file pi_hip.hpp.
References args_, and _pi_kernel::arguments::set_implicit_offset().
|
inline |
Definition at line 970 of file pi_cuda.hpp.
References args_, and _pi_kernel::arguments::set_implicit_offset().
|
inline |
Definition at line 916 of file pi_hip.hpp.
References _pi_kernel::arguments::add_arg(), and args_.
|
inline |
Definition at line 962 of file pi_cuda.hpp.
References _pi_kernel::arguments::add_arg(), and args_.
|
inline |
Definition at line 920 of file pi_hip.hpp.
References _pi_kernel::arguments::add_local_arg(), and args_.
|
inline |
Definition at line 966 of file pi_cuda.hpp.
References _pi_kernel::arguments::add_local_arg(), and args_.
struct _pi_kernel::arguments _pi_kernel::args_ |
pi_context _pi_kernel::context_ |
Definition at line 823 of file pi_cuda.hpp.
Referenced by get_context(), and ~_pi_kernel().
native_type _pi_kernel::function_ |
Definition at line 820 of file pi_cuda.hpp.
Referenced by get().
native_type _pi_kernel::functionWithOffsetParam_ |
Definition at line 821 of file pi_cuda.hpp.
Referenced by get_with_offset_parameter(), and has_with_offset_parameter().
std::string _pi_kernel::name_ |
Definition at line 822 of file pi_cuda.hpp.
Referenced by get_name().
pi_program _pi_kernel::program_ |
Definition at line 824 of file pi_cuda.hpp.
Referenced by get_program(), and ~_pi_kernel().
std::atomic_uint32_t _pi_kernel::refCount_ |
Definition at line 825 of file pi_cuda.hpp.
Referenced by decrement_reference_count(), get_reference_count(), and increment_reference_count().
|
static constexpr |
Definition at line 827 of file pi_cuda.hpp.
size_t _pi_kernel::reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS] |
Definition at line 828 of file pi_cuda.hpp.