C++ API Reference#
-
class ModelFactory : public intel_npu_acceleration_library::OVInferenceModel#
- #include <nn_factory.h>
The ModelFactory class implements a generic interface for NPU network generation and inference. It supports only single-input, single-output operations, with an input of shape [batch, input_channels] and an output of shape [batch, output_channels].
Public Functions
-
inline ModelFactory(std::string device, size_t inC, size_t outC, size_t batch, bool profile = false)#
Construct a new Model Factory object.
- Parameters:
device – target device
inC – number of input channels
outC – number of output channels
batch – batch size
profile – enable/disable profiling
-
inline ov::op::Op *parameter(size_t dim0, size_t dim1, ov::element::Type_t dtype)#
Create a new 2D [dim0, dim1] network parameter.
- Parameters:
dim0 – dimension 0
dim1 – dimension 1
dtype – parameter datatype
- Returns:
ov::op::Op*
-
inline ov::op::Op *matmul(ov::op::Op *input, ov::op::Op *&weights, bool trA = false, bool trB = true)#
Create a new matmul operation.
- Parameters:
input – matmul lhs input
weights – matmul rhs input, a.k.a. weights
trA – transpose the lhs input
trB – transpose the rhs input
- Returns:
ov::op::Op*
-
inline ov::op::Op *gelu(ov::op::Op *input)#
Create a new gelu operation.
- Parameters:
input – operation’s input node
- Returns:
ov::op::Op*
-
inline ov::op::Op *swish(ov::op::Op *input)#
Create a new swish operation.
- Parameters:
input – operation’s input node
- Returns:
ov::op::Op*
-
inline ov::op::Op *softmax(ov::op::Op *input)#
Create a new softmax operation.
- Parameters:
input – operation’s input node
- Returns:
ov::op::Op*
-
inline ov::op::Op *convert_to_fp16(ov::op::Op *input)#
Create a new conversion to fp16 operation.
- Parameters:
input – operation’s input node
- Returns:
ov::op::Op*
-
inline ov::op::Op *eltwise_add(ov::op::Op *x1, ov::op::Op *&x2)#
Create a new elementwise add operation.
- Parameters:
x1 – eltwise lhs input
x2 – eltwise rhs input
- Returns:
ov::op::Op*
-
inline ov::op::Op *eltwise_mul(ov::op::Op *x1, ov::op::Op *&x2)#
Create a new elementwise multiply operation.
- Parameters:
x1 – eltwise lhs input
x2 – eltwise rhs input
- Returns:
ov::op::Op*
-
inline ov::op::Op *eltwise_div(ov::op::Op *x1, ov::op::Op *&x2)#
Create a new elementwise division operation.
- Parameters:
x1 – eltwise lhs input
x2 – eltwise rhs input
- Returns:
ov::op::Op*
-
inline void compile(ov::op::Op *result)#
Compile the model.
- Parameters:
result – the last operation in the network. Must have a [batch, output_channels] shape
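As an illustration, here is a minimal sketch that builds and compiles a single linear layer followed by a GELU activation using the factory methods above (the shapes, the fp16 element type, and the "NPU" device string are illustrative assumptions, not prescribed by this reference):

#include "intel_npu_acceleration_library/nn_factory.h"

using namespace intel_npu_acceleration_library;

int main() {
    const size_t inC = 512, outC = 256, batch = 1;

    // Target the NPU; profiling disabled by default.
    ModelFactory factory("NPU", inC, outC, batch);

    // fp16 network inputs: activation [batch, inC] and weights [outC, inC].
    ov::op::Op* input = factory.parameter(batch, inC, ov::element::f16);
    ov::op::Op* weights = factory.parameter(outC, inC, ov::element::f16);

    // With the default trB = true, the weights are interpreted as [outC, inC].
    ov::op::Op* mm = factory.matmul(input, weights);
    ov::op::Op* act = factory.gelu(mm);

    // The last node must have shape [batch, outC].
    factory.compile(act);
    return 0;
}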
-
class OVInferenceModel#
- #include <inference.h>
The OVInferenceModel class implements the basics of NN inference on the NPU.
Subclassed by intel_npu_acceleration_library::ModelFactory
Public Functions
-
inline OVInferenceModel(std::string device, size_t inC, size_t outC, size_t batch, bool profile = false)#
Construct a new OVInferenceModel object.
- Parameters:
device – target device
inC – number of input channels
outC – number of output channels
batch – batch size
profile – enable/disable profiling
-
inline virtual ~OVInferenceModel()#
-
inline void saveCompiledModel(const std::string &path)#
Save the compiled model to a local path.
- Parameters:
path – destination file path
-
inline void saveModel(const std::string &path)#
Save the OV model to a local path.
- Parameters:
path – destination file path
-
inline void run()#
Run an inference.
- Returns:
void
-
inline void setActivations(half_ptr _X, half_ptr _Out)#
Set the input and output activations.
- Parameters:
_X – pointer to the float16 input activation
_Out – pointer to the float16 output activation
-
Set the network parameters.
- Parameters:
_weights – vector of network parameters
started – atomic bool variable that is set to true once the conversion has started. Useful for thread synchronization
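For instance, binding fp16 buffers and running inference might look like this (a sketch, assuming half_ptr is the library's pointer-to-fp16 type backed here by uint16_t storage, and that model is a compiled instance with batch, inC and outC as above):

// Hypothetical fp16 storage: one uint16_t per half-precision element.
std::vector<uint16_t> X(batch * inC), Out(batch * outC);
model.setActivations(reinterpret_cast<half_ptr>(X.data()),
                     reinterpret_cast<half_ptr>(Out.data()));
model.run();  // results are written to the Out buffer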
Public Members
-
ov::Tensor X#
Model input tensor.
-
ov::Tensor Out#
Model output tensor.
-
std::thread wt_thread#
Async weight prefetch thread.
Protected Functions
-
inline void compile_model(std::string device)#
Compile a generated OV model for a specific device.
- Parameters:
device – target compilation device
-
inline virtual void create_ov_model()#
Create an OV model object. This method must be overridden in child classes.
-
class Parameter#
- #include <parameters.h>
The Parameter class represents a generic NN parameter.
Subclassed by intel_npu_acceleration_library::ParameterWithConversion
Public Functions
-
inline Parameter(Shape shape)#
Construct a new Parameter object.
- Parameters:
shape – parameter shape
-
inline Parameter(half_ptr _data, Shape shape)#
Construct a new Parameter object from fp16 data pointer.
- Parameters:
_data – fp16 parameter data pointer
shape – parameter shape
-
inline Parameter(int8_t *_data, Shape shape)#
Construct a new Parameter object from int8 data pointer.
- Parameters:
_data – int8 parameter data pointer
shape – parameter shape
-
inline size_t get_size() const#
Get the size of the parameter.
- Returns:
size_t
-
class Parameters#
- #include <parameters.h>
The Parameters class represents a list of NN parameters for an NPU kernel.
Public Functions
-
inline Parameters &add_parameter(half_ptr data, Shape shape)#
Add a new float16 parameter.
- Parameters:
data – fp16 parameter data pointer
shape – parameter shape
- Returns:
Parameters&
-
inline Parameters &add_parameter(int8_t *data, half_ptr scale, Shape shape)#
Add a new int8 parameter, providing also the scale.
- Parameters:
data – int8 parameter data pointer
scale – fp16 scale pointer
shape – parameter shape
- Returns:
Parameters&
-
inline Parameters &add_parameter(int8_t *data, float *scale, Shape shape)#
Add a new int8 parameter with explicit CPU conversion.
- Parameters:
data – int8 parameter data pointer
scale – float per output channel scale
shape – parameter shape
- Returns:
Parameters&
-
inline auto &get_parameters()#
Get the parameters.
- Returns:
auto
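Because each add_parameter overload returns a reference to the same Parameters object, calls can be chained. The weight and scale buffers below are hypothetical:

Parameters params;
params.add_parameter(w0_fp16, Shape({outC, inC}))                // fp16 weight
    .add_parameter(w1_int8, w1_scale_fp16, Shape({outC, inC}))   // int8 + fp16 scale
    .add_parameter(w2_int8, w2_scale_fp32, Shape({outC, inC}));  // int8, explicit CPU conversion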
-
class ParameterWithConversion : public intel_npu_acceleration_library::Parameter#
- #include <parameters.h>
The ParameterWithConversion class represents a generic quantized NN parameter whose conversion to fp16 is performed explicitly on the CPU. The conversion equation is Y_float = Scale * float(data).
Public Functions
-
inline ParameterWithConversion(int8_t *data, float *scale, Shape shape)#
Construct a new ParameterWithConversion object from int8 data, float scale and shape.
- Parameters:
data – int8 data buffer
scale – float per output channel scale
shape – parameter shape
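As a concrete instance of the conversion equation, an int8 value of 64 with a channel scale of 0.5 dequantizes to 0.5 * 64 = 32.0. A construction sketch with hypothetical data:

const size_t outC = 4, inC = 8;
std::vector<int8_t> data(outC * inC, 64);
std::vector<float> scale(outC, 0.5f);  // every element dequantizes to 32.0f
ParameterWithConversion w(data.data(), scale.data(), Shape({outC, inC}));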
-
class Shape#
- #include <parameters.h>
A class representing a generic tensor shape.
Public Functions
-
inline Shape(std::initializer_list<size_t> dims)#
Construct a new Shape object.
- Parameters:
dims – a list of integers representing each dimension's size
-
inline const size_t &operator[](int idx)#
Overload of operator[]. Returns the dimension at index idx.
- Parameters:
idx –
- Returns:
const size_t&
-
inline size_t get_size() const#
Get the number of elements of the tensor.
- Returns:
size_t
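For example:

Shape s({2, 512});        // [batch, channels]
size_t c = s[1];          // 512
size_t n = s.get_size();  // 2 * 512 = 1024 elements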
Private Members
-
std::vector<size_t> dimensions#
-
namespace intel_npu_acceleration_library#
Functions
-
bool _isNPUAvailable(ov::Core &core)#
Return true if the NPU is available on the system, otherwise return false.
- Parameters:
core – ov::Core object
- Returns:
true – the NPU AI accelerator is available
- Returns:
false – the NPU AI accelerator is not available
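A typical guard before targeting the NPU (sketch):

ov::Core core;
if (intel_npu_acceleration_library::_isNPUAvailable(core)) {
    // NPU present: safe to compile for the "NPU" device
} else {
    // fall back to another device, e.g. "CPU"
}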
-
void vector_to_fp16(const int8_t *src, float scale, half_ptr dst, size_t size)#
Convert an int8 vector to fp16 given a scalar scale.
- Parameters:
src – pointer to the source int8 buffer
scale – float scale
dst – pointer to the destination float16 buffer
size – size of the src and dst buffers
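For instance, with a scalar scale of 0.05 (illustrative values; fp16 storage backed by uint16_t):

std::vector<int8_t> src = {-100, 0, 100};
std::vector<uint16_t> dst(src.size());
intel_npu_acceleration_library::vector_to_fp16(
    src.data(), 0.05f, reinterpret_cast<half_ptr>(dst.data()), src.size());
// dst now holds the fp16 encodings of -5.0, 0.0 and 5.0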
-
void array_to_fp16_worker(const int8_t *input, float *scale, half_ptr output, size_t input_channels, size_t output_channels)#
Convert an int8 array to fp16 given a per-output-channel scale vector.
- Parameters:
input – pointer to the source int8 buffer of shape [output_channels, input_channels]
scale – pointer to a float scale vector of shape [output_channels]
output – pointer to the destination float16 buffer of shape [output_channels, input_channels]
input_channels – number of input channels
output_channels – number of output channels
-
void to_fp16(const int8_t *input, float *scale, half_ptr output, size_t input_channels, size_t output_channels, unsigned int num_threads)#
Convert an int8 array to fp16 given a per-output-channel scale vector.
- Parameters:
input – pointer to the source int8 buffer of shape [output_channels, input_channels]
scale – pointer to a float scale vector of shape [output_channels]
output – pointer to the destination float16 buffer of shape [output_channels, input_channels]
input_channels – number of input channels
output_channels – number of output channels
num_threads – number of parallel threads to use
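A dequantization sketch for an [output_channels, input_channels] weight matrix using four worker threads (buffers and sizes hypothetical):

const size_t inC = 1024, outC = 1024;
std::vector<int8_t> w_int8(outC * inC);    // quantized weights
std::vector<float> scale(outC);            // one scale per output channel
std::vector<uint16_t> w_fp16(outC * inC);  // fp16 destination
// ... fill w_int8 and scale ...
intel_npu_acceleration_library::to_fp16(
    w_int8.data(), scale.data(),
    reinterpret_cast<half_ptr>(w_fp16.data()), inC, outC, 4);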
- file common.h
- #include "openvino/openvino.hpp"
- #include "openvino/opsets/opset1.hpp"
- #include "openvino/opsets/opset13.hpp"
- #include "openvino/opsets/opset4.hpp"
- #include "openvino/opsets/opset6.hpp"
- #include "openvino/opsets/opset7.hpp"
- #include "openvino/opsets/opset8.hpp"
- file conversion.h
- #include <immintrin.h>
- #include <iostream>
- #include <thread>
- #include <vector>
- #include "intel_npu_acceleration_library/common.h"
- file inference.h
- #include <atomic>
- #include <condition_variable>
- #include <cstdint>
- #include <cstdlib>
- #include <cstring>
- #include <fstream>
- #include <iostream>
- #include <limits>
- #include <memory>
- #include <mutex>
- #include <string>
- #include <thread>
- #include <vector>
- #include "intel_npu_acceleration_library/common.h"
- #include "intel_npu_acceleration_library/parameters.h"
- file nn_factory.h
- #include "intel_npu_acceleration_library/inference.h"
Typedefs
-
typedef ov::Output<ov::Node> OVNode#
- file parameters.h
- #include <memory>
- #include "intel_npu_acceleration_library/common.h"
- #include "intel_npu_acceleration_library/conversion.h"
- dir include
- dir intel_npu_acceleration_library