doxygen/html_en/model_8h_source.html

// Copyright ⓒ 2019- Mobilint Inc. All rights reserved.


#ifndef QBRUNTIME_MODEL_H_

#define QBRUNTIME_MODEL_H_


#include <cstdint>

#ifndef _MSC_VER

#include <experimental/propagate_const>

#endif

#include <memory>

#include <string>

#include <vector>


#include "qbruntime/export.h"

#include "qbruntime/future.h"

#include "qbruntime/model_variant_handle.h"

#include "qbruntime/ndarray.h"

#include "qbruntime/status_code.h"

#include "qbruntime/type.h"


namespace mobilint {


class Accelerator;

class ModelImpl;


// Exported handle for a model created from an MXQ model file.
//
// Instances are obtained through the static create() factories (the default
// constructor is private). The class is move-only and owns a ModelImpl through
// mImpl. Most operations either return a StatusCode or take a `StatusCode& sc`
// parameter (presumably written with the result status - confirm against the
// implementation).
class QBRUNTIME_EXPORT Model {

public:

    // Creates a Model object from the specified MXQ model file.
    static std::unique_ptr<Model> create(const std::string& mxq_path, StatusCode& sc);


    // Creates a Model object from the specified MXQ model file and configuration.
    static std::unique_ptr<Model> create(const std::string& mxq_path,

                                         const ModelConfig& config, StatusCode& sc);


    // Move-only: copy construction/assignment are deleted, moves are noexcept.
    Model(const Model& other) = delete;

    Model(Model&& other) noexcept;

    Model& operator=(const Model& rhs) = delete;

    Model& operator=(Model&& rhs) noexcept;

    ~Model();


    // Launches the model on the specified Accelerator, which represents the
    // actual NPU.
    StatusCode launch(Accelerator& acc);


    // Disposes of the model loaded onto the NPU.
    StatusCode dispose();


    // Retrieves the core mode of the model.
    CoreMode getCoreMode() const;


    // Checks if the NPU core specified by `core_id` is the target of the model.
    bool isTarget(CoreId core_id) const;


    // Returns the NPU cores the model is configured to use.
    std::vector<CoreId> getTargetCores() const;


    // ---- float input, float output -------------------------------------

    // Performs inference.
    StatusCode infer(const std::vector<NDArray<float>>& input,

                     std::vector<NDArray<float>>& output);


    // Overload that returns the inference results directly.
    std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,

                                      StatusCode& sc);


    // Raw-pointer convenience overload; may result in additional data copies
    // within qbruntime.
    StatusCode infer(const std::vector<float*>& input,

                     std::vector<std::vector<float>>& output);


    // Raw-pointer convenience overload; may result in additional data copies
    // within qbruntime.
    std::vector<std::vector<float>> infer(const std::vector<float*>& input,

                                          StatusCode& sc);


    // Raw-pointer overloads that additionally take `shape`.
    StatusCode infer(const std::vector<float*>& input,

                     std::vector<std::vector<float>>& output,

                     const std::vector<std::vector<int64_t>>& shape);

    std::vector<std::vector<float>> infer(const std::vector<float*>& input,

                                          const std::vector<std::vector<int64_t>>& shape,

                                          StatusCode& sc);


    // This overload supports inference with KV cache.
    StatusCode infer(const std::vector<NDArray<float>>& input,

                     std::vector<NDArray<float>>& output, uint32_t cache_size);


    // This overload supports inference with BatchParam for BatchLLM.
    StatusCode infer(const std::vector<NDArray<float>>& input,

                     std::vector<NDArray<float>>& output,

                     const std::vector<BatchParam>& params);


    // This overload supports inference with KV cache.
    std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,

                                      uint32_t cache_size, StatusCode& sc);


    // This overload supports inference with BatchParam for BatchLLM.
    std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,

                                      const std::vector<BatchParam>& params,

                                      StatusCode& sc);


    // This overload supports inference with KV cache.
    StatusCode infer(const std::vector<float*>& input,

                     std::vector<std::vector<float>>& output,

                     const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);


    // This overload supports inference with KV cache.
    std::vector<std::vector<float>> infer(const std::vector<float*>& input,

                                          const std::vector<std::vector<int64_t>>& shape,

                                          uint32_t cache_size, StatusCode& sc);


    // This overload supports inference with BatchParam for BatchLLM.
    StatusCode infer(const std::vector<float*>& input,

                     std::vector<std::vector<float>>& output,

                     const std::vector<std::vector<int64_t>>& shape,

                     const std::vector<BatchParam>& params);


    // Result-returning form of the BatchParam raw-pointer overload.
    std::vector<std::vector<float>> infer(const std::vector<float*>& input,

                                          const std::vector<std::vector<int64_t>>& shape,

                                          const std::vector<BatchParam>& params,

                                          StatusCode& sc);


    // inferCHW: same overload set as infer() above.
    // NOTE(review): presumably takes channel-first (NCHW/CHW) input, as
    // inferAsyncCHW does - confirm against the implementation.
    StatusCode inferCHW(const std::vector<NDArray<float>>& input,

                        std::vector<NDArray<float>>& output);


    // Overload that returns the inference results directly.
    std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,

                                         StatusCode& sc);


    StatusCode inferCHW(const std::vector<float*>& input,

                        std::vector<std::vector<float>>& output);


    // Raw-pointer convenience overload; may result in additional data copies
    // within qbruntime.
    std::vector<std::vector<float>> inferCHW(const std::vector<float*>& input,

                                             StatusCode& sc);


    StatusCode inferCHW(const std::vector<float*>& input,

                        std::vector<std::vector<float>>& output,

                        const std::vector<std::vector<int64_t>>& shape);


    // Raw-pointer convenience overload; may result in additional data copies
    // within qbruntime.
    std::vector<std::vector<float>> inferCHW(

        const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,

        StatusCode& sc);


    // This overload supports inference with KV cache.
    StatusCode inferCHW(const std::vector<NDArray<float>>& input,

                        std::vector<NDArray<float>>& output, uint32_t cache_size);


    // This overload supports inference with KV cache.
    std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,

                                         uint32_t cache_size, StatusCode& sc);


    StatusCode inferCHW(const std::vector<float*>& input,

                        std::vector<std::vector<float>>& output,

                        const std::vector<std::vector<int64_t>>& shape,

                        uint32_t cache_size);


    // This overload supports inference with KV cache.
    std::vector<std::vector<float>> inferCHW(

        const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,

        uint32_t cache_size, StatusCode& sc);


    // ---- uint8_t input, float output -----------------------------------
    // The overload set mirrors the float-input one: plain, `shape`,
    // `cache_size` and BatchParam variants.

    StatusCode infer(const std::vector<NDArray<uint8_t>>& input,

                     std::vector<NDArray<float>>& output);

    std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,

                                      StatusCode& sc);

    StatusCode infer(const std::vector<uint8_t*>& input,

                     std::vector<std::vector<float>>& output);

    std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,

                                          StatusCode& sc);

    StatusCode infer(const std::vector<uint8_t*>& input,

                     std::vector<std::vector<float>>& output,

                     const std::vector<std::vector<int64_t>>& shape);

    std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,

                                          const std::vector<std::vector<int64_t>>& shape,

                                          StatusCode& sc);


    // uint8_t input with `cache_size`.
    StatusCode infer(const std::vector<NDArray<uint8_t>>& input,

                     std::vector<NDArray<float>>& output, uint32_t cache_size);

    std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,

                                      uint32_t cache_size, StatusCode& sc);

    StatusCode infer(const std::vector<uint8_t*>& input,

                     std::vector<std::vector<float>>& output,

                     const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);

    std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,

                                          const std::vector<std::vector<int64_t>>& shape,

                                          uint32_t cache_size, StatusCode& sc);


    // uint8_t input with BatchParam.
    StatusCode infer(const std::vector<NDArray<uint8_t>>& input,

                     std::vector<NDArray<float>>& output,

                     const std::vector<BatchParam>& params);

    std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,

                                      const std::vector<BatchParam>& params,

                                      StatusCode& sc);

    StatusCode infer(const std::vector<uint8_t*>& input,

                     std::vector<std::vector<float>>& output,

                     const std::vector<std::vector<int64_t>>& shape,

                     const std::vector<BatchParam>& params);

    std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,

                                          const std::vector<std::vector<int64_t>>& shape,

                                          const std::vector<BatchParam>& params,

                                          StatusCode& sc);


    // inferCHW with uint8_t input.
    StatusCode inferCHW(const std::vector<NDArray<uint8_t>>& input,

                        std::vector<NDArray<float>>& output);

    std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<uint8_t>>& input,

                                         StatusCode& sc);

    StatusCode inferCHW(const std::vector<uint8_t*>& input,

                        std::vector<std::vector<float>>& output);

    std::vector<std::vector<float>> inferCHW(const std::vector<uint8_t*>& input,

                                             StatusCode& sc);

    StatusCode inferCHW(const std::vector<uint8_t*>& input,

                        std::vector<std::vector<float>>& output,

                        const std::vector<std::vector<int64_t>>& shape);

    std::vector<std::vector<float>> inferCHW(

        const std::vector<uint8_t*>& input,

        const std::vector<std::vector<int64_t>>& shape, StatusCode& sc);


    // inferCHW with uint8_t input and `cache_size`.
    StatusCode inferCHW(const std::vector<NDArray<uint8_t>>& input,

                        std::vector<NDArray<float>>& output, uint32_t cache_size);

    std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<uint8_t>>& input,

                                         uint32_t cache_size, StatusCode& sc);

    StatusCode inferCHW(const std::vector<uint8_t*>& input,

                        std::vector<std::vector<float>>& output,

                        const std::vector<std::vector<int64_t>>& shape,

                        uint32_t cache_size);

    std::vector<std::vector<float>> inferCHW(

        const std::vector<uint8_t*>& input,

        const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size,

        StatusCode& sc);


    // ---- int8_t input, int8_t output -----------------------------------

    StatusCode infer(const std::vector<NDArray<int8_t>>& input,

                     std::vector<NDArray<int8_t>>& output);

    std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,

                                       StatusCode& sc);

    StatusCode infer(const std::vector<int8_t*>& input,

                     std::vector<std::vector<int8_t>>& output);

    std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,

                                           StatusCode& sc);

    StatusCode infer(const std::vector<int8_t*>& input,

                     std::vector<std::vector<int8_t>>& output,

                     const std::vector<std::vector<int64_t>>& shape);

    std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,

                                           const std::vector<std::vector<int64_t>>& shape,

                                           StatusCode& sc);


    // int8_t input/output with `cache_size`.
    StatusCode infer(const std::vector<NDArray<int8_t>>& input,

                     std::vector<NDArray<int8_t>>& output, uint32_t cache_size);

    std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,

                                       uint32_t cache_size, StatusCode& sc);

    StatusCode infer(const std::vector<int8_t*>& input,

                     std::vector<std::vector<int8_t>>& output,

                     const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);

    std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,

                                           const std::vector<std::vector<int64_t>>& shape,

                                           uint32_t cache_size, StatusCode& sc);


    // int8_t input/output with BatchParam.
    StatusCode infer(const std::vector<NDArray<int8_t>>& input,

                     std::vector<NDArray<int8_t>>& output,

                     const std::vector<BatchParam>& params);

    std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,

                                       const std::vector<BatchParam>& params,

                                       StatusCode& sc);

    StatusCode infer(const std::vector<int8_t*>& input,

                     std::vector<std::vector<int8_t>>& output,

                     const std::vector<std::vector<int64_t>>& shape,

                     const std::vector<BatchParam>& params);

    std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,

                                           const std::vector<std::vector<int64_t>>& shape,

                                           const std::vector<BatchParam>& params,

                                           StatusCode& sc);


    // inferCHW with int8_t input/output.
    StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,

                        std::vector<NDArray<int8_t>>& output);

    std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,

                                          StatusCode& sc);

    StatusCode inferCHW(const std::vector<int8_t*>& input,

                        std::vector<std::vector<int8_t>>& output);

    std::vector<std::vector<int8_t>> inferCHW(const std::vector<int8_t*>& input,

                                              StatusCode& sc);

    StatusCode inferCHW(const std::vector<int8_t*>& input,

                        std::vector<std::vector<int8_t>>& output,

                        const std::vector<std::vector<int64_t>>& shape);

    std::vector<std::vector<int8_t>> inferCHW(

        const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,

        StatusCode& sc);


    // inferCHW with int8_t input/output and `cache_size`.
    StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,

                        std::vector<NDArray<int8_t>>& output, uint32_t cache_size);

    std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,

                                          uint32_t cache_size, StatusCode& sc);

    StatusCode inferCHW(const std::vector<int8_t*>& input,

                        std::vector<std::vector<int8_t>>& output,

                        const std::vector<std::vector<int64_t>>& shape,

                        uint32_t cache_size);

    std::vector<std::vector<int8_t>> inferCHW(

        const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,

        uint32_t cache_size, StatusCode& sc);


    // ---- int8_t input, float output ------------------------------------

    std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,

                                             StatusCode& sc);

    std::vector<std::vector<float>> inferToFloat(const std::vector<int8_t*>& input,

                                                 StatusCode& sc);

    std::vector<std::vector<float>> inferToFloat(

        const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,

        StatusCode& sc);


    // inferToFloat with `cache_size`.
    std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,

                                             uint32_t cache_size, StatusCode& sc);

    std::vector<std::vector<float>> inferToFloat(

        const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,

        uint32_t cache_size, StatusCode& sc);


    // CHW variants of inferToFloat.
    std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,

                                                StatusCode& sc);

    std::vector<std::vector<float>> inferCHWToFloat(const std::vector<int8_t*>& input,

                                                    StatusCode& sc);

    std::vector<std::vector<float>> inferCHWToFloat(

        const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,

        StatusCode& sc);


    // inferCHWToFloat with `cache_size`.
    std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,

                                                uint32_t cache_size, StatusCode& sc);

    std::vector<std::vector<float>> inferCHWToFloat(

        const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,

        uint32_t cache_size, StatusCode& sc);


    // ---- Buffer-based inference ----------------------------------------
    // `shape` defaults to empty and `cache_size` to 0. The nested-vector
    // overloads presumably carry one inner vector per batch (compare
    // acquireInputBuffers) - confirm.

    StatusCode inferBuffer(const std::vector<Buffer>& input, std::vector<Buffer>& output,

                           const std::vector<std::vector<int64_t>>& shape = {},

                           uint32_t cache_size = 0);

    StatusCode inferBuffer(const std::vector<std::vector<Buffer>>& input,

                           std::vector<std::vector<Buffer>>& output,

                           const std::vector<std::vector<int64_t>>& shape = {},

                           uint32_t cache_size = 0);


    // Buffer input with float output, as NDArray<float> or std::vector<float>.
    StatusCode inferBufferToFloat(const std::vector<Buffer>& input,

                                  std::vector<NDArray<float>>& output,

                                  const std::vector<std::vector<int64_t>>& shape = {},

                                  uint32_t cache_size = 0);

    StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,

                                  std::vector<NDArray<float>>& output,

                                  const std::vector<std::vector<int64_t>>& shape = {},

                                  uint32_t cache_size = 0);

    StatusCode inferBufferToFloat(const std::vector<Buffer>& input,

                                  std::vector<std::vector<float>>& output,

                                  const std::vector<std::vector<int64_t>>& shape = {},

                                  uint32_t cache_size = 0);

    StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,

                                  std::vector<std::vector<float>>& output,

                                  const std::vector<std::vector<int64_t>>& shape = {},

                                  uint32_t cache_size = 0);


    // NOTE(review): semantics are not visible from this header; takes no input
    // data, only a model variant index (default 0).
    StatusCode inferSpeedrun(int variant_idx = 0);


    // ---- Asynchronous inference ----------------------------------------

    // Initiates asynchronous inference with input in NHWC (batch N, height H,
    // width W, channels C) or HWC format.
    Future<float> inferAsync(const std::vector<NDArray<float>>& input, StatusCode& sc);


    // Initiates asynchronous inference with input in NCHW (batch N, channels C,
    // height H, width W) format.
    Future<float> inferAsyncCHW(const std::vector<NDArray<float>>& input, StatusCode& sc);


    // This overload supports int8_t-to-int8_t asynchronous inference.
    Future<int8_t> inferAsync(const std::vector<NDArray<int8_t>>& input, StatusCode& sc);


    // This overload supports int8_t-to-int8_t asynchronous inference.
    Future<int8_t> inferAsyncCHW(const std::vector<NDArray<int8_t>>& input,

                                 StatusCode& sc);


    // This overload supports int8_t-to-float asynchronous inference.
    Future<float> inferAsyncToFloat(const std::vector<NDArray<int8_t>>& input,

                                    StatusCode& sc);


    // This overload supports int8_t-to-float asynchronous inference.
    Future<float> inferAsyncCHWToFloat(const std::vector<NDArray<int8_t>>& input,

                                       StatusCode& sc);


    // This overload supports uint8_t-to-float asynchronous inference.
    Future<float> inferAsync(const std::vector<NDArray<uint8_t>>& input, StatusCode& sc);


    // This overload supports uint8_t-to-float asynchronous inference.
    Future<float> inferAsyncCHW(const std::vector<NDArray<uint8_t>>& input,

                                StatusCode& sc);


    // Acquire buffer
    // The plural forms take a batch_size and return one inner vector of
    // Buffers per batch (presumably - confirm). `seqlens` defaults to empty.

    std::vector<Buffer> acquireInputBuffer(

        const std::vector<std::vector<int>>& seqlens = {}) const;

    std::vector<Buffer> acquireOutputBuffer(

        const std::vector<std::vector<int>>& seqlens = {}) const;

    std::vector<std::vector<Buffer>> acquireInputBuffers(

        const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;

    std::vector<std::vector<Buffer>> acquireOutputBuffers(

        const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;


    // Deallocate acquired Input/Output buffer

    StatusCode releaseBuffer(std::vector<Buffer>& buffer) const;

    StatusCode releaseBuffers(std::vector<std::vector<Buffer>>& buffers) const;


    // Reposition single batch
    // repositionInputs takes user data (`input`) and a mutable `input_buf`;
    // repositionOutputs takes `output_buf` and a mutable `output`.
    // NOTE(review): the exact layout transformation is not visible here.

    StatusCode repositionInputs(const std::vector<float*>& input,

                                std::vector<Buffer>& input_buf,

                                const std::vector<std::vector<int>>& seqlens = {}) const;

    StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,

                                 std::vector<float*>& output,

                                 const std::vector<std::vector<int>>& seqlens = {}) const;

    StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,

                                 std::vector<std::vector<float>>& output,

                                 const std::vector<std::vector<int>>& seqlens = {}) const;

    StatusCode repositionInputs(const std::vector<uint8_t*>& input,

                                std::vector<Buffer>& input_buf,

                                const std::vector<std::vector<int>>& seqlens = {}) const;


    // Reposition multiple batches

    StatusCode repositionInputs(const std::vector<float*>& input,

                                std::vector<std::vector<Buffer>>& input_buf,

                                const std::vector<std::vector<int>>& seqlens = {}) const;

    StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,

                                 std::vector<float*>& output,

                                 const std::vector<std::vector<int>>& seqlens = {}) const;

    StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,

                                 std::vector<std::vector<float>>& output,

                                 const std::vector<std::vector<int>>& seqlens = {}) const;

    StatusCode repositionInputs(const std::vector<uint8_t*>& input,

                                std::vector<std::vector<Buffer>>& input_buf,

                                const std::vector<std::vector<int>>& seqlens = {}) const;


    // Returns the total number of model variants available in this model.
    int getNumModelVariants() const;


    // Retrieves a handle to the specified model variant.
    std::unique_ptr<ModelVariantHandle> getModelVariantHandle(int variant_idx,

                                                              StatusCode& sc) const;


    // Returns the input shape of the model.
    const std::vector<std::vector<int64_t>>& getModelInputShape() const;


    // Returns the output shape of the model.
    const std::vector<std::vector<int64_t>>& getModelOutputShape() const;


    // Returns the input buffer information for the model.
    const std::vector<BufferInfo>& getInputBufferInfo() const;


    // Returns the output buffer information of the model.
    const std::vector<BufferInfo>& getOutputBufferInfo() const;


    // Input-side counterpart of getOutputScale(); presumably the input
    // quantization scale(s) - confirm.
    std::vector<Scale> getInputScale() const;


    // Returns the output quantization scale(s) of the model.
    std::vector<Scale> getOutputScale() const;


    // Returns a data type for model inputs.
    DataType getModelInputDataType() const;


    // Returns a data type for model outputs.
    DataType getModelOutputDataType() const;


    // Returns the model's unique identifier.
    uint32_t getIdentifier() const;


    // Returns the path to the MXQ model file associated with the Model.
    std::string getModelPath() const;


    // Returns information about the KV cache of the model.
    std::vector<CacheInfo> getCacheInfos() const;


    // Dumps the KV cache memory into buffers.
    StatusCode dumpCacheMemory(std::vector<std::vector<int8_t>>& bufs, int cache_id = 0);


    // Dumps the KV cache memory into buffers and returns them.
    std::vector<std::vector<int8_t>> dumpCacheMemory(StatusCode& sc);

    std::vector<std::vector<int8_t>> dumpCacheMemory(int cache_id, StatusCode& sc);


    // Dumps KV cache memory to files in the specified directory.
    StatusCode dumpCacheMemory(const std::string& cache_dir, int cache_id = 0);


    // Loads the KV cache memory from buffers.
    StatusCode loadCacheMemory(const std::vector<std::vector<int8_t>>& bufs,

                               int cache_id = 0);


    // Loads the KV cache memory from files in the specified directory.
    StatusCode loadCacheMemory(const std::string& cache_dir, int cache_id = 0);


    // Filters the tail of the KV cache memory.
    // NOTE(review): meaning of the returned int is not visible here.
    int filterCacheTail(int cache_size, int tail_size, const std::vector<bool>& mask,

                        StatusCode& sc);


    // Moves the tail of the KV cache memory to the end of the head.
    // NOTE(review): meaning of the returned int is not visible here.
    int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode& sc);


    // Raw-pointer overloads that take an explicit `batch_size`.
    StatusCode infer(const std::vector<float*>& input,

                     std::vector<std::vector<float>>& output, int batch_size);


    std::vector<std::vector<float>> infer(const std::vector<float*>& input,

                                          int batch_size, StatusCode& sc);


    // NOTE(review): latency accessors indexed by `npu_op_idx`; units and the
    // exact difference between "consumed" and "finished" are not visible here.
    uint64_t getLatencyConsumed(const int npu_op_idx) const;


    uint64_t getLatencyFinished(const int npu_op_idx) const;


private:

    // Private: instances are created through create().
    Model();


    // Owning pointer to the implementation. On non-MSVC toolchains it is
    // wrapped in std::experimental::propagate_const; MSVC builds use a plain
    // unique_ptr instead.
#ifndef _MSC_VER

    std::experimental::propagate_const<std::unique_ptr<ModelImpl>> mImpl;

#else

    std::unique_ptr<ModelImpl> mImpl;

#endif


    // Accelerator is granted access to the private members.
    friend class Accelerator;

};


}  // namespace mobilint


#endif

mobilint::Accelerator
Represents an accelerator, i.e., an NPU, used for executing models.
Definition acc.h:33

mobilint::Future
Represents a future for retrieving the result of asynchronous inference.
Definition future.h:43

mobilint::ModelConfig
Configures a core mode and core allocation of a model for NPU inference.
Definition type.h:235

mobilint::Model::infer
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, const std::vector< BatchParam > &params)
This overload supports inference with BatchParam for BatchLLM.

mobilint::Model::getModelPath
std::string getModelPath() const
Returns the path to the MXQ model file associated with the Model.

mobilint::Model::inferAsyncToFloat
Future< float > inferAsyncToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.

mobilint::Model::getLatencyConsumed
uint64_t getLatencyConsumed(const int npu_op_idx) const

mobilint::Model::infer
std::vector< std::vector< float > > infer(const std::vector< float * > &input, int batch_size, StatusCode &sc)

mobilint::Model::isTarget
bool isTarget(CoreId core_id) const
Checks if the NPU core specified by CoreId is the target of the model. In other words,...

mobilint::Model::create
static std::unique_ptr< Model > create(const std::string &mxq_path, StatusCode &sc)
Creates a Model object from the specified MXQ model file.

mobilint::Model::inferAsyncCHW
Future< float > inferAsyncCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NCHW (batch N, channels C, height H,...

mobilint::Model::infer
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...

mobilint::Model::inferAsync
Future< int8_t > inferAsync(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.

mobilint::Model::infer
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, const std::vector< BatchParam > &params, StatusCode &sc)
This overload supports inference with BatchParam for BatchLLM.

mobilint::Model::inferAsyncCHWToFloat
Future< float > inferAsyncCHWToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.

mobilint::Model::infer
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, const std::vector< BatchParam > &params)
This overload supports inference with BatchParam for BatchLLM.

mobilint::Model::getOutputScale
std::vector< Scale > getOutputScale() const
Returns the output quantization scale(s) of the model.

mobilint::Model::loadCacheMemory
StatusCode loadCacheMemory(const std::vector< std::vector< int8_t > > &bufs, int cache_id=0)
Loads the KV cache memory from buffers.

mobilint::Model::dumpCacheMemory
StatusCode dumpCacheMemory(std::vector< std::vector< int8_t > > &bufs, int cache_id=0)
Dumps the KV cache memory into buffers.

mobilint::Model::getModelOutputShape
const std::vector< std::vector< int64_t > > & getModelOutputShape() const
Returns the output shape of the model.

mobilint::Model::getModelVariantHandle
std::unique_ptr< ModelVariantHandle > getModelVariantHandle(int variant_idx, StatusCode &sc) const
Retrieves a handle to the specified model variant.

mobilint::Model::loadCacheMemory
StatusCode loadCacheMemory(const std::string &cache_dir, int cache_id=0)
Loads the KV cache memory from files in the specified directory.

mobilint::Model::launch
StatusCode launch(Accelerator &acc)
Launches the model on the specified Accelerator, which represents the actual NPU.

mobilint::Model::inferAsync
Future< float > inferAsync(const std::vector< NDArray< uint8_t > > &input, StatusCode &sc)
This overload supports uint8_t-to-float asynchronous inference.

mobilint::Model::getTargetCores
std::vector< CoreId > getTargetCores() const
Returns the NPU cores the model is configured to use.

mobilint::Model::inferCHW
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::inferCHW
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::getInputBufferInfo
const std::vector< BufferInfo > & getInputBufferInfo() const
Returns the input buffer information for the model.

mobilint::Model::moveCacheTail
int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode &sc)
Moves the tail of the KV cache memory to the end of the head.

mobilint::Model::getModelInputDataType
DataType getModelInputDataType() const
Returns a data type for model inputs.

mobilint::Model::infer
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.

mobilint::Model::infer
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.

mobilint::Model::getCoreMode
CoreMode getCoreMode() const
Retrieves the core mode of the model.

mobilint::Model::getNumModelVariants
int getNumModelVariants() const
Returns the total number of model variants available in this model.

mobilint::Model::inferAsync
Future< float > inferAsync(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NHWC (batch N, height H, width W, channels C) or HWC f...

mobilint::Model::getIdentifier
uint32_t getIdentifier() const
Returns the model's unique identifier.

mobilint::Model::inferCHW
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...

mobilint::Model::infer
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.

mobilint::Model::getModelOutputDataType
DataType getModelOutputDataType() const
Returns a data type for model outputs.

mobilint::Model::dispose
StatusCode dispose()
Disposes of the model loaded onto the NPU.

mobilint::Model::inferCHW
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.

mobilint::Model::create
static std::unique_ptr< Model > create(const std::string &mxq_path, const ModelConfig &config, StatusCode &sc)
Creates a Model object from the specified MXQ model file and configuration.

mobilint::Model::filterCacheTail
int filterCacheTail(int cache_size, int tail_size, const std::vector< bool > &mask, StatusCode &sc)
Filters the tail of the KV cache memory.

mobilint::Model::getModelInputShape
const std::vector< std::vector< int64_t > > & getModelInputShape() const
Returns the input shape of the model.

mobilint::Model::inferCHW
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.

mobilint::Model::inferAsyncCHW
Future< int8_t > inferAsyncCHW(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.

mobilint::Model::dumpCacheMemory
StatusCode dumpCacheMemory(const std::string &cache_dir, int cache_id=0)
Dumps KV cache memory to files in the specified directory.

mobilint::Model::infer
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::getOutputBufferInfo
const std::vector< BufferInfo > & getOutputBufferInfo() const
Returns the output buffer information of the model.

mobilint::Model::infer
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.

mobilint::Model::getCacheInfos
std::vector< CacheInfo > getCacheInfos() const
Returns information about the KV cache of the model.

mobilint::Model::infer
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.

mobilint::Model::inferAsyncCHW
Future< float > inferAsyncCHW(const std::vector< NDArray< uint8_t > > &input, StatusCode &sc)
This overload supports uint8_t-to-float asynchronous inference.

mobilint::Model::infer
std::vector< std::vector< float > > infer(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::inferCHW
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.

mobilint::Model::infer
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::infer
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, int batch_size)

mobilint::Model::dumpCacheMemory
std::vector< std::vector< int8_t > > dumpCacheMemory(StatusCode &sc)
Dumps the KV cache memory into buffers.

mobilint::Model::inferCHW
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::getInputScale
std::vector< Scale > getInputScale() const
Returns the input quantization scale(s) of the model.

mobilint::Model::inferSpeedrun
StatusCode inferSpeedrun(int variant_idx=0)
Development-only API for measuring pure NPU inference speed.

mobilint::Model::inferCHW
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.

mobilint::Model::infer
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::inferCHW
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the qbrunti...

mobilint::Model::getLatencyFinished
uint64_t getLatencyFinished(const int npu_op_idx) const

mobilint::Model::inferCHW
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.

mobilint::Model::dumpCacheMemory
std::vector< std::vector< int8_t > > dumpCacheMemory(int cache_id, StatusCode &sc)
Dumps the KV cache memory into buffers.

mobilint::Model::infer
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, const std::vector< BatchParam > &params, StatusCode &sc)
This overload supports inference with BatchParam for BatchLLM.

mobilint::NDArray
A class representing an N-dimensional array (NDArray).
Definition ndarray.h:77

future.h

mobilint::DataType
DataType
DataType.
Definition type.h:508

mobilint::CoreMode
CoreMode
Defines the core mode for NPU execution.
Definition type.h:170

mobilint::StatusCode
StatusCode
Enumerates status codes for the qbruntime.
Definition status_code.h:26

model_variant_handle.h

ndarray.h

status_code.h

mobilint::CoreId
Represents a unique identifier for an NPU core.
Definition type.h:123

type.h