model.h Source File

model.h Source File

Runtime Library: model.h Source File
Runtime Library v0.30
Mobilint SDK qb
model.h
Go to the documentation of this file.
1// Copyright ⓒ 2019- Mobilint Inc. All rights reserved.
5
6#ifndef MACCEL_MODEL_H_
7#define MACCEL_MODEL_H_
8
9#include <cstdint>
10#ifndef _MSC_VER
11#include <experimental/propagate_const>
12#endif
13#include <memory>
14#include <string>
15#include <vector>
16
17#include "maccel/export.h"
18#include "maccel/future.h"
20#include "maccel/ndarray.h"
21#include "maccel/status_code.h"
22#include "maccel/type.h"
23
24namespace mobilint {
25
30
31class Accelerator;
32class ModelImpl;
33
// Represents a model created from an MXQ model file. A Model can be launched
// on an Accelerator and exposes synchronous, asynchronous, and Buffer-based
// inference entry points, plus KV-cache utilities and model metadata getters.
// Copying is disabled; the type is move-only.
//
// NOTE(review): the number fused to the start of each line is the line number
// of the rendered source listing, not part of the C++ source.
// NOTE(review): the member index for this page also lists dispose(),
// getCoreMode(), getNumModelVariants(), resetCacheMemory(),
// getSchedulePolicy(), getLatencySetPolicy(), and getMaintenancePolicy(),
// none of which appear in this listing. They were presumably dropped during
// extraction; confirm against the real header.
40class MACCEL_EXPORT Model {
41public:
// Creates a Model object from the specified MXQ model file.
56 static std::unique_ptr<Model> create(const std::string& mxq_path, StatusCode& sc);
57
// Creates a Model object from the specified MXQ model file and configuration.
73 static std::unique_ptr<Model> create(const std::string& mxq_path,
74 const ModelConfig& config, StatusCode& sc);
75
// Copy construction and copy assignment are deleted; moves are noexcept.
76 Model(const Model& other) = delete;
77 Model(Model&& other) noexcept;
78 Model& operator=(const Model& rhs) = delete;
79 Model& operator=(Model&& rhs) noexcept;
80 ~Model();
81
// Launches the model on the specified Accelerator, which represents the
// actual NPU.
90 StatusCode launch(Accelerator& acc);
91
101
108
// Checks if the NPU core specified by CoreId is the target of the model.
117 bool isTarget(CoreId core_id) const;
118
// Returns the NPU cores the model is configured to use.
124 std::vector<CoreId> getTargetCores() const;
125
143
// ---- Synchronous inference, float input / float output ----

// Performs inference.
154 StatusCode infer(const std::vector<NDArray<float>>& input,
155 std::vector<NDArray<float>>& output);
156
// Overload that returns the inference results directly instead of filling an
// output argument.
168 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
169 StatusCode& sc);
170
// Convenience overload taking raw float pointers; may result in additional
// data copies.
182 StatusCode infer(const std::vector<float*>& input,
183 std::vector<std::vector<float>>& output);
184
// Convenience overload taking raw float pointers and returning the results;
// may result in additional data copies.
199 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
200 StatusCode& sc);
201
// Convenience overload with an explicit `shape` argument; may result in
// additional data copies.
218 StatusCode infer(const std::vector<float*>& input,
219 std::vector<std::vector<float>>& output,
220 const std::vector<std::vector<int64_t>>& shape);
// Convenience overload with an explicit `shape` argument that returns the
// results; may result in additional data copies.
237 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
238 const std::vector<std::vector<int64_t>>& shape,
239 StatusCode& sc);
240
// This overload supports inference with KV cache.
254 StatusCode infer(const std::vector<NDArray<float>>& input,
255 std::vector<NDArray<float>>& output, uint32_t cache_size);
256
// This overload supports inference with KV cache.
273 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
274 uint32_t cache_size, StatusCode& sc);
275
// This overload supports inference with KV cache.
291 StatusCode infer(const std::vector<float*>& input,
292 std::vector<std::vector<float>>& output,
293 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
294
// This overload supports inference with KV cache.
313 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
314 const std::vector<std::vector<int64_t>>& shape,
315 uint32_t cache_size, StatusCode& sc);
316
318
343
// ---- Synchronous inference (CHW entry points), float input / float output ----
// NOTE(review): presumably these expect channel-first input, as the member
// index describes for inferAsyncCHW; confirm against the real header.

// Performs inference.
354 StatusCode inferCHW(const std::vector<NDArray<float>>& input,
355 std::vector<NDArray<float>>& output);
356
// Overload that returns the inference results directly instead of filling an
// output argument.
368 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,
369 StatusCode& sc);
370
// Convenience overload taking raw float pointers; may result in additional
// data copies.
382 StatusCode inferCHW(const std::vector<float*>& input,
383 std::vector<std::vector<float>>& output);
384
// Convenience overload taking raw float pointers and returning the results;
// may result in additional data copies.
399 std::vector<std::vector<float>> inferCHW(const std::vector<float*>& input,
400 StatusCode& sc);
401
// Convenience overload with an explicit `shape` argument; may result in
// additional data copies.
418 StatusCode inferCHW(const std::vector<float*>& input,
419 std::vector<std::vector<float>>& output,
420 const std::vector<std::vector<int64_t>>& shape);
421
// Convenience overload with an explicit `shape` argument that returns the
// results; may result in additional data copies.
438 std::vector<std::vector<float>> inferCHW(
439 const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,
440 StatusCode& sc);
441
// This overload supports inference with KV cache.
455 StatusCode inferCHW(const std::vector<NDArray<float>>& input,
456 std::vector<NDArray<float>>& output, uint32_t cache_size);
457
// This overload supports inference with KV cache.
474 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,
475 uint32_t cache_size, StatusCode& sc);
476
// This overload supports inference with KV cache.
492 StatusCode inferCHW(const std::vector<float*>& input,
493 std::vector<std::vector<float>>& output,
494 const std::vector<std::vector<int64_t>>& shape,
495 uint32_t cache_size);
496
// This overload supports inference with KV cache.
515 std::vector<std::vector<float>> inferCHW(
516 const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,
517 uint32_t cache_size, StatusCode& sc);
518
520
533
// ---- infer(), int8_t input / int8_t output ----
// Same overload set as the float infer() declarations above, with int8_t
// element types for both inputs and outputs.
534 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
535 std::vector<NDArray<int8_t>>& output);
536 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
537 StatusCode& sc);
538 StatusCode infer(const std::vector<int8_t*>& input,
539 std::vector<std::vector<int8_t>>& output);
540 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
541 StatusCode& sc);
542 StatusCode infer(const std::vector<int8_t*>& input,
543 std::vector<std::vector<int8_t>>& output,
544 const std::vector<std::vector<int64_t>>& shape);
545 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
546 const std::vector<std::vector<int64_t>>& shape,
547 StatusCode& sc);
548
// int8_t overloads that additionally take cache_size.
549 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
550 std::vector<NDArray<int8_t>>& output, uint32_t cache_size);
551 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
552 uint32_t cache_size, StatusCode& sc);
553 StatusCode infer(const std::vector<int8_t*>& input,
554 std::vector<std::vector<int8_t>>& output,
555 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
556 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
557 const std::vector<std::vector<int64_t>>& shape,
558 uint32_t cache_size, StatusCode& sc);
559
561
// ---- inferCHW(), int8_t input / int8_t output ----
// Same overload set as the float inferCHW() declarations above, with int8_t
// element types for both inputs and outputs.
574 StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,
575 std::vector<NDArray<int8_t>>& output);
576 std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,
577 StatusCode& sc);
578 StatusCode inferCHW(const std::vector<int8_t*>& input,
579 std::vector<std::vector<int8_t>>& output);
580 std::vector<std::vector<int8_t>> inferCHW(const std::vector<int8_t*>& input,
581 StatusCode& sc);
582 StatusCode inferCHW(const std::vector<int8_t*>& input,
583 std::vector<std::vector<int8_t>>& output,
584 const std::vector<std::vector<int64_t>>& shape);
585 std::vector<std::vector<int8_t>> inferCHW(
586 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
587 StatusCode& sc);
588
// int8_t overloads that additionally take cache_size.
589 StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,
590 std::vector<NDArray<int8_t>>& output, uint32_t cache_size);
591 std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,
592 uint32_t cache_size, StatusCode& sc);
593 StatusCode inferCHW(const std::vector<int8_t*>& input,
594 std::vector<std::vector<int8_t>>& output,
595 const std::vector<std::vector<int64_t>>& shape,
596 uint32_t cache_size);
597 std::vector<std::vector<int8_t>> inferCHW(
598 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
599 uint32_t cache_size, StatusCode& sc);
601
// ---- inferToFloat(): int8_t input, float output ----
// All overloads return the results directly and take a StatusCode& argument.
614 std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,
615 StatusCode& sc);
616 std::vector<std::vector<float>> inferToFloat(const std::vector<int8_t*>& input,
617 StatusCode& sc);
618 std::vector<std::vector<float>> inferToFloat(
619 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
620 StatusCode& sc);
621
// inferToFloat() overloads that additionally take cache_size.
622 std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,
623 uint32_t cache_size, StatusCode& sc);
624 std::vector<std::vector<float>> inferToFloat(
625 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
626 uint32_t cache_size, StatusCode& sc);
628
// ---- inferCHWToFloat(): int8_t input, float output ----
// Mirrors the inferToFloat() overload set under the CHW name.
641 std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,
642 StatusCode& sc);
643 std::vector<std::vector<float>> inferCHWToFloat(const std::vector<int8_t*>& input,
644 StatusCode& sc);
645 std::vector<std::vector<float>> inferCHWToFloat(
646 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
647 StatusCode& sc);
648
// inferCHWToFloat() overloads that additionally take cache_size.
649 std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,
650 uint32_t cache_size, StatusCode& sc);
651 std::vector<std::vector<float>> inferCHWToFloat(
652 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
653 uint32_t cache_size, StatusCode& sc);
655
// ---- Buffer-based inference ----
// Inference on Buffer objects, either one vector of Buffers or a vector of
// such vectors. `shape` defaults to empty and `cache_size` defaults to 0.
678 StatusCode inferBuffer(const std::vector<Buffer>& input, std::vector<Buffer>& output,
679 const std::vector<std::vector<int64_t>>& shape = {},
680 uint32_t cache_size = 0);
681 StatusCode inferBuffer(const std::vector<std::vector<Buffer>>& input,
682 std::vector<std::vector<Buffer>>& output,
683 const std::vector<std::vector<int64_t>>& shape = {},
684 uint32_t cache_size = 0);
686
// Buffer input with float output, written either to NDArray<float> objects or
// to plain std::vector<float> containers. Defaults are the same as
// inferBuffer().
704 StatusCode inferBufferToFloat(const std::vector<Buffer>& input,
705 std::vector<NDArray<float>>& output,
706 const std::vector<std::vector<int64_t>>& shape = {},
707 uint32_t cache_size = 0);
708 StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,
709 std::vector<NDArray<float>>& output,
710 const std::vector<std::vector<int64_t>>& shape = {},
711 uint32_t cache_size = 0);
712 StatusCode inferBufferToFloat(const std::vector<Buffer>& input,
713 std::vector<std::vector<float>>& output,
714 const std::vector<std::vector<int64_t>>& shape = {},
715 uint32_t cache_size = 0);
716 StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,
717 std::vector<std::vector<float>>& output,
718 const std::vector<std::vector<int64_t>>& shape = {},
719 uint32_t cache_size = 0);
721
// Development-only API for measuring pure NPU inference speed.
// variant_idx defaults to 0.
730 StatusCode inferSpeedrun(int variant_idx = 0);
731
786
// ---- Asynchronous inference ----

// Initiates asynchronous inference with input in NHWC (batch N, height H,
// width W, channels C) format; the result is retrieved through the returned
// Future.
798 Future<float> inferAsync(const std::vector<NDArray<float>>& input, StatusCode& sc);
799
// Initiates asynchronous inference with input in NCHW (batch N, channels C,
// height H, ...) format.
811 Future<float> inferAsyncCHW(const std::vector<NDArray<float>>& input, StatusCode& sc);
812
// This overload supports int8_t-to-int8_t asynchronous inference.
823 Future<int8_t> inferAsync(const std::vector<NDArray<int8_t>>& input, StatusCode& sc);
824
// NOTE(review): the first line of each of the next three declarations
// (listing lines 835, 848, 861) is missing from this listing. The member index
// for this page lists Future<int8_t> inferAsyncCHW(...),
// Future<float> inferAsyncToFloat(...), and
// Future<float> inferAsyncCHWToFloat(...), each taking
// (const std::vector<NDArray<int8_t>>& input, StatusCode& sc). Presumably
// those are the truncated declarations, in some order; confirm against the
// real header.
836 StatusCode& sc);
837
849 StatusCode& sc);
850
862 StatusCode& sc);
863
865
887
// ---- Buffer management ----
// In the declarations below, `seqlens` defaults to an empty vector.
888 // Acquire buffer
889 std::vector<Buffer> acquireInputBuffer(
890 const std::vector<std::vector<int>>& seqlens = {}) const;
891 std::vector<Buffer> acquireOutputBuffer(
892 const std::vector<std::vector<int>>& seqlens = {}) const;
893 std::vector<std::vector<Buffer>> acquireInputBuffers(
894 const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;
895 std::vector<std::vector<Buffer>> acquireOutputBuffers(
896 const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;
897
898 // Deallocate acquired Input/Output buffer
899 StatusCode releaseBuffer(std::vector<Buffer>& buffer) const;
900 StatusCode releaseBuffers(std::vector<std::vector<Buffer>>& buffers) const;
901
902 // Reposition single batch
903 StatusCode repositionInputs(const std::vector<float*>& input,
904 std::vector<Buffer>& input_buf,
905 const std::vector<std::vector<int>>& seqlens = {}) const;
906 StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,
907 std::vector<float*>& output,
908 const std::vector<std::vector<int>>& seqlens = {}) const;
909 StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,
910 std::vector<std::vector<float>>& output,
911 const std::vector<std::vector<int>>& seqlens = {}) const;
912
913 // Reposition multiple batches
914 StatusCode repositionInputs(const std::vector<float*>& input,
915 std::vector<std::vector<Buffer>>& input_buf,
916 const std::vector<std::vector<int>>& seqlens = {}) const;
917 StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,
918 std::vector<float*>& output,
919 const std::vector<std::vector<int>>& seqlens = {}) const;
920 StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,
921 std::vector<std::vector<float>>& output,
922 const std::vector<std::vector<int>>& seqlens = {}) const;
923
925
935
// ---- Model metadata ----

// Retrieves a handle to the specified model variant.
949 std::unique_ptr<ModelVariantHandle> getModelVariantHandle(int variant_idx,
950 StatusCode& sc) const;
951
// Returns the input shape of the model.
957 const std::vector<std::vector<int64_t>>& getModelInputShape() const;
958
// Returns the output shape of the model.
964 const std::vector<std::vector<int64_t>>& getModelOutputShape() const;
965
// Returns the input buffer information for the model.
971 const std::vector<BufferInfo>& getInputBufferInfo() const;
972
// Returns the output buffer information of the model.
978 const std::vector<BufferInfo>& getOutputBufferInfo() const;
979
// Returns the input quantization scale(s) of the model.
985 std::vector<Scale> getInputScale() const;
986
// Returns the output quantization scale(s) of the model.
992 std::vector<Scale> getOutputScale() const;
993
// Returns the model's unique identifier.
1002 uint32_t getIdentifier() const;
1003
// Returns the path to the MXQ model file associated with the Model.
1009 std::string getModelPath() const;
1010
// Returns information about the KV cache of the model.
1016 std::vector<CacheInfo> getCacheInfos() const;
1017
1024
1031
// ---- KV cache utilities ----

// Dumps the KV cache memory into buffers.
1042 StatusCode dumpCacheMemory(std::vector<std::vector<int8_t>>& bufs);
1043
// Dumps the KV cache memory into buffers and returns them.
1053 std::vector<std::vector<int8_t>> dumpCacheMemory(StatusCode& sc);
1054
// Dumps KV cache memory to files in the specified directory.
1065 StatusCode dumpCacheMemory(const std::string& cache_dir);
1066
// Loads the KV cache memory from buffers.
1077 StatusCode loadCacheMemory(const std::vector<std::vector<int8_t>>& bufs);
1078
// Loads the KV cache memory from files in the specified directory.
1089 StatusCode loadCacheMemory(const std::string& cache_dir);
1090
// Filters the tail of the KV cache memory.
1104 int filterCacheTail(int cache_size, int tail_size, const std::vector<bool>& mask,
1105 StatusCode& sc);
1106
// Moves the tail of the KV cache memory to the end of the head.
1120 int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode& sc);
1121
1123
1130
// The remaining public members have no description in the member index for
// this page.
// NOTE(review): their contracts cannot be determined from this listing;
// consult the real header.
1134 StatusCode infer(const std::vector<float*>& input,
1135 std::vector<std::vector<float>>& output, int batch_size);
1136
1140 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
1141 int batch_size, StatusCode& sc);
1142
1146 StatusCode inferHeightBatch(const std::vector<float*>& input,
1147 std::vector<std::vector<float>>& output,
1148 int height_batch_size);
1149
1154
1159
1164
1168 uint64_t getLatencyConsumed(const int npu_op_idx) const;
1169
1173 uint64_t getLatencyFinished(const int npu_op_idx) const;
1174
1178 std::shared_ptr<Statistics> getStatistics() const;
1179
1181
1182private:
// The default constructor is private; the public way to obtain a Model is one
// of the static create() functions.
1183 Model();
1184
// Owning pointer to the forward-declared ModelImpl. Outside MSVC it is wrapped
// in std::experimental::propagate_const, so a const Model only gets const
// access to the implementation. The MSVC branch uses a plain std::unique_ptr
// because <experimental/propagate_const> is not included there.
1185#ifndef _MSC_VER
1186 std::experimental::propagate_const<std::unique_ptr<ModelImpl>> mImpl;
1187#else
1188 std::unique_ptr<ModelImpl> mImpl;
1189#endif
1190
// Accelerator is granted access to Model's private members.
1191 friend class Accelerator;
1192};
1193
1195
1196} // namespace mobilint
1197
1198#endif
Represents an accelerator, i.e., an NPU, used for executing models.
Definition acc.h:66
Represents a future for retrieving the result of asynchronous inference.
Definition future.h:43
Configures a core mode and core allocation of a model for NPU inference.
Definition type.h:257
std::string getModelPath() const
Returns the path to the MXQ model file associated with the Model.
Future< float > inferAsyncToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.
uint64_t getLatencyConsumed(const int npu_op_idx) const
std::vector< std::vector< float > > infer(const std::vector< float * > &input, int batch_size, StatusCode &sc)
StatusCode inferHeightBatch(const std::vector< float * > &input, std::vector< std::vector< float > > &output, int height_batch_size)
bool isTarget(CoreId core_id) const
Checks if the NPU core specified by CoreId is the target of the model. In other words,...
static std::unique_ptr< Model > create(const std::string &mxq_path, StatusCode &sc)
Creates a Model object from the specified MXQ model file.
Future< float > inferAsyncCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NCHW (batch N, channels C, height H,...
std::shared_ptr< Statistics > getStatistics() const
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...
Future< int8_t > inferAsync(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.
StatusCode loadCacheMemory(const std::vector< std::vector< int8_t > > &bufs)
Loads the KV cache memory from buffers.
Future< float > inferAsyncCHWToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.
SchedulePolicy getSchedulePolicy() const
std::vector< Scale > getOutputScale() const
Returns the output quantization scale(s) of the model.
StatusCode loadCacheMemory(const std::string &cache_dir)
Loads the KV cache memory from files in the specified directory.
StatusCode dumpCacheMemory(const std::string &cache_dir)
Dumps KV cache memory to files in the specified directory.
const std::vector< std::vector< int64_t > > & getModelOutputShape() const
Returns the output shape of the model.
std::unique_ptr< ModelVariantHandle > getModelVariantHandle(int variant_idx, StatusCode &sc) const
Retrieves a handle to the specified model variant.
StatusCode launch(Accelerator &acc)
Launches the model on the specified Accelerator, which represents the actual NPU.
LatencySetPolicy getLatencySetPolicy() const
std::vector< CoreId > getTargetCores() const
Returns the NPU cores the model is configured to use.
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the maccel ...
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the maccel ...
const std::vector< BufferInfo > & getInputBufferInfo() const
Returns the input buffer information for the model.
int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode &sc)
Moves the tail of the KV cache memory to the end of the head.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.
CoreMode getCoreMode() const
Retrieves the core mode of the model.
int getNumModelVariants() const
Returns the total number of model variants available in this model.
Future< float > inferAsync(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NHWC (batch N, height H, width W, channels C) or HWC f...
uint32_t getIdentifier() const
Returns the model's unique identifier.
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
StatusCode dispose()
Disposes of the model loaded onto the NPU.
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.
static std::unique_ptr< Model > create(const std::string &mxq_path, const ModelConfig &config, StatusCode &sc)
Creates a Model object from the specified MXQ model file and configuration.
int filterCacheTail(int cache_size, int tail_size, const std::vector< bool > &mask, StatusCode &sc)
Filters the tail of the KV cache memory.
const std::vector< std::vector< int64_t > > & getModelInputShape() const
Returns the input shape of the model.
void resetCacheMemory()
Resets the KV cache memory.
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
MaintenancePolicy getMaintenancePolicy() const
Future< int8_t > inferAsyncCHW(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the maccel ...
const std::vector< BufferInfo > & getOutputBufferInfo() const
Returns the output buffer information of the model.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.
std::vector< CacheInfo > getCacheInfos() const
Returns information about the KV cache of the model.
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the maccel ...
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the maccel ...
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, int batch_size)
std::vector< std::vector< int8_t > > dumpCacheMemory(StatusCode &sc)
Dumps the KV cache memory into buffers.
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the maccel ...
std::vector< Scale > getInputScale() const
Returns the input quantization scale(s) of the model.
StatusCode inferSpeedrun(int variant_idx=0)
Development-only API for measuring pure NPU inference speed.
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the maccel ...
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the maccel ...
uint64_t getLatencyFinished(const int npu_op_idx) const
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.
StatusCode dumpCacheMemory(std::vector< std::vector< int8_t > > &bufs)
Dumps the KV cache memory into buffers.
A class representing an N-dimensional array (NDArray).
Definition ndarray.h:77
MaintenancePolicy
Definition type.h:94
SchedulePolicy
Definition type.h:76
CoreMode
Defines the core mode for NPU execution.
Definition type.h:193
StatusCode
Enumerates status codes for the maccel runtime.
Definition status_code.h:26
LatencySetPolicy
Definition type.h:86
Represents a unique identifier for an NPU core.
Definition type.h:147