model.h Source File

model.h Source File

SDK qb Runtime Library: model.h Source File
SDK qb Runtime Library v1.2
MCS001-
model.h
Go to the documentation of this file.
1// Copyright ⓒ 2019- Mobilint Inc. All rights reserved.
5
6#ifndef QBRUNTIME_MODEL_H_
7#define QBRUNTIME_MODEL_H_
8
9#include <cstdint>
10#ifndef _MSC_VER
11#include <experimental/propagate_const>
12#endif
13#include <memory>
14#include <string>
15#include <vector>
16
17#include "qbruntime/export.h"
18#include "qbruntime/future.h"
20#include "qbruntime/ndarray.h"
22#include "qbruntime/type.h"
23
24namespace mobilint {
25
30
31class Accelerator;
32class ModelImpl;
33
// Model: exported, move-only class declaring model creation from an MXQ file,
// launching on an Accelerator, synchronous and asynchronous inference, buffer
// acquisition/repositioning, model metadata getters and KV-cache utilities.
// The state is held behind a pointer to the forward-declared ModelImpl.
// NOTE(review): every line of this listing starts with a line number taken from
// the documentation page; numbered lines with no text presumably mark content the
// page did not render — confirm against the real header.
40class QBRUNTIME_EXPORT Model {
41public:
// Creates a Model object from the specified MXQ model file.
// The status of the operation is reported through `sc`.
56 static std::unique_ptr<Model> create(const std::string& mxq_path, StatusCode& sc);
57
// Creates a Model object from the specified MXQ model file and configuration.
73 static std::unique_ptr<Model> create(const std::string& mxq_path,
74 const ModelConfig& config, StatusCode& sc);
75
// Copy construction/assignment are deleted; move construction/assignment are
// declared noexcept, so a Model can be moved but not copied.
76 Model(const Model& other) = delete;
77 Model(Model&& other) noexcept;
78 Model& operator=(const Model& rhs) = delete;
79 Model& operator=(Model&& rhs) noexcept;
80 ~Model();
81
// Launches the model on the specified Accelerator, which represents the actual NPU.
90 StatusCode launch(Accelerator& acc);
91
101
108
// Checks if the NPU core specified by `core_id` is the target of the model.
117 bool isTarget(CoreId core_id) const;
118
// Returns the NPU cores the model is configured to use.
124 std::vector<CoreId> getTargetCores() const;
125
143
// ---- infer: float input -> float output ----
// Performs inference; results are written to `output`.
154 StatusCode infer(const std::vector<NDArray<float>>& input,
155 std::vector<NDArray<float>>& output);
156
// Same as above, but directly returns the inference results; status goes to `sc`.
168 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
169 StatusCode& sc);
170
// Raw-pointer convenience overload; may result in additional data copies.
182 StatusCode infer(const std::vector<float*>& input,
183 std::vector<std::vector<float>>& output);
184
// Raw-pointer convenience overload returning the results; may add data copies.
199 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
200 StatusCode& sc);
201
// Raw-pointer convenience overload that additionally takes `shape`;
// may result in additional data copies.
218 StatusCode infer(const std::vector<float*>& input,
219 std::vector<std::vector<float>>& output,
220 const std::vector<std::vector<int64_t>>& shape);
// Returning variant of the `shape` overload above; may add data copies.
237 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
238 const std::vector<std::vector<int64_t>>& shape,
239 StatusCode& sc);
240
// This overload supports inference with KV cache (`cache_size`).
254 StatusCode infer(const std::vector<NDArray<float>>& input,
255 std::vector<NDArray<float>>& output, uint32_t cache_size);
256
// This overload supports inference with BatchParam for BatchLLM.
269 StatusCode infer(const std::vector<NDArray<float>>& input,
270 std::vector<NDArray<float>>& output,
271 const std::vector<BatchParam>& params);
272
// This overload supports inference with KV cache; returns the results.
289 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
290 uint32_t cache_size, StatusCode& sc);
291
// This overload supports inference with BatchParam for BatchLLM; returns the results.
307 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
308 const std::vector<BatchParam>& params,
309 StatusCode& sc);
310
// Raw-pointer overload supporting inference with KV cache.
326 StatusCode infer(const std::vector<float*>& input,
327 std::vector<std::vector<float>>& output,
328 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
329
// Raw-pointer overload supporting inference with KV cache; returns the results.
348 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
349 const std::vector<std::vector<int64_t>>& shape,
350 uint32_t cache_size, StatusCode& sc);
351
// Raw-pointer overload supporting inference with BatchParam for BatchLLM.
366 StatusCode infer(const std::vector<float*>& input,
367 std::vector<std::vector<float>>& output,
368 const std::vector<std::vector<int64_t>>& shape,
369 const std::vector<BatchParam>& params);
370
// Raw-pointer overload supporting BatchParam for BatchLLM; returns the results.
388 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
389 const std::vector<std::vector<int64_t>>& shape,
390 const std::vector<BatchParam>& params,
391 StatusCode& sc);
392
394
419
// ---- inferCHW: float input -> float output ----
// NOTE(review): presumably the channel-first (CHW/NCHW) counterpart of infer(),
// by analogy with inferAsyncCHW — confirm against the header documentation.
// Performs inference; results are written to `output`.
430 StatusCode inferCHW(const std::vector<NDArray<float>>& input,
431 std::vector<NDArray<float>>& output);
432
// Same as above, but directly returns the inference results.
444 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,
445 StatusCode& sc);
446
// Raw-pointer convenience overload; may result in additional data copies.
458 StatusCode inferCHW(const std::vector<float*>& input,
459 std::vector<std::vector<float>>& output);
460
// Raw-pointer convenience overload returning the results; may add data copies.
475 std::vector<std::vector<float>> inferCHW(const std::vector<float*>& input,
476 StatusCode& sc);
477
// Raw-pointer convenience overload that additionally takes `shape`;
// may result in additional data copies.
494 StatusCode inferCHW(const std::vector<float*>& input,
495 std::vector<std::vector<float>>& output,
496 const std::vector<std::vector<int64_t>>& shape);
497
// Returning variant of the `shape` overload above; may add data copies.
514 std::vector<std::vector<float>> inferCHW(
515 const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,
516 StatusCode& sc);
517
// This overload supports inference with KV cache (`cache_size`).
531 StatusCode inferCHW(const std::vector<NDArray<float>>& input,
532 std::vector<NDArray<float>>& output, uint32_t cache_size);
533
// This overload supports inference with KV cache; returns the results.
550 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,
551 uint32_t cache_size, StatusCode& sc);
552
// Raw-pointer overload supporting inference with KV cache.
568 StatusCode inferCHW(const std::vector<float*>& input,
569 std::vector<std::vector<float>>& output,
570 const std::vector<std::vector<int64_t>>& shape,
571 uint32_t cache_size);
572
// Raw-pointer overload supporting inference with KV cache; returns the results.
591 std::vector<std::vector<float>> inferCHW(
592 const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,
593 uint32_t cache_size, StatusCode& sc);
594
596
608
// ---- infer: uint8_t input -> float output ----
// These mirror the float-input infer() overload set above (plain, `shape`,
// `cache_size` and BatchParam variants) with uint8_t input element type.
609 StatusCode infer(const std::vector<NDArray<uint8_t>>& input,
610 std::vector<NDArray<float>>& output);
611 std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,
612 StatusCode& sc);
613 StatusCode infer(const std::vector<uint8_t*>& input,
614 std::vector<std::vector<float>>& output);
615 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
616 StatusCode& sc);
617 StatusCode infer(const std::vector<uint8_t*>& input,
618 std::vector<std::vector<float>>& output,
619 const std::vector<std::vector<int64_t>>& shape);
620 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
621 const std::vector<std::vector<int64_t>>& shape,
622 StatusCode& sc);
623
// uint8_t input, `cache_size` variants.
624 StatusCode infer(const std::vector<NDArray<uint8_t>>& input,
625 std::vector<NDArray<float>>& output, uint32_t cache_size);
626 std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,
627 uint32_t cache_size, StatusCode& sc);
628 StatusCode infer(const std::vector<uint8_t*>& input,
629 std::vector<std::vector<float>>& output,
630 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
631 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
632 const std::vector<std::vector<int64_t>>& shape,
633 uint32_t cache_size, StatusCode& sc);
634
// uint8_t input, BatchParam variants.
635 StatusCode infer(const std::vector<NDArray<uint8_t>>& input,
636 std::vector<NDArray<float>>& output,
637 const std::vector<BatchParam>& params);
638 std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,
639 const std::vector<BatchParam>& params,
640 StatusCode& sc);
641 StatusCode infer(const std::vector<uint8_t*>& input,
642 std::vector<std::vector<float>>& output,
643 const std::vector<std::vector<int64_t>>& shape,
644 const std::vector<BatchParam>& params);
645 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
646 const std::vector<std::vector<int64_t>>& shape,
647 const std::vector<BatchParam>& params,
648 StatusCode& sc);
649
651
// ---- inferCHW: uint8_t input -> float output ----
// Plain and `shape` variants first, then the `cache_size` variants.
660 StatusCode inferCHW(const std::vector<NDArray<uint8_t>>& input,
661 std::vector<NDArray<float>>& output);
662 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<uint8_t>>& input,
663 StatusCode& sc);
664 StatusCode inferCHW(const std::vector<uint8_t*>& input,
665 std::vector<std::vector<float>>& output);
666 std::vector<std::vector<float>> inferCHW(const std::vector<uint8_t*>& input,
667 StatusCode& sc);
668 StatusCode inferCHW(const std::vector<uint8_t*>& input,
669 std::vector<std::vector<float>>& output,
670 const std::vector<std::vector<int64_t>>& shape);
671 std::vector<std::vector<float>> inferCHW(
672 const std::vector<uint8_t*>& input,
673 const std::vector<std::vector<int64_t>>& shape, StatusCode& sc);
674
675 StatusCode inferCHW(const std::vector<NDArray<uint8_t>>& input,
676 std::vector<NDArray<float>>& output, uint32_t cache_size);
677 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<uint8_t>>& input,
678 uint32_t cache_size, StatusCode& sc);
679 StatusCode inferCHW(const std::vector<uint8_t*>& input,
680 std::vector<std::vector<float>>& output,
681 const std::vector<std::vector<int64_t>>& shape,
682 uint32_t cache_size);
683 std::vector<std::vector<float>> inferCHW(
684 const std::vector<uint8_t*>& input,
685 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size,
686 StatusCode& sc);
688
705
// ---- infer: int8_t input -> int8_t output ----
// Same overload shapes as the float set (plain, `shape`, `cache_size`,
// BatchParam), with int8_t element type on both input and output.
706 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
707 std::vector<NDArray<int8_t>>& output);
708 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
709 StatusCode& sc);
710 StatusCode infer(const std::vector<int8_t*>& input,
711 std::vector<std::vector<int8_t>>& output);
712 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
713 StatusCode& sc);
714 StatusCode infer(const std::vector<int8_t*>& input,
715 std::vector<std::vector<int8_t>>& output,
716 const std::vector<std::vector<int64_t>>& shape);
717 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
718 const std::vector<std::vector<int64_t>>& shape,
719 StatusCode& sc);
720
// int8_t, `cache_size` variants.
721 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
722 std::vector<NDArray<int8_t>>& output, uint32_t cache_size);
723 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
724 uint32_t cache_size, StatusCode& sc);
725 StatusCode infer(const std::vector<int8_t*>& input,
726 std::vector<std::vector<int8_t>>& output,
727 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
728 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
729 const std::vector<std::vector<int64_t>>& shape,
730 uint32_t cache_size, StatusCode& sc);
731
// int8_t, BatchParam variants.
732 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
733 std::vector<NDArray<int8_t>>& output,
734 const std::vector<BatchParam>& params);
735 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
736 const std::vector<BatchParam>& params,
737 StatusCode& sc);
738 StatusCode infer(const std::vector<int8_t*>& input,
739 std::vector<std::vector<int8_t>>& output,
740 const std::vector<std::vector<int64_t>>& shape,
741 const std::vector<BatchParam>& params);
742 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
743 const std::vector<std::vector<int64_t>>& shape,
744 const std::vector<BatchParam>& params,
745 StatusCode& sc);
746
748
// ---- inferCHW: int8_t input -> int8_t output ----
// Plain and `shape` variants first, then the `cache_size` variants.
761 StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,
762 std::vector<NDArray<int8_t>>& output);
763 std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,
764 StatusCode& sc);
765 StatusCode inferCHW(const std::vector<int8_t*>& input,
766 std::vector<std::vector<int8_t>>& output);
767 std::vector<std::vector<int8_t>> inferCHW(const std::vector<int8_t*>& input,
768 StatusCode& sc);
769 StatusCode inferCHW(const std::vector<int8_t*>& input,
770 std::vector<std::vector<int8_t>>& output,
771 const std::vector<std::vector<int64_t>>& shape);
772 std::vector<std::vector<int8_t>> inferCHW(
773 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
774 StatusCode& sc);
775
776 StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,
777 std::vector<NDArray<int8_t>>& output, uint32_t cache_size);
778 std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,
779 uint32_t cache_size, StatusCode& sc);
780 StatusCode inferCHW(const std::vector<int8_t*>& input,
781 std::vector<std::vector<int8_t>>& output,
782 const std::vector<std::vector<int64_t>>& shape,
783 uint32_t cache_size);
784 std::vector<std::vector<int8_t>> inferCHW(
785 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
786 uint32_t cache_size, StatusCode& sc);
788
// ---- inferToFloat: int8_t input -> float output ----
// Only result-returning overloads are declared (status reported through `sc`).
801 std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,
802 StatusCode& sc);
803 std::vector<std::vector<float>> inferToFloat(const std::vector<int8_t*>& input,
804 StatusCode& sc);
805 std::vector<std::vector<float>> inferToFloat(
806 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
807 StatusCode& sc);
808
// inferToFloat, `cache_size` variants.
809 std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,
810 uint32_t cache_size, StatusCode& sc);
811 std::vector<std::vector<float>> inferToFloat(
812 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
813 uint32_t cache_size, StatusCode& sc);
815
// ---- inferCHWToFloat: int8_t input -> float output, CHW entry points ----
828 std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,
829 StatusCode& sc);
830 std::vector<std::vector<float>> inferCHWToFloat(const std::vector<int8_t*>& input,
831 StatusCode& sc);
832 std::vector<std::vector<float>> inferCHWToFloat(
833 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
834 StatusCode& sc);
835
// inferCHWToFloat, `cache_size` variants.
836 std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,
837 uint32_t cache_size, StatusCode& sc);
838 std::vector<std::vector<float>> inferCHWToFloat(
839 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
840 uint32_t cache_size, StatusCode& sc);
842
// ---- inferBuffer: Buffer input -> Buffer output ----
// Declared for a flat vector of Buffer and for a vector of Buffer vectors.
// `shape` defaults to empty and `cache_size` to 0.
// NOTE(review): the nested-vector form is presumably the multi-batch variant,
// as with acquireInputBuffers()/repositionInputs() below — confirm.
865 StatusCode inferBuffer(const std::vector<Buffer>& input, std::vector<Buffer>& output,
866 const std::vector<std::vector<int64_t>>& shape = {},
867 uint32_t cache_size = 0);
868 StatusCode inferBuffer(const std::vector<std::vector<Buffer>>& input,
869 std::vector<std::vector<Buffer>>& output,
870 const std::vector<std::vector<int64_t>>& shape = {},
871 uint32_t cache_size = 0);
873
// ---- inferBufferToFloat: Buffer input -> float output ----
// Four overloads: {flat, nested} Buffer input x {NDArray<float>, vector<float>}
// output. `shape` defaults to empty and `cache_size` to 0.
891 StatusCode inferBufferToFloat(const std::vector<Buffer>& input,
892 std::vector<NDArray<float>>& output,
893 const std::vector<std::vector<int64_t>>& shape = {},
894 uint32_t cache_size = 0);
895 StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,
896 std::vector<NDArray<float>>& output,
897 const std::vector<std::vector<int64_t>>& shape = {},
898 uint32_t cache_size = 0);
899 StatusCode inferBufferToFloat(const std::vector<Buffer>& input,
900 std::vector<std::vector<float>>& output,
901 const std::vector<std::vector<int64_t>>& shape = {},
902 uint32_t cache_size = 0);
903 StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,
904 std::vector<std::vector<float>>& output,
905 const std::vector<std::vector<int64_t>>& shape = {},
906 uint32_t cache_size = 0);
908
// Development-only API for measuring pure NPU inference speed.
// `variant_idx` defaults to 0.
917 StatusCode inferSpeedrun(int variant_idx = 0);
918
973
// ---- Asynchronous inference; each call returns a Future ----
// Initiates asynchronous inference with input in NHWC (batch N, height H,
// width W, channels C) or HWC layout.
985 Future<float> inferAsync(const std::vector<NDArray<float>>& input, StatusCode& sc);
986
// Initiates asynchronous inference with input in NCHW (batch N, channels C,
// height H, width W) layout.
998 Future<float> inferAsyncCHW(const std::vector<NDArray<float>>& input, StatusCode& sc);
999
// This overload supports int8_t-to-int8_t asynchronous inference.
1010 Future<int8_t> inferAsync(const std::vector<NDArray<int8_t>>& input, StatusCode& sc);
1011
// NOTE(review): the first line of this declaration is missing from the listing.
// Presumably `Future<int8_t> inferAsyncCHW(const std::vector<NDArray<int8_t>>& input,`
// (int8_t-to-int8_t asynchronous inference) — confirm against the header.
1023 StatusCode& sc);
1024
// NOTE(review): first line missing from the listing. Presumably
// `Future<float> inferAsyncToFloat(const std::vector<NDArray<int8_t>>& input,`
// (int8_t-to-float asynchronous inference) — confirm against the header.
1036 StatusCode& sc);
1037
// NOTE(review): first line missing from the listing. Presumably
// `Future<float> inferAsyncCHWToFloat(const std::vector<NDArray<int8_t>>& input,`
// (int8_t-to-float asynchronous inference) — confirm against the header.
1049 StatusCode& sc);
1050
// This overload supports uint8_t-to-float asynchronous inference.
1061 Future<float> inferAsync(const std::vector<NDArray<uint8_t>>& input, StatusCode& sc);
1062
// NOTE(review): first line missing from the listing. Presumably
// `Future<float> inferAsyncCHW(const std::vector<NDArray<uint8_t>>& input,`
// (uint8_t-to-float asynchronous inference) — confirm against the header.
1074 StatusCode& sc);
1075
1077
1099
// ---- Buffer management ----
// The singular acquire* functions return one vector of Buffer; the plural forms
// take `batch_size` and return one vector of Buffer per batch entry.
// `seqlens` defaults to empty in all of them.
1100 // Acquire buffer
1101 std::vector<Buffer> acquireInputBuffer(
1102 const std::vector<std::vector<int>>& seqlens = {}) const;
1103 std::vector<Buffer> acquireOutputBuffer(
1104 const std::vector<std::vector<int>>& seqlens = {}) const;
1105 std::vector<std::vector<Buffer>> acquireInputBuffers(
1106 const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;
1107 std::vector<std::vector<Buffer>> acquireOutputBuffers(
1108 const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;
1109
// Buffers obtained from the acquire* functions above are handed back here:
// releaseBuffer for the flat form, releaseBuffers for the nested form.
1110 // Deallocate acquired Input/Output buffer
1111 StatusCode releaseBuffer(std::vector<Buffer>& buffer) const;
1112 StatusCode releaseBuffers(std::vector<std::vector<Buffer>>& buffers) const;
1113
// repositionInputs writes into `input_buf` from user pointers (float or uint8_t);
// repositionOutputs fills `output` (raw pointers or vectors of float) from
// `output_buf`.
// NOTE(review): the exact layout transformation performed is not stated in this
// listing — confirm against the header documentation.
1114 // Reposition single batch
1115 StatusCode repositionInputs(const std::vector<float*>& input,
1116 std::vector<Buffer>& input_buf,
1117 const std::vector<std::vector<int>>& seqlens = {}) const;
1118 StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,
1119 std::vector<float*>& output,
1120 const std::vector<std::vector<int>>& seqlens = {}) const;
1121 StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,
1122 std::vector<std::vector<float>>& output,
1123 const std::vector<std::vector<int>>& seqlens = {}) const;
1124 StatusCode repositionInputs(const std::vector<uint8_t*>& input,
1125 std::vector<Buffer>& input_buf,
1126 const std::vector<std::vector<int>>& seqlens = {}) const;
1127
// Same four operations on nested (per-batch) Buffer vectors.
1128 // Reposition multiple batches
1129 StatusCode repositionInputs(const std::vector<float*>& input,
1130 std::vector<std::vector<Buffer>>& input_buf,
1131 const std::vector<std::vector<int>>& seqlens = {}) const;
1132 StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,
1133 std::vector<float*>& output,
1134 const std::vector<std::vector<int>>& seqlens = {}) const;
1135 StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,
1136 std::vector<std::vector<float>>& output,
1137 const std::vector<std::vector<int>>& seqlens = {}) const;
1138 StatusCode repositionInputs(const std::vector<uint8_t*>& input,
1139 std::vector<std::vector<Buffer>>& input_buf,
1140 const std::vector<std::vector<int>>& seqlens = {}) const;
1142
1152
// ---- Model metadata ----
// Retrieves a handle to the specified model variant.
1166 std::unique_ptr<ModelVariantHandle> getModelVariantHandle(int variant_idx,
1167 StatusCode& sc) const;
1168
// Returns the input shape of the model (by const reference).
1174 const std::vector<std::vector<int64_t>>& getModelInputShape() const;
1175
// Returns the output shape of the model (by const reference).
1181 const std::vector<std::vector<int64_t>>& getModelOutputShape() const;
1182
// Returns the input buffer information for the model.
1188 const std::vector<BufferInfo>& getInputBufferInfo() const;
1189
// Returns the output buffer information of the model.
1195 const std::vector<BufferInfo>& getOutputBufferInfo() const;
1196
// Returns the input quantization scale(s) of the model.
1202 std::vector<Scale> getInputScale() const;
1203
// Returns the output quantization scale(s) of the model.
1209 std::vector<Scale> getOutputScale() const;
1210
1217
1224
// Returns the model's unique identifier.
1233 uint32_t getIdentifier() const;
1234
// Returns the path to the MXQ model file associated with the Model.
1240 std::string getModelPath() const;
1241
// Returns information about the model's KV cache.
1247 std::vector<CacheInfo> getCacheInfos() const;
1248
1255
// ---- KV-cache utilities ----
// Dumps the KV cache memory into `bufs`; `cache_id` defaults to 0.
1267 StatusCode dumpCacheMemory(std::vector<std::vector<int8_t>>& bufs, int cache_id = 0);
1268
// Dumps the KV cache memory into buffers and returns them; status goes to `sc`.
1279 std::vector<std::vector<int8_t>> dumpCacheMemory(StatusCode& sc);
1280 std::vector<std::vector<int8_t>> dumpCacheMemory(int cache_id, StatusCode& sc);
1281
// Dumps KV cache memory to files in the specified directory.
1293 StatusCode dumpCacheMemory(const std::string& cache_dir, int cache_id = 0);
1294
// Loads the KV cache memory from buffers.
1306 StatusCode loadCacheMemory(const std::vector<std::vector<int8_t>>& bufs,
1307 int cache_id = 0);
1308
// Loads the KV cache memory from files in the specified directory.
1320 StatusCode loadCacheMemory(const std::string& cache_dir, int cache_id = 0);
1321
// Filters the tail of the KV cache memory.
// NOTE(review): the meaning of `mask` and of the int return value is not stated
// in this listing — confirm against the header documentation.
1335 int filterCacheTail(int cache_size, int tail_size, const std::vector<bool>& mask,
1336 StatusCode& sc);
1337
// Moves the tail of the KV cache memory to the end of the head.
// NOTE(review): the meaning of the int return value is not stated here — confirm.
1351 int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode& sc);
1352
1354
1361
// Raw-pointer infer() overloads taking an explicit `batch_size`.
// NOTE(review): no description for these appears in this listing — confirm the
// intended use and status against the header.
1365 StatusCode infer(const std::vector<float*>& input,
1366 std::vector<std::vector<float>>& output, int batch_size);
1367
1371 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
1372 int batch_size, StatusCode& sc);
1373
// NOTE(review): presumably latency counters for the NPU operation selected by
// `npu_op_idx`; units and exact meaning are not stated in this listing — confirm.
1377 uint64_t getLatencyConsumed(const int npu_op_idx) const;
1378
1382 uint64_t getLatencyFinished(const int npu_op_idx) const;
1383
1385
1386private:
// Default construction is private; the public way to obtain a Model is the
// static create() functions above.
1387 Model();
1388
// Owning pointer to the forward-declared ModelImpl. It is wrapped in
// std::experimental::propagate_const everywhere except under MSVC (_MSC_VER),
// which falls back to a plain unique_ptr; this matches the conditional
// #include of <experimental/propagate_const> at the top of the file.
1389#ifndef _MSC_VER
1390 std::experimental::propagate_const<std::unique_ptr<ModelImpl>> mImpl;
1391#else
1392 std::unique_ptr<ModelImpl> mImpl;
1393#endif
1394
// Accelerator is granted access to Model's private members.
1395 friend class Accelerator;
1396};
1397
1399
1400} // namespace mobilint
1401
1402#endif
Represents an accelerator, i.e., an NPU, used for executing models.
Definition acc.h:33
Represents a future for retrieving the result of asynchronous inference.
Definition future.h:43
Configures a core mode and core allocation of a model for NPU inference.
Definition type.h:235
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, const std::vector< BatchParam > &params)
This overload supports inference with BatchParam for BatchLLM.
std::string getModelPath() const
Returns the path to the MXQ model file associated with the Model.
Future< float > inferAsyncToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.
uint64_t getLatencyConsumed(const int npu_op_idx) const
std::vector< std::vector< float > > infer(const std::vector< float * > &input, int batch_size, StatusCode &sc)
bool isTarget(CoreId core_id) const
Checks if the NPU core specified by CoreId is the target of the model. In other words,...
static std::unique_ptr< Model > create(const std::string &mxq_path, StatusCode &sc)
Creates a Model object from the specified MXQ model file.
Future< float > inferAsyncCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NCHW (batch N, channels C, height H,...
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...
Future< int8_t > inferAsync(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, const std::vector< BatchParam > &params, StatusCode &sc)
This overload supports inference with BatchParam for BatchLLM.
Future< float > inferAsyncCHWToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, const std::vector< BatchParam > &params)
This overload supports inference with BatchParam for BatchLLM.
std::vector< Scale > getOutputScale() const
Returns the output quantization scale(s) of the model.
StatusCode loadCacheMemory(const std::vector< std::vector< int8_t > > &bufs, int cache_id=0)
Loads the KV cache memory from buffers.
StatusCode dumpCacheMemory(std::vector< std::vector< int8_t > > &bufs, int cache_id=0)
Dumps the KV cache memory into buffers.
const std::vector< std::vector< int64_t > > & getModelOutputShape() const
Returns the output shape of the model.
std::unique_ptr< ModelVariantHandle > getModelVariantHandle(int variant_idx, StatusCode &sc) const
Retrieves a handle to the specified model variant.
StatusCode loadCacheMemory(const std::string &cache_dir, int cache_id=0)
Loads the KV cache memory from files in the specified directory.
StatusCode launch(Accelerator &acc)
Launches the model on the specified Accelerator, which represents the actual NPU.
Future< float > inferAsync(const std::vector< NDArray< uint8_t > > &input, StatusCode &sc)
This overload supports uint8_t-to-float asynchronous inference.
std::vector< CoreId > getTargetCores() const
Returns the NPU cores the model is configured to use.
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
const std::vector< BufferInfo > & getInputBufferInfo() const
Returns the input buffer information for the model.
int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode &sc)
Moves the tail of the KV cache memory to the end of the head.
DataType getModelInputDataType() const
Returns a data type for model inputs.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.
CoreMode getCoreMode() const
Retrieves the core mode of the model.
int getNumModelVariants() const
Returns the total number of model variants available in this model.
Future< float > inferAsync(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NHWC (batch N, height H, width W, channels C) or HWC f...
uint32_t getIdentifier() const
Returns the model's unique identifier.
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
DataType getModelOutputDataType() const
Returns a data type for model outputs.
StatusCode dispose()
Disposes of the model loaded onto the NPU.
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.
static std::unique_ptr< Model > create(const std::string &mxq_path, const ModelConfig &config, StatusCode &sc)
Creates a Model object from the specified MXQ model file and configuration.
int filterCacheTail(int cache_size, int tail_size, const std::vector< bool > &mask, StatusCode &sc)
Filters the tail of the KV cache memory.
const std::vector< std::vector< int64_t > > & getModelInputShape() const
Returns the input shape of the model.
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
Future< int8_t > inferAsyncCHW(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.
StatusCode dumpCacheMemory(const std::string &cache_dir, int cache_id=0)
Dumps KV cache memory to files in the specified directory.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
const std::vector< BufferInfo > & getOutputBufferInfo() const
Returns the output buffer information of the model.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.
std::vector< CacheInfo > getCacheInfos() const
Returns information about the KV cache of the model.
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.
Future< float > inferAsyncCHW(const std::vector< NDArray< uint8_t > > &input, StatusCode &sc)
This overload supports uint8_t-to-float asynchronous inference.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, int batch_size)
std::vector< std::vector< int8_t > > dumpCacheMemory(StatusCode &sc)
Dumps the KV cache memory into buffers.
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
std::vector< Scale > getInputScale() const
Returns the input quantization scale(s) of the model.
StatusCode inferSpeedrun(int variant_idx=0)
Development-only API for measuring pure NPU inference speed.
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
uint64_t getLatencyFinished(const int npu_op_idx) const
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.
std::vector< std::vector< int8_t > > dumpCacheMemory(int cache_id, StatusCode &sc)
Dumps the KV cache memory into buffers.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, const std::vector< BatchParam > &params, StatusCode &sc)
This overload supports inference with BatchParam for BatchLLM.
A class representing an N-dimensional array (NDArray).
Definition ndarray.h:77
DataType
DataType.
Definition type.h:508
CoreMode
Defines the core mode for NPU execution.
Definition type.h:170
StatusCode
Enumerates status codes for the qbruntime.
Definition status_code.h:26
Represents a unique identifier for an NPU core.
Definition type.h:123