model.h Source File

model.h Source File

SDK qb Runtime Library: model.h Source File
SDK qb Runtime Library v1.0
MCS001-
model.h
Go to the documentation of this file.
1// Copyright ⓒ 2019- Mobilint Inc. All rights reserved.
5
6#ifndef QBRUNTIME_MODEL_H_
7#define QBRUNTIME_MODEL_H_
8
9#include <cstdint>
10#ifndef _MSC_VER
11#include <experimental/propagate_const>
12#endif
13#include <memory>
14#include <string>
15#include <vector>
16
17#include "qbruntime/export.h"
18#include "qbruntime/future.h"
20#include "qbruntime/ndarray.h"
22#include "qbruntime/type.h"
23
24namespace mobilint {
25
30
31class Accelerator;
32class ModelImpl;
33
// Declaration of mobilint::Model as shown in the documentation's source listing.
// Each code line keeps the line number of the original header as a prefix; the
// numbering has gaps where the listing omits lines.
//
// The class is move-only (copying is deleted), is constructed through the static
// create() factories (the default constructor is private), and keeps its state
// behind a std::unique_ptr to the forward-declared ModelImpl.
//
// The one-line descriptions below are taken from the member index of this same
// documentation page. Overloads that take `StatusCode& sc` presumably report
// their status through that out-parameter -- TODO confirm.
40class QBRUNTIME_EXPORT Model {
41public:
// Creates a Model object from the specified MXQ model file.
56 static std::unique_ptr<Model> create(const std::string& mxq_path, StatusCode& sc);
57
// Creates a Model object from the specified MXQ model file and configuration.
73 static std::unique_ptr<Model> create(const std::string& mxq_path,
74 const ModelConfig& config, StatusCode& sc);
75
// Copy construction/assignment are deleted; move construction/assignment are
// declared noexcept.
76 Model(const Model& other) = delete;
77 Model(Model&& other) noexcept;
78 Model& operator=(const Model& rhs) = delete;
79 Model& operator=(Model&& rhs) noexcept;
80 ~Model();
81
// Launches the model on the specified Accelerator, which represents the actual NPU.
90 StatusCode launch(Accelerator& acc);
91
// NOTE(review): lines 101 and 108 carry no code in this listing. The member
// index of this page lists `StatusCode dispose()` ("Disposes of the model loaded
// onto the NPU.") and `CoreMode getCoreMode() const` ("Retrieves the core mode
// of the model."), neither of which appears anywhere in the listing; they may
// belong here -- confirm against the real header.
101
108
// Checks if the NPU core specified by CoreId is the target of the model.
117 bool isTarget(CoreId core_id) const;
118
// Returns the NPU cores the model is configured to use.
124 std::vector<CoreId> getTargetCores() const;
125
143
// ---- infer: float input -> float output ----
// Performs inference.
154 StatusCode infer(const std::vector<NDArray<float>>& input,
155 std::vector<NDArray<float>>& output);
156
// Overload that directly returns the inference results.
168 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
169 StatusCode& sc);
170
// Convenience overload taking raw float pointers; per the member index it may
// result in additional data copies inside the runtime.
182 StatusCode infer(const std::vector<float*>& input,
183 std::vector<std::vector<float>>& output);
184
// Convenience overload (may result in additional data copies); returns the results.
199 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
200 StatusCode& sc);
201
// Convenience overload (may result in additional data copies) that also takes
// `shape`.
218 StatusCode infer(const std::vector<float*>& input,
219 std::vector<std::vector<float>>& output,
220 const std::vector<std::vector<int64_t>>& shape);
// Convenience overload (may result in additional data copies) with `shape`;
// returns the results.
237 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
238 const std::vector<std::vector<int64_t>>& shape,
239 StatusCode& sc);
240
// The four overloads below take `cache_size` and support inference with KV cache.
254 StatusCode infer(const std::vector<NDArray<float>>& input,
255 std::vector<NDArray<float>>& output, uint32_t cache_size);
256
273 std::vector<NDArray<float>> infer(const std::vector<NDArray<float>>& input,
274 uint32_t cache_size, StatusCode& sc);
275
291 StatusCode infer(const std::vector<float*>& input,
292 std::vector<std::vector<float>>& output,
293 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
294
313 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
314 const std::vector<std::vector<int64_t>>& shape,
315 uint32_t cache_size, StatusCode& sc);
316
318
343
// ---- inferCHW: float input -> float output ----
// The overload set mirrors infer() above one-to-one.
// Performs inference.
354 StatusCode inferCHW(const std::vector<NDArray<float>>& input,
355 std::vector<NDArray<float>>& output);
356
// Overload that directly returns the inference results.
368 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,
369 StatusCode& sc);
370
// Convenience overload taking raw float pointers; may result in additional data
// copies inside the runtime.
382 StatusCode inferCHW(const std::vector<float*>& input,
383 std::vector<std::vector<float>>& output);
384
// Convenience overload (may result in additional data copies); returns the results.
399 std::vector<std::vector<float>> inferCHW(const std::vector<float*>& input,
400 StatusCode& sc);
401
// Convenience overload (may result in additional data copies) that also takes
// `shape`.
418 StatusCode inferCHW(const std::vector<float*>& input,
419 std::vector<std::vector<float>>& output,
420 const std::vector<std::vector<int64_t>>& shape);
421
// Convenience overload (may result in additional data copies) with `shape`;
// returns the results.
438 std::vector<std::vector<float>> inferCHW(
439 const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,
440 StatusCode& sc);
441
// The four overloads below take `cache_size` and support inference with KV cache.
455 StatusCode inferCHW(const std::vector<NDArray<float>>& input,
456 std::vector<NDArray<float>>& output, uint32_t cache_size);
457
474 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<float>>& input,
475 uint32_t cache_size, StatusCode& sc);
476
492 StatusCode inferCHW(const std::vector<float*>& input,
493 std::vector<std::vector<float>>& output,
494 const std::vector<std::vector<int64_t>>& shape,
495 uint32_t cache_size);
496
515 std::vector<std::vector<float>> inferCHW(
516 const std::vector<float*>& input, const std::vector<std::vector<int64_t>>& shape,
517 uint32_t cache_size, StatusCode& sc);
518
520
529
// ---- infer: uint8_t input -> float output ----
// Same parameter patterns as the float overloads: NDArray or raw pointers,
// optional `shape`, status-returning or result-returning.
530 StatusCode infer(const std::vector<NDArray<uint8_t>>& input,
531 std::vector<NDArray<float>>& output);
532 std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,
533 StatusCode& sc);
534 StatusCode infer(const std::vector<uint8_t*>& input,
535 std::vector<std::vector<float>>& output);
536 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
537 StatusCode& sc);
538 StatusCode infer(const std::vector<uint8_t*>& input,
539 std::vector<std::vector<float>>& output,
540 const std::vector<std::vector<int64_t>>& shape);
541 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
542 const std::vector<std::vector<int64_t>>& shape,
543 StatusCode& sc);
544
// uint8_t-input variants that additionally take `cache_size`.
545 StatusCode infer(const std::vector<NDArray<uint8_t>>& input,
546 std::vector<NDArray<float>>& output, uint32_t cache_size);
547 std::vector<NDArray<float>> infer(const std::vector<NDArray<uint8_t>>& input,
548 uint32_t cache_size, StatusCode& sc);
549 StatusCode infer(const std::vector<uint8_t*>& input,
550 std::vector<std::vector<float>>& output,
551 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
552 std::vector<std::vector<float>> infer(const std::vector<uint8_t*>& input,
553 const std::vector<std::vector<int64_t>>& shape,
554 uint32_t cache_size, StatusCode& sc);
555
557
// ---- inferCHW: uint8_t input -> float output ----
566 StatusCode inferCHW(const std::vector<NDArray<uint8_t>>& input,
567 std::vector<NDArray<float>>& output);
568 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<uint8_t>>& input,
569 StatusCode& sc);
570 StatusCode inferCHW(const std::vector<uint8_t*>& input,
571 std::vector<std::vector<float>>& output);
572 std::vector<std::vector<float>> inferCHW(const std::vector<uint8_t*>& input,
573 StatusCode& sc);
574 StatusCode inferCHW(const std::vector<uint8_t*>& input,
575 std::vector<std::vector<float>>& output,
576 const std::vector<std::vector<int64_t>>& shape);
577 std::vector<std::vector<float>> inferCHW(
578 const std::vector<uint8_t*>& input,
579 const std::vector<std::vector<int64_t>>& shape, StatusCode& sc);
580
// uint8_t-input inferCHW variants that additionally take `cache_size`.
581 StatusCode inferCHW(const std::vector<NDArray<uint8_t>>& input,
582 std::vector<NDArray<float>>& output, uint32_t cache_size);
583 std::vector<NDArray<float>> inferCHW(const std::vector<NDArray<uint8_t>>& input,
584 uint32_t cache_size, StatusCode& sc);
585 StatusCode inferCHW(const std::vector<uint8_t*>& input,
586 std::vector<std::vector<float>>& output,
587 const std::vector<std::vector<int64_t>>& shape,
588 uint32_t cache_size);
589 std::vector<std::vector<float>> inferCHW(
590 const std::vector<uint8_t*>& input,
591 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size,
592 StatusCode& sc);
594
607
// ---- infer: int8_t input -> int8_t output ----
608 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
609 std::vector<NDArray<int8_t>>& output);
610 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
611 StatusCode& sc);
612 StatusCode infer(const std::vector<int8_t*>& input,
613 std::vector<std::vector<int8_t>>& output);
614 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
615 StatusCode& sc);
616 StatusCode infer(const std::vector<int8_t*>& input,
617 std::vector<std::vector<int8_t>>& output,
618 const std::vector<std::vector<int64_t>>& shape);
619 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
620 const std::vector<std::vector<int64_t>>& shape,
621 StatusCode& sc);
622
// int8_t-to-int8_t variants that additionally take `cache_size`.
623 StatusCode infer(const std::vector<NDArray<int8_t>>& input,
624 std::vector<NDArray<int8_t>>& output, uint32_t cache_size);
625 std::vector<NDArray<int8_t>> infer(const std::vector<NDArray<int8_t>>& input,
626 uint32_t cache_size, StatusCode& sc);
627 StatusCode infer(const std::vector<int8_t*>& input,
628 std::vector<std::vector<int8_t>>& output,
629 const std::vector<std::vector<int64_t>>& shape, uint32_t cache_size);
630 std::vector<std::vector<int8_t>> infer(const std::vector<int8_t*>& input,
631 const std::vector<std::vector<int64_t>>& shape,
632 uint32_t cache_size, StatusCode& sc);
633
635
// ---- inferCHW: int8_t input -> int8_t output ----
648 StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,
649 std::vector<NDArray<int8_t>>& output);
650 std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,
651 StatusCode& sc);
652 StatusCode inferCHW(const std::vector<int8_t*>& input,
653 std::vector<std::vector<int8_t>>& output);
654 std::vector<std::vector<int8_t>> inferCHW(const std::vector<int8_t*>& input,
655 StatusCode& sc);
656 StatusCode inferCHW(const std::vector<int8_t*>& input,
657 std::vector<std::vector<int8_t>>& output,
658 const std::vector<std::vector<int64_t>>& shape);
659 std::vector<std::vector<int8_t>> inferCHW(
660 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
661 StatusCode& sc);
662
// int8_t-to-int8_t inferCHW variants that additionally take `cache_size`.
663 StatusCode inferCHW(const std::vector<NDArray<int8_t>>& input,
664 std::vector<NDArray<int8_t>>& output, uint32_t cache_size);
665 std::vector<NDArray<int8_t>> inferCHW(const std::vector<NDArray<int8_t>>& input,
666 uint32_t cache_size, StatusCode& sc);
667 StatusCode inferCHW(const std::vector<int8_t*>& input,
668 std::vector<std::vector<int8_t>>& output,
669 const std::vector<std::vector<int64_t>>& shape,
670 uint32_t cache_size);
671 std::vector<std::vector<int8_t>> inferCHW(
672 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
673 uint32_t cache_size, StatusCode& sc);
675
// ---- inferToFloat: int8_t input -> float output (result-returning only) ----
688 std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,
689 StatusCode& sc);
690 std::vector<std::vector<float>> inferToFloat(const std::vector<int8_t*>& input,
691 StatusCode& sc);
692 std::vector<std::vector<float>> inferToFloat(
693 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
694 StatusCode& sc);
695
// inferToFloat variants that additionally take `cache_size`.
696 std::vector<NDArray<float>> inferToFloat(const std::vector<NDArray<int8_t>>& input,
697 uint32_t cache_size, StatusCode& sc);
698 std::vector<std::vector<float>> inferToFloat(
699 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
700 uint32_t cache_size, StatusCode& sc);
702
// ---- inferCHWToFloat: int8_t input -> float output (result-returning only) ----
715 std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,
716 StatusCode& sc);
717 std::vector<std::vector<float>> inferCHWToFloat(const std::vector<int8_t*>& input,
718 StatusCode& sc);
719 std::vector<std::vector<float>> inferCHWToFloat(
720 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
721 StatusCode& sc);
722
// inferCHWToFloat variants that additionally take `cache_size`.
723 std::vector<NDArray<float>> inferCHWToFloat(const std::vector<NDArray<int8_t>>& input,
724 uint32_t cache_size, StatusCode& sc);
725 std::vector<std::vector<float>> inferCHWToFloat(
726 const std::vector<int8_t*>& input, const std::vector<std::vector<int64_t>>& shape,
727 uint32_t cache_size, StatusCode& sc);
729
// ---- inferBuffer: Buffer input -> Buffer output ----
// Overloads for one vector of Buffers and for a vector of Buffer vectors.
// `shape` defaults to an empty list and `cache_size` to 0.
752 StatusCode inferBuffer(const std::vector<Buffer>& input, std::vector<Buffer>& output,
753 const std::vector<std::vector<int64_t>>& shape = {},
754 uint32_t cache_size = 0);
755 StatusCode inferBuffer(const std::vector<std::vector<Buffer>>& input,
756 std::vector<std::vector<Buffer>>& output,
757 const std::vector<std::vector<int64_t>>& shape = {},
758 uint32_t cache_size = 0);
760
// ---- inferBufferToFloat: Buffer input -> float output ----
// Output is either NDArray<float> or std::vector<float>; same defaults as
// inferBuffer for `shape` and `cache_size`.
778 StatusCode inferBufferToFloat(const std::vector<Buffer>& input,
779 std::vector<NDArray<float>>& output,
780 const std::vector<std::vector<int64_t>>& shape = {},
781 uint32_t cache_size = 0);
782 StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,
783 std::vector<NDArray<float>>& output,
784 const std::vector<std::vector<int64_t>>& shape = {},
785 uint32_t cache_size = 0);
786 StatusCode inferBufferToFloat(const std::vector<Buffer>& input,
787 std::vector<std::vector<float>>& output,
788 const std::vector<std::vector<int64_t>>& shape = {},
789 uint32_t cache_size = 0);
790 StatusCode inferBufferToFloat(const std::vector<std::vector<Buffer>>& input,
791 std::vector<std::vector<float>>& output,
792 const std::vector<std::vector<int64_t>>& shape = {},
793 uint32_t cache_size = 0);
795
// Development-only API for measuring pure NPU inference speed.
804 StatusCode inferSpeedrun(int variant_idx = 0);
805
860
// ---- Asynchronous inference (results are retrieved through a Future) ----
// Initiates asynchronous inference with input in NHWC (batch N, height H,
// width W, channels C) or HWC form (description truncated in the member index).
872 Future<float> inferAsync(const std::vector<NDArray<float>>& input, StatusCode& sc);
873
// Initiates asynchronous inference with input in NCHW (batch N, channels C,
// height H, ...) form (description truncated in the member index).
885 Future<float> inferAsyncCHW(const std::vector<NDArray<float>>& input, StatusCode& sc);
886
// This overload supports int8_t-to-int8_t asynchronous inference.
897 Future<int8_t> inferAsync(const std::vector<NDArray<int8_t>>& input, StatusCode& sc);
898
// NOTE(review): the first line of this declaration (header line 909) is missing
// from the listing. Presumably it is
// `Future<int8_t> inferAsyncCHW(const std::vector<NDArray<int8_t>>& input,`
// as listed in the member index -- confirm against the real header.
910 StatusCode& sc);
911
// NOTE(review): first line (header line 922) missing from the listing.
// Presumably `Future<float> inferAsyncToFloat(const std::vector<NDArray<int8_t>>& input,`
// (int8_t-to-float asynchronous inference per the member index) -- confirm.
923 StatusCode& sc);
924
// NOTE(review): first line (header line 935) missing from the listing.
// Presumably `Future<float> inferAsyncCHWToFloat(const std::vector<NDArray<int8_t>>& input,`
// (int8_t-to-float asynchronous inference per the member index) -- confirm.
936 StatusCode& sc);
937
// This overload supports uint8_t-to-float asynchronous inference.
948 Future<float> inferAsync(const std::vector<NDArray<uint8_t>>& input, StatusCode& sc);
949
// NOTE(review): first line (header line 960) missing from the listing.
// Presumably `Future<float> inferAsyncCHW(const std::vector<NDArray<uint8_t>>& input,`
// (uint8_t-to-float asynchronous inference per the member index) -- confirm.
961 StatusCode& sc);
962
964
986
// ---- Buffer management ----
// The plural forms take a `batch_size` and return one Buffer vector per batch
// entry (presumably -- confirm); `seqlens` defaults to an empty list.
987 // Acquire buffer
988 std::vector<Buffer> acquireInputBuffer(
989 const std::vector<std::vector<int>>& seqlens = {}) const;
990 std::vector<Buffer> acquireOutputBuffer(
991 const std::vector<std::vector<int>>& seqlens = {}) const;
992 std::vector<std::vector<Buffer>> acquireInputBuffers(
993 const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;
994 std::vector<std::vector<Buffer>> acquireOutputBuffers(
995 const int batch_size, const std::vector<std::vector<int>>& seqlens = {}) const;
996
997 // Deallocate acquired Input/Output buffer
998 StatusCode releaseBuffer(std::vector<Buffer>& buffer) const;
999 StatusCode releaseBuffers(std::vector<std::vector<Buffer>>& buffers) const;
1000
// repositionInputs writes into Buffers from raw float/uint8_t pointers;
// repositionOutputs reads from Buffers into raw float pointers or float vectors.
1001 // Reposition single batch
1002 StatusCode repositionInputs(const std::vector<float*>& input,
1003 std::vector<Buffer>& input_buf,
1004 const std::vector<std::vector<int>>& seqlens = {}) const;
1005 StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,
1006 std::vector<float*>& output,
1007 const std::vector<std::vector<int>>& seqlens = {}) const;
1008 StatusCode repositionOutputs(const std::vector<Buffer>& output_buf,
1009 std::vector<std::vector<float>>& output,
1010 const std::vector<std::vector<int>>& seqlens = {}) const;
1011 StatusCode repositionInputs(const std::vector<uint8_t*>& input,
1012 std::vector<Buffer>& input_buf,
1013 const std::vector<std::vector<int>>& seqlens = {}) const;
1014
1015 // Reposition multiple batches
1016 StatusCode repositionInputs(const std::vector<float*>& input,
1017 std::vector<std::vector<Buffer>>& input_buf,
1018 const std::vector<std::vector<int>>& seqlens = {}) const;
1019 StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,
1020 std::vector<float*>& output,
1021 const std::vector<std::vector<int>>& seqlens = {}) const;
1022 StatusCode repositionOutputs(const std::vector<std::vector<Buffer>>& output_buf,
1023 std::vector<std::vector<float>>& output,
1024 const std::vector<std::vector<int>>& seqlens = {}) const;
1025 StatusCode repositionInputs(const std::vector<uint8_t*>& input,
1026 std::vector<std::vector<Buffer>>& input_buf,
1027 const std::vector<std::vector<int>>& seqlens = {}) const;
1029
// NOTE(review): line 1039 carries no code in this listing. The member index
// lists `int getNumModelVariants() const` ("Returns the total number of model
// variants available in this model."), which appears nowhere in the listing; it
// may belong here -- confirm against the real header.
1039
// ---- Model metadata ----
// Retrieves a handle to the specified model variant.
1053 std::unique_ptr<ModelVariantHandle> getModelVariantHandle(int variant_idx,
1054 StatusCode& sc) const;
1055
// Returns the input shape of the model.
1061 const std::vector<std::vector<int64_t>>& getModelInputShape() const;
1062
// Returns the output shape of the model.
1068 const std::vector<std::vector<int64_t>>& getModelOutputShape() const;
1069
// Returns the input buffer information for the model.
1075 const std::vector<BufferInfo>& getInputBufferInfo() const;
1076
// Returns the output buffer information of the model.
1082 const std::vector<BufferInfo>& getOutputBufferInfo() const;
1083
// Returns the input quantization scale(s) of the model.
1089 std::vector<Scale> getInputScale() const;
1090
// Returns the output quantization scale(s) of the model.
1096 std::vector<Scale> getOutputScale() const;
1097
// Returns the model's unique identifier.
1106 uint32_t getIdentifier() const;
1107
// Returns the path to the MXQ model file associated with the Model.
1113 std::string getModelPath() const;
1114
// Returns information about the KV cache of the model.
1120 std::vector<CacheInfo> getCacheInfos() const;
1121
1128
// ---- KV cache memory ----
// Dumps the KV cache memory into buffers.
1139 StatusCode dumpCacheMemory(std::vector<std::vector<int8_t>>& bufs);
1140
// Dumps the KV cache memory into buffers and returns them.
1150 std::vector<std::vector<int8_t>> dumpCacheMemory(StatusCode& sc);
1151
// Dumps KV cache memory to files in the specified directory.
1162 StatusCode dumpCacheMemory(const std::string& cache_dir);
1163
// Loads the KV cache memory from buffers.
1174 StatusCode loadCacheMemory(const std::vector<std::vector<int8_t>>& bufs);
1175
// Loads the KV cache memory from files in the specified directory.
1186 StatusCode loadCacheMemory(const std::string& cache_dir);
1187
// Filters the tail of the KV cache memory.
// NOTE(review): the meaning of the returned int is not shown here -- confirm.
1201 int filterCacheTail(int cache_size, int tail_size, const std::vector<bool>& mask,
1202 StatusCode& sc);
1203
// Moves the tail of the KV cache memory to the end of the head.
// NOTE(review): the meaning of the returned int is not shown here -- confirm.
1217 int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode& sc);
1218
1220
1227
// The four members below carry no description in the member index.
// infer() overloads that take an explicit `batch_size`.
1231 StatusCode infer(const std::vector<float*>& input,
1232 std::vector<std::vector<float>>& output, int batch_size);
1233
1237 std::vector<std::vector<float>> infer(const std::vector<float*>& input,
1238 int batch_size, StatusCode& sc);
1239
// Latency accessors keyed by `npu_op_idx`.
// NOTE(review): unit and exact meaning of the returned value are not shown -- confirm.
1243 uint64_t getLatencyConsumed(const int npu_op_idx) const;
1244
1248 uint64_t getLatencyFinished(const int npu_op_idx) const;
1249
1251
1252private:
// Private default constructor: outside code obtains instances via create().
1253 Model();
1254
// Pointer to the implementation object. Non-MSVC builds wrap it in
// std::experimental::propagate_const (header included under the same
// `#ifndef _MSC_VER` guard); MSVC builds use a plain std::unique_ptr.
1255#ifndef _MSC_VER
1256 std::experimental::propagate_const<std::unique_ptr<ModelImpl>> mImpl;
1257#else
1258 std::unique_ptr<ModelImpl> mImpl;
1259#endif
1260
// Accelerator is granted access to Model's private members.
1261 friend class Accelerator;
1262};
1263
1265
1266} // namespace mobilint
1267
1268#endif
Represents an accelerator, i.e., an NPU, used for executing models.
Definition acc.h:33
Represents a future for retrieving the result of asynchronous inference.
Definition future.h:43
Configures a core mode and core allocation of a model for NPU inference.
Definition type.h:233
std::string getModelPath() const
Returns the path to the MXQ model file associated with the Model.
Future< float > inferAsyncToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.
uint64_t getLatencyConsumed(const int npu_op_idx) const
std::vector< std::vector< float > > infer(const std::vector< float * > &input, int batch_size, StatusCode &sc)
bool isTarget(CoreId core_id) const
Checks if the NPU core specified by CoreId is the target of the model. In other words,...
static std::unique_ptr< Model > create(const std::string &mxq_path, StatusCode &sc)
Creates a Model object from the specified MXQ model file.
Future< float > inferAsyncCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NCHW (batch N, channels C, height H,...
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...
Future< int8_t > inferAsync(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.
StatusCode loadCacheMemory(const std::vector< std::vector< int8_t > > &bufs)
Loads the KV cache memory from buffers.
Future< float > inferAsyncCHWToFloat(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-float asynchronous inference.
std::vector< Scale > getOutputScale() const
Returns the output quantization scale(s) of the model.
StatusCode loadCacheMemory(const std::string &cache_dir)
Loads the KV cache memory from files in the specified directory.
StatusCode dumpCacheMemory(const std::string &cache_dir)
Dumps KV cache memory to files in the specified directory.
const std::vector< std::vector< int64_t > > & getModelOutputShape() const
Returns the output shape of the model.
std::unique_ptr< ModelVariantHandle > getModelVariantHandle(int variant_idx, StatusCode &sc) const
Retrieves a handle to the specified model variant.
StatusCode launch(Accelerator &acc)
Launches the model on the specified Accelerator, which represents the actual NPU.
Future< float > inferAsync(const std::vector< NDArray< uint8_t > > &input, StatusCode &sc)
This overload supports uint8_t-to-float asynchronous inference.
std::vector< CoreId > getTargetCores() const
Returns the NPU cores the model is configured to use.
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
const std::vector< BufferInfo > & getInputBufferInfo() const
Returns the input buffer information for the model.
int moveCacheTail(int num_head, int num_tail, int cache_size, StatusCode &sc)
Moves the tail of the KV cache memory to the end of the head.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.
CoreMode getCoreMode() const
Retrieves the core mode of the model.
int getNumModelVariants() const
Returns the total number of model variants available in this model.
Future< float > inferAsync(const std::vector< NDArray< float > > &input, StatusCode &sc)
Initiates asynchronous inference with input in NHWC (batch N, height H, width W, channels C) or HWC f...
uint32_t getIdentifier() const
Returns the model's unique identifier.
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, StatusCode &sc)
This overload differs from the above function in that it directly returns the inference results inste...
std::vector< NDArray< float > > infer(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
StatusCode dispose()
Disposes of the model loaded onto the NPU.
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.
static std::unique_ptr< Model > create(const std::string &mxq_path, const ModelConfig &config, StatusCode &sc)
Creates a Model object from the specified MXQ model file and configuration.
int filterCacheTail(int cache_size, int tail_size, const std::vector< bool > &mask, StatusCode &sc)
Filters the tail of the KV cache memory.
const std::vector< std::vector< int64_t > > & getModelInputShape() const
Returns the input shape of the model.
std::vector< NDArray< float > > inferCHW(const std::vector< NDArray< float > > &input, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
Future< int8_t > inferAsyncCHW(const std::vector< NDArray< int8_t > > &input, StatusCode &sc)
This overload supports int8_t-to-int8_t asynchronous inference.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
const std::vector< BufferInfo > & getOutputBufferInfo() const
Returns the output buffer information of the model.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.
std::vector< CacheInfo > getCacheInfos() const
Returns information about the KV cache of the model.
StatusCode infer(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output, uint32_t cache_size)
This overload supports inference with KV cache.
Future< float > inferAsyncCHW(const std::vector< NDArray< uint8_t > > &input, StatusCode &sc)
This overload supports uint8_t-to-float asynchronous inference.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
std::vector< std::vector< float > > inferCHW(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size, StatusCode &sc)
This overload supports inference with KV cache.
std::vector< std::vector< float > > infer(const std::vector< float * > &input, const std::vector< std::vector< int64_t > > &shape, StatusCode &sc)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, int batch_size)
std::vector< std::vector< int8_t > > dumpCacheMemory(StatusCode &sc)
Dumps the KV cache memory into buffers.
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
std::vector< Scale > getInputScale() const
Returns the input quantization scale(s) of the model.
StatusCode inferSpeedrun(int variant_idx=0)
Development-only API for measuring pure NPU inference speed.
StatusCode inferCHW(const std::vector< NDArray< float > > &input, std::vector< NDArray< float > > &output)
Performs inference.
StatusCode infer(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape)
This overload is provided for convenience but may result in additional data copies within the qbrunti...
uint64_t getLatencyFinished(const int npu_op_idx) const
StatusCode inferCHW(const std::vector< float * > &input, std::vector< std::vector< float > > &output, const std::vector< std::vector< int64_t > > &shape, uint32_t cache_size)
This overload supports inference with KV cache.
StatusCode dumpCacheMemory(std::vector< std::vector< int8_t > > &bufs)
Dumps the KV cache memory into buffers.
A class representing an N-dimensional array (NDArray).
Definition ndarray.h:77
CoreMode
Defines the core mode for NPU execution.
Definition type.h:169
StatusCode
Enumerates status codes for the qbruntime.
Definition status_code.h:26
Represents a unique identifier for an NPU core.
Definition type.h:123