from typing import List, Optional, Tuple, Union

import numpy as np

import qbruntime.qbruntime as _cQbRuntime

from .accelerator import Accelerator
from .model_variant_handle import *

_Shape = Tuple[int, ...]

__all__ = ["Model", "load"]

def _is_valid_shape(input_shape: _Shape, shape: _Shape) -> bool:
    if (len(input_shape) < len(shape)) or (len(input_shape) > len(shape) + 1):
        return False
    # Allow a single optional leading batch dimension.
    offset = 1 if len(input_shape) > len(shape) else 0
    for s1, s2 in zip(input_shape[offset:], shape):
        if s1 % s2 != 0 or (s2 > 0 and s1 != s2):
            return False
    return True

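# For example (hypothetical shapes): a model input of (224, 224, 3) accepts an
# array shaped (224, 224, 3) or (1, 224, 224, 3), since one leading batch
# dimension is tolerated:
#
#   _is_valid_shape((1, 224, 224, 3), (224, 224, 3))  # True
#   _is_valid_shape((224, 3, 224), (224, 224, 3))     # False
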
def _find_memory_format(
    inputs: List[np.ndarray], shapes: List[_Shape]
) -> Optional[Tuple[bool, bool]]:
    if len(inputs) != len(shapes):
        return None
    is_hwc = True
    is_chw = True
    for arr, shape in zip(inputs, shapes):
        shape_hwc = (shape[0], shape[1], shape[2])
        shape_chw = (shape[2], shape[0], shape[1])
        is_hwc = is_hwc and _is_valid_shape(arr.shape, shape_hwc)
        is_chw = is_chw and _is_valid_shape(arr.shape, shape_chw)
        if not is_hwc and not is_chw:
            return None
    return (is_hwc, is_chw)

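# For example (hypothetical): with a model shape of (224, 224, 3), an input of
# shape (224, 224, 3) matches HWC while (3, 224, 224) matches CHW, letting the
# caller dispatch to the matching memory-format entry point.
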
def _find_matching_variant_idx_and_memory_format(
    model, inputs: List[np.ndarray]
) -> Tuple[int, Tuple[bool, bool]]:
    variant_idx = None
    is_hwc, is_chw = False, False
    for i in range(model.get_num_model_variants()):
        res = _find_memory_format(
            inputs, model.get_model_variant_handle(i).get_model_input_shape()
        )
        if res is not None:
            variant_idx = i
            is_hwc, is_chw = res
            break
    if variant_idx is None:
        raise ValueError("Input shape is invalid.")
    return variant_idx, (is_hwc, is_chw)

def _build_outputs(
    shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> List[np.ndarray]:
    outputs = []
    for shape in shapes:
        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        outputs.append(np.empty(shape, dtype=dtype))
    return outputs

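# For example (hypothetical): _build_outputs([(7, 7, 256)], is_hwc=False,
# dtype=np.float32) returns one uninitialized array of shape (256, 7, 7).
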
def _check_output_shapes(
    outputs: List[np.ndarray], shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> None:
    if len(outputs) != len(shapes):
        raise ValueError("The number of outputs does not match the model.")
    for output, shape in zip(outputs, shapes):
        if output.dtype != dtype:
            raise ValueError("Output dtype mismatch.")
        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        if output.shape != shape:
            raise ValueError("Output shape mismatch.")

class Model:
    """
    @brief Represents an AI model loaded from an MXQ file.

    This class loads an AI model from an MXQ file and provides functions to launch it
    on the NPU and perform inference.
    """

    def __init__(self, path: str, model_config: Optional[ModelConfig] = None):
        """
        @brief Creates a Model object from the specified MXQ model file and configuration.

        Parses the MXQ file and constructs a Model object using the provided configuration,
        initializing the model with the given settings.

        @note The created Model object must be launched before performing inference.
              See Model.launch for more details.

        @param[in] path The path to the MXQ model file.
        @param[in] model_config The configuration settings used to initialize the Model.
        """
        if model_config is None:
            self._model = _cQbRuntime.Model(path)
        else:
            self._model = _cQbRuntime.Model(path, model_config._model_config)

    def launch(self, acc: Accelerator) -> None:
        """
        @brief Launches the model on the specified Accelerator, which represents
               the actual NPU.

        @param[in] acc The accelerator on which to launch the model.
        """
        # Reconstructed body; assumes the module's usual `_<name>` wrapper convention.
        self._model.launch(acc._accelerator)

    def dispose(self) -> None:
        """
        @brief Disposes of the model loaded onto the NPU.

        Releases any resources associated with the model on the NPU.
        """
        self._model.dispose()

    def is_target(self, core_id: CoreId) -> bool:
        """
        @brief Checks if the NPU core specified by CoreId is the target of the model,
        i.e., whether the model is configured to use the given NPU core.

        @param[in] core_id The CoreId to check.
        @return True if the model is configured to use the specified CoreId, false otherwise.
        """
        # Reconstructed body; assumes the module's usual `_<name>` wrapper convention.
        return self._model.is_target(core_id._core_id)

    def get_core_mode(self) -> CoreMode:
        """
        @brief Retrieves the core mode of the model.

        @return The CoreMode of the model.
        """
        return self._model.get_core_mode()

    def get_target_cores(self) -> List[CoreId]:
        """
        @brief Returns the NPU cores the model is configured to use.

        @return A list of CoreIds representing the target NPU cores.
        """
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    @property
    def target_cores(self) -> List[CoreId]:
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    def infer(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        """
        @brief Performs inference.

        The following types of inference are supported:
        1. infer(in: List[numpy]) -> List[numpy] (float / int)
        2. infer(in: numpy) -> List[numpy] (float / int)
        3. infer(in: List[numpy], out: List[numpy]) (float / int)
        4. infer(in: List[numpy], out: List[]) (float / int)
        5. infer(in: numpy, out: List[numpy]) (float / int)
        6. infer(in: numpy, out: List[]) (float / int)

        @param[in] inputs Input data as a single numpy.ndarray or a list
                   of numpy.ndarray's.
        @param[out] outputs Optional pre-allocated list of numpy.ndarray's
                    to store inference results.
        @return Inference results as a list of numpy.ndarray.
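
        A minimal usage sketch (the MXQ path and input shape are hypothetical):

        @code{.py}
        model = qbruntime.Model("net.mxq")
        acc = qbruntime.Accelerator()
        model.launch(acc)

        x = np.zeros((224, 224, 3), dtype=np.float32)  # assumed HWC input
        outputs = model.infer(x)
        @endcode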
        """
        return self._infer(inputs, outputs, cache_size)

    # NOTE: method name reconstructed to mirror infer_hwc_to_float/infer_chw_to_float.
    def infer_hwc(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        # Same as infer(), but forces the HWC (channel-last) input layout.
        return self._infer(inputs, outputs, cache_size, True)

    def infer_chw(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        # Same as infer(), but forces the CHW (channel-first) input layout.
        return self._infer(inputs, outputs, cache_size, False)

    def _infer(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]],
        cache_size: int,
        is_target_hwc: Optional[bool] = None,
    ) -> Optional[List[np.ndarray]]:
        if not isinstance(inputs, list):
            inputs = [inputs]
        variant_idx, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(
            self, inputs
        )
        if (is_target_hwc is not None) and (
            (is_target_hwc != is_hwc) and (is_target_hwc == is_chw)
        ):
            raise ValueError("Input shape is invalid.")
        elif is_target_hwc is None:
            is_target_hwc = is_hwc
        inputs = [np.ascontiguousarray(i) for i in inputs]
        if outputs is None:
            infer_func = self._model.infer if is_target_hwc else self._model.infer_chw
            return [np.asarray(o) for o in infer_func(inputs, cache_size)]
        # Reconstructed output-handling path: output shapes are taken from the
        # matched variant, and the output dtype is assumed to be float32.
        shapes = self.get_model_variant_handle(variant_idx).get_model_output_shape()
        if len(outputs) != 0:
            _check_output_shapes(outputs, shapes, is_target_hwc, np.float32)
            for oi in range(len(outputs)):
                outputs[oi] = np.ascontiguousarray(outputs[oi])
        else:
            outputs[:] = _build_outputs(shapes, is_target_hwc, np.float32)
        if is_target_hwc:
            self._model.infer(inputs, outputs, cache_size)
        else:
            self._model.infer_chw(inputs, outputs, cache_size)
        return None

    def infer_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        """
        @brief int8_t-to-float inference.

        Performs inference with input and output elements of type `int8_t`;
        the raw outputs are then converted to float.

        Using these inference APIs requires manual scaling (quantization)
        of float values to `int8_t` for input.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        return self._infer_to_float(inputs, cache_size)

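    # A hedged sketch of manual input quantization for the *_to_float APIs
    # (the scale value and its exact representation are assumptions; consult
    # Model.get_input_scale() for the real per-input scales):
    #
    #   scale = 0.05  # hypothetical quantization scale
    #   q = np.clip(np.round(x / scale), -128, 127).astype(np.int8)
    #   outputs = model.infer_to_float(q)
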
    def infer_hwc_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        # Same as infer_to_float(), but forces the HWC (channel-last) input layout.
        return self._infer_to_float(inputs, cache_size, True)

    def infer_chw_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        # Same as infer_to_float(), but forces the CHW (channel-first) input layout.
        return self._infer_to_float(inputs, cache_size, False)

    def _infer_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int,
        is_target_hwc: Optional[bool] = None,
    ) -> List[np.ndarray]:
        """
        @brief int8_t-to-float inference.

        Performs inference with input and output elements of type `int8_t`;
        the raw outputs are then converted to float.

        Using these inference APIs requires manual scaling (quantization)
        of float values to `int8_t` for input.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(self, inputs)
        if (is_target_hwc is not None) and (
            (is_target_hwc != is_hwc) and (is_target_hwc == is_chw)
        ):
            raise ValueError("Input shape is invalid.")
        elif is_target_hwc is None:
            is_target_hwc = is_hwc
        inputs = [np.ascontiguousarray(i) for i in inputs]
        if is_target_hwc:
            outputs = self._model.infer_to_float(inputs, cache_size)
        else:
            outputs = self._model.infer_chw_to_float(inputs, cache_size)
        return [np.asarray(o) for o in outputs]

    def infer_buffer(
        self,
        inputs: List[Buffer],
        outputs: List[Buffer],
        shape: List[List[int]] = [],
        cache_size: int = 0,
    ) -> None:
        """
        @brief Buffer-to-Buffer inference.

        Performs inference using input and output elements in the NPU's internal data type.
        The inference operates on buffers allocated via the following APIs:

        - `Model.acquire_input_buffer()`
        - `Model.acquire_output_buffer()`
        - `ModelVariantHandle.acquire_input_buffer()`
        - `ModelVariantHandle.acquire_output_buffer()`

        Additionally, `Model.reposition_inputs()`, `Model.reposition_outputs()`,
        `ModelVariantHandle.reposition_inputs()`, and `ModelVariantHandle.reposition_outputs()`
        must be used properly; a sketch of the full flow follows.

        @note These APIs are intended for advanced use rather than typical usage.
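
        An illustrative sketch of the buffer workflow (the input array `x` and
        the order of calls are assumptions; error handling is omitted):

        @code{.py}
        in_bufs = model.acquire_input_buffer()
        out_bufs = model.acquire_output_buffer()

        model.reposition_inputs([x], in_bufs)   # x: np.ndarray input
        model.infer_buffer(in_bufs, out_bufs)
        outputs = []
        model.reposition_outputs(out_bufs, outputs)

        model.release_buffer(in_bufs)
        model.release_buffer(out_bufs)
        @endcode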
        """
        self._model.infer_buffer(
            [i._buffer for i in inputs], [o._buffer for o in outputs], shape, cache_size
        )

    def infer_speedrun(self) -> None:
        """
        @brief Development-only API for measuring pure NPU inference speed.

        Runs NPU inference without uploading inputs and without retrieving outputs.
        """
        self._model.infer_speedrun()

    def infer_async(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief Asynchronous Inference.

        Performs inference asynchronously.

        To use asynchronous inference, the model must be created using a `ModelConfig`
        object with the async pipeline configured to be enabled. This is done by calling
        @ref ModelConfig.set_async_pipeline_enabled
        "ModelConfig.set_async_pipeline_enabled(True)" before passing the configuration to
        the Model constructor.

        @code{.py}
        mc = qbruntime.ModelConfig()
        mc.set_async_pipeline_enabled(True)

        model = qbruntime.Model(MXQ_PATH, mc)
        acc = qbruntime.Accelerator()
        model.launch(acc)

        future = model.infer_async(inputs)
        @endcode

        @note Currently, only CNN-based models are supported, as asynchronous execution is
        particularly effective for this type of workload.

        Limitations:
        - RNN/LSTM and LLM models are not supported yet.
        - Models requiring CPU offloading are not supported yet.
        - Currently, only single-batch inference is supported (i.e., N = 1).
        - Currently, Buffer inference is not supported. The following types
          are supported in the synchronous API for advanced use cases, but are not
          yet available for asynchronous inference.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async if is_hwc else self._model.infer_async_chw
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)

    def infer_async_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief This method supports int8_t-to-float asynchronous inference.

        @param[in] inputs Input data as a single numpy.ndarray or a list
                   of numpy.ndarray's.
        @return A future that can be used to retrieve the inference result.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async_to_float
            if is_hwc
            else self._model.infer_async_chw_to_float
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)

    def reposition_inputs(
        self,
        inputs: List[np.ndarray],
        input_bufs: List[Buffer],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Reposition input."""
        inputs = [np.ascontiguousarray(i) for i in inputs]
        self._model.reposition_inputs(
            inputs, [buf._buffer for buf in input_bufs], seqlens
        )

    def reposition_outputs(
        self,
        output_bufs: List[Buffer],
        outputs: List[np.ndarray],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Reposition output."""
        if len(outputs) == 0:
            for shape in self.get_model_output_shape():
                outputs.append(np.empty(shape=shape, dtype=np.float32))
        for oi in range(len(outputs)):
            outputs[oi] = np.ascontiguousarray(outputs[oi])
        self._model.reposition_outputs(
            [buf._buffer for buf in output_bufs], outputs, seqlens
        )

    def get_num_model_variants(self) -> int:
        """
        @brief Returns the total number of model variants available in this model.

        The `variant_idx` parameter passed to `Model.get_model_variant_handle()` must be
        in the range [0, return value of this function).

        @return The total number of model variants.
        """
        return self._model.get_num_model_variants()

    def get_model_variant_handle(self, variant_idx: int) -> ModelVariantHandle:
        """
        @brief Retrieves a handle to the specified model variant.

        Use the returned `ModelVariantHandle` to query details such as input and output
        shapes for the selected variant.

        @param[in] variant_idx Index of the model variant to retrieve.
                   Must be in the range [0, get_num_model_variants()).

        @return A `ModelVariantHandle` object if successful;
                otherwise, raises qbruntime.QbRuntimeError "Model_InvalidVariantIdx".
        """
        return ModelVariantHandle.from_cpp(
            self._model.get_model_variant_handle(variant_idx)
        )

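    # A hedged sketch of enumerating variants (assumes the handle exposes
    # get_model_input_shape() as used elsewhere in this module):
    #
    #   for i in range(model.get_num_model_variants()):
    #       handle = model.get_model_variant_handle(i)
    #       print(i, handle.get_model_input_shape())
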
    def get_model_input_shape(self) -> List[_Shape]:
        """
        @brief Returns the input shape of the model.

        @return A list of input shapes, one per model input.
        """
        return self._model.get_model_input_shape()

    def get_model_output_shape(self) -> List[_Shape]:
        """
        @brief Returns the output shape of the model.

        @return A list of output shapes, one per model output.
        """
        return self._model.get_model_output_shape()

    def get_input_scale(self) -> List[Scale]:
        """
        @brief Returns the input quantization scale(s) of the model.

        @return A list of input scales.
        """
        return self._model.get_input_scale()

    def get_output_scale(self) -> List[Scale]:
        """
        @brief Returns the output quantization scale(s) of the model.

        @return A list of output scales.
        """
        return self._model.get_output_scale()

    def get_input_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the input buffer information for the model.

        @return A list of input buffer information.
        """
        return self._model.get_input_buffer_info()

    def get_output_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the output buffer information of the model.

        @return A list of output buffer information.
        """
        return self._model.get_output_buffer_info()

    def acquire_input_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API.

        Acquires a list of `Buffer` objects for input.
        This API is required when calling `Model.infer_buffer()`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        # Reconstructed body; assumes a from_cpp converter like CoreId's.
        return [Buffer.from_cpp(buf) for buf in self._model.acquire_input_buffer(seqlens)]

    def acquire_output_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API.

        Acquires a list of `Buffer` objects for output.
        This API is required when calling `Model.infer_buffer()`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        # Reconstructed body; assumes a from_cpp converter like CoreId's.
        return [Buffer.from_cpp(buf) for buf in self._model.acquire_output_buffer(seqlens)]

    def release_buffer(self, buffer: List[Buffer]) -> None:
        """
        @brief Buffer Management API.

        Deallocates acquired input/output buffers.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        self._model.release_buffer([buf._buffer for buf in buffer])

    def get_identifier(self) -> int:
        """
        @brief Returns the model's unique identifier.

        This identifier distinguishes multiple models within a single user program.
        It is assigned incrementally, starting from 0 (e.g., 0, 1, 2, 3, ...).

        @return The model identifier.
        """
        return self._model.get_identifier()

    def get_model_path(self) -> str:
        """
        @brief Returns the path to the MXQ model file associated with the Model.

        @return The MXQ file path.
        """
        return self._model.get_model_path()

    def get_cache_infos(self) -> List[CacheInfo]:
        """
        @brief Returns information about the KV cache of the model.

        @return A list of CacheInfo objects.
        """
        return self._model.get_cache_infos()

    def dump_cache_memory(self) -> List[bytes]:
        """
        @brief Dumps the KV cache memory into buffers.

        Copies the current KV cache data into byte buffers.

        @return A list of bytes containing the KV cache data.
        """
        bufs = self._model.dump_cache_memory()
        return [np.asarray(buf, np.int8).tobytes() for buf in bufs]

    def load_cache_memory(self, bufs: List[bytes]) -> None:
        """
        @brief Loads the KV cache memory from buffers.

        Restores the KV cache from the provided buffers.

        @param[in] bufs A list of bytes containing the KV cache data.
        """
        self._model.load_cache_memory(
            [np.frombuffer(buf, dtype=np.int8) for buf in bufs]
        )

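    # A hedged dump/restore roundtrip (the surrounding inference steps are
    # assumptions):
    #
    #   snapshot = model.dump_cache_memory()   # snapshot KV cache as bytes
    #   ...                                    # run more inference steps
    #   model.load_cache_memory(snapshot)      # restore the earlier KV state
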
    def dump_cache_memory_to(self, cache_dir: str) -> None:
        """
        @brief Dumps KV cache memory to files in the specified directory.

        Writes the KV cache data to binary files within the given directory.
        Each file is named using the format: `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files will be saved.
        """
        self._model.dump_cache_memory_to(cache_dir)

    def load_cache_memory_from(self, cache_dir: str) -> None:
        """
        @brief Loads the KV cache memory from files in the specified directory.

        Reads KV cache data from files within the given directory and restores them.
        Each file is named using the format: `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files are saved.
        """
        self._model.load_cache_memory_from(cache_dir)

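    # File-based variant of the roundtrip above (the directory path is hypothetical):
    #
    #   model.dump_cache_memory_to("/tmp/kv_cache")    # writes cache_<layer_hash>.bin files
    #   model.load_cache_memory_from("/tmp/kv_cache")  # restores them later
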
    def filter_cache_tail(
        self, cache_size: int, tail_size: int, mask: List[bool]
    ) -> int:
        """
        @brief Filters the tail of the KV cache memory.

        Retains the desired caches in the tail of the KV cache memory, excludes the others,
        and shifts the remaining caches forward.

        @param[in] cache_size The number of tokens accumulated in the KV cache so far.
        @param[in] tail_size The tail size of the KV cache to filter (<= 32).
        @param[in] mask A mask indicating tokens to retain or exclude at the tail of the KV
                   cache.

        @return New cache size after tail filtering.
        """
        return self._model.filter_cache_tail(cache_size, tail_size, mask)

    def move_cache_tail(self, num_head: int, num_tail: int, cache_size: int) -> int:
        """
        @brief Moves the tail of the KV cache memory to the end of the head.

        Slices the tail of the KV cache memory up to the specified size
        and moves it to the designated cache position.

        @param[in] num_head The size of the KV cache head where the tail is appended.
        @param[in] num_tail The size of the KV cache tail to be moved.
        @param[in] cache_size The total number of tokens accumulated in the KV cache so
                   far.

        @return The updated cache size after moving the tail.
        """
        return self._model.move_cache_tail(num_head, num_tail, cache_size)

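    # A hedged example of tail manipulation (all token counts are hypothetical):
    # with 100 cached tokens, keep only the tail tokens flagged True, then
    # append a 2-token tail after an 80-token head:
    #
    #   new_size = model.filter_cache_tail(100, 4, [True, False, True, False])
    #   new_size = model.move_cache_tail(80, 2, new_size)
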
def load(path: str, model_config: Optional[ModelConfig] = None) -> Model:
    """
    @brief Single-step inference API. Creates the model and uploads it
    to the NPU immediately.

    This operation performs the Accelerator declaration, Model declaration,
    and launch in a single step.
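
    A minimal usage sketch (the MXQ path and inputs are hypothetical):

    @code{.py}
    model = qbruntime.load("net.mxq")
    outputs = model.infer(inputs)
    @endcode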
    """
    model = Model(path, model_config)
    acc = Accelerator()
    model.launch(acc)
    return model