5from typing
import List, Optional, Tuple, Union
9import qbruntime.qbruntime
as _cQbRuntime
10from .accelerator
import Accelerator
12from .model_variant_handle
import *
15_Shape = Tuple[int, ...]
17__all__ = [
"Model",
"load"]
25def _is_valid_shape(input_shape: _Shape, shape: _Shape) -> bool:
26 if (len(input_shape) < len(shape))
or (len(input_shape) > len(shape) + 1):
29 offset = 1
if len(input_shape) > len(shape)
else 0
30 for s1, s2
in zip(input_shape[offset:], shape):
33 if s1 % s2 != 0
or (s2 > 0
and s1 != s2):
40def _find_memory_format(
41 inputs: List[np.ndarray], shapes: List[_Shape]
42) -> Optional[Tuple[bool, bool]]:
43 if len(inputs) != len(shapes):
48 for arr, shape
in zip(inputs, shapes):
49 shape_hwc = (shape[0], shape[1], shape[2])
50 shape_chw = (shape[2], shape[0], shape[1])
51 is_hwc = is_hwc
and _is_valid_shape(arr.shape, shape_hwc)
52 is_chw = is_chw
and _is_valid_shape(arr.shape, shape_chw)
54 if not is_hwc
and not is_chw:
60def _find_matching_variant_idx_and_memory_format(
61 model, inputs: List[np.ndarray]
62) -> Tuple[int, Tuple[bool, bool]]:
66 for i
in range(model.get_num_model_variants()):
67 res = _find_memory_format(
68 inputs, model.get_model_variant_handle(i).get_model_input_shape()
75 if variant_idx
is None:
76 raise ValueError(
"Input shape is invalid.")
77 return variant_idx, (is_hwc, is_chw)
82 shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
87 shape = (shape[0], shape[1], shape[2])
89 shape = (shape[2], shape[0], shape[1])
90 outputs.append(np.empty(shape, dtype=dtype))
95def _check_output_shapes(
96 outputs: List[np.ndarray], shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
98 if len(outputs) != len(shapes):
99 raise ValueError(
"The number of outputs is different.")
101 for output, shape
in zip(outputs, shapes):
102 if output.dtype != dtype:
103 raise ValueError(
"Output dtype mismatch.")
106 shape = (shape[0], shape[1], shape[2])
108 shape = (shape[2], shape[0], shape[1])
109 if output.shape != shape:
110 raise ValueError(
"Output shape mismatch.")
115 @brief Represents an AI model loaded from an MXQ file.
117 This class loads an AI model from an MXQ file and provides functions to launch it
118 on the NPU and perform inference.
121 def __init__(self, path: str, model_config: Optional[ModelConfig] =
None):
123 @brief Creates a Model object from the specified MXQ model file and configuration.
125 Parses the MXQ file and constructs a Model object using the provided configuration,
126 initializing the model with the given settings.
128 @note The created Model object must be launched before performing inference.
129 See Model.launch for more details.
131 @param[in] path The path to the MXQ model file.
132 @param[in] model_config The configuration settings to initialize the Model.
134 if model_config
is None:
135 self.
_model = _cQbRuntime.Model(path)
137 self.
_model = _cQbRuntime.Model(path, model_config._model_config)
144 def launch(self, acc: Accelerator) ->
None:
146 @brief Launches the model on the specified Accelerator, which represents
149 @param[in] acc The accelerator on which to launch the model.
156 @brief Disposes of the model loaded onto the NPU.
158 Releases any resources associated with the model on the NPU.
165 @brief Checks if the NPU core specified by CoreId is the target of the model.
166 In other words, whether the model is configured to use the given NPU core.
168 @param[in] core_id The CoreId to check.
169 @return True if the model is configured to use the specified CoreId, false
176 @brief Retrieves the core mode of the model.
178 @return The CoreMode of the model.
184 @brief Returns the NPU cores the model is configured to use.
186 @return A list of CoreIds representing the target NPU cores.
188 return [CoreId.from_cpp(target)
for target
in self.
_model.target_cores]
193 return [CoreId.from_cpp(target)
for target
in self.
_model.target_cores]
197 inputs: Union[np.ndarray, List[np.ndarray]],
198 outputs: Optional[List[np.ndarray]] =
None,
200 params: Optional[List[BatchParam]] =
None,
201 ) -> Optional[List[np.ndarray]]:
203 @brief Performs inference.
205 The following types of inference are supported.
206 1. infer(in:List[numpy]) -> List[numpy] (float / int)
207 2. infer(in:numpy) -> List[numpy] (float / int)
208 3. infer(in:List[numpy], out:List[numpy]) (float / int)
209 4. infer(in:List[numpy], out:List[]) (float / int)
210 5. infer(in:numpy, out:List[numpy]) (float / int)
211 6. infer(in:numpy, out:List[]) (float / int)
213 @param[in] inputs Input data as a single numpy.ndarray or a list
215 @param[out] outputs Optional pre-allocated list of numpy.ndarray's
216 to store inference results.
217 @param[in] cache_size The number of tokens accumulated in the KV cache so far.
218 @param[in] params A List of `BatchParam`, specifying each batch's information
219 for BatchLLM inference. If `params` is specified,
220 `cache_size` is ignored.
221 @return Inference results as a list of numpy.ndarray.
223 return self.
_infer(inputs, outputs, cache_size, params=params)
227 inputs: Union[np.ndarray, List[np.ndarray]],
228 outputs: Optional[List[np.ndarray]] =
None,
230 params: Optional[List[BatchParam]] =
None,
231 ) -> Optional[List[np.ndarray]]:
232 return self.
_infer(inputs, outputs, cache_size,
True, params)
236 inputs: Union[np.ndarray, List[np.ndarray]],
237 outputs: Optional[List[np.ndarray]] =
None,
239 params: Optional[List[BatchParam]] =
None,
240 ) -> Optional[List[np.ndarray]]:
241 return self.
_infer(inputs, outputs, cache_size,
False, params)
245 inputs: Union[np.ndarray, List[np.ndarray]],
246 outputs: Optional[List[np.ndarray]],
248 is_target_hwc: Optional[bool] =
None,
249 params: Optional[List[BatchParam]] =
None,
250 ) -> Optional[List[np.ndarray]]:
251 if not isinstance(inputs, list):
254 variant_idx, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(
257 if (is_target_hwc
is not None)
and (
258 (is_target_hwc != is_hwc)
and (is_target_hwc == is_chw)
260 raise ValueError(
"Input shape is invalid.")
261 elif is_target_hwc
is None:
262 is_target_hwc = is_hwc
263 inputs = [np.ascontiguousarray(i)
for i
in inputs]
265 infer_func = self.
_model.infer
if is_target_hwc
else self.
_model.infer_chw
269 return [np.asarray(o)
for o
in infer_func(inputs, cache_size)]
274 inputs, [param._batch_param
for param
in params]
279 _check_output_shapes(
285 for oi
in range(len(outputs)):
286 outputs[oi] = np.ascontiguousarray(outputs[oi])
288 outputs[:] = _build_outputs(
295 infer_func(inputs, outputs, cache_size)
297 infer_func(inputs, outputs, [param._batch_param
for param
in params])
306 ) -> List[np.ndarray]:
308 @brief int8_t-to-float inference
309 Performs inference with input and output elements of type `int8_t`
311 Using these inference APIs requires manual scaling (quantization)
312 of float values to `int8_t` for input.
314 @note These APIs are intended for advanced use rather than typical usage.
318 def infer_hwc_to_float(
325 ) -> List[np.ndarray]:
328 def infer_chw_to_float(
335 ) -> List[np.ndarray]:
345 is_target_hwc: Optional[bool] =
None,
346 ) -> List[np.ndarray]:
348 @brief int8_t-to-float inference
349 Performs inference with input and output elements of type `int8_t`
351 Using these inference APIs requires manual scaling (quantization)
352 of float values to `int8_t` for input.
354 @note These APIs are intended for advanced use rather than typical usage.
356 if not isinstance(inputs, list):
359 _, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(self, inputs)
360 if (is_target_hwc
is not None)
and (
361 (is_target_hwc != is_hwc)
and (is_target_hwc == is_chw)
363 raise ValueError(
"Input shape is invalid.")
364 elif is_target_hwc
is None:
365 is_target_hwc = is_hwc
366 inputs = [np.ascontiguousarray(i)
for i
in inputs]
371 outputs = self.
_model.infer_chw_to_float(inputs, cache_size)
373 return [np.asarray(o)
for o
in outputs]
377 inputs: List[Buffer],
378 outputs: List[Buffer],
379 shape: List[List[int]] = [],
383 @brief Buffer-to-Buffer inference
385 Performs inference using input and output elements in the NPU’s internal data type.
386 The inference operates on buffers allocated via the following APIs:
388 - `Model.acquire_input_buffer()`
389 - `Model.acquire_output_buffer()`
390 - `ModelVariantHandle.acquire_input_buffer()`
391 - `ModelVariantHandle.acquire_output_buffer()`
393 Additionally, `Model.reposition_inputs()`, `Model.reposition_outputs()`,
394 `ModelVariantHandle.reposition_inputs()`, `ModelVariantHandle.reposition_outputs()`
395 must be used properly.
397 @note These APIs are intended for advanced use rather than typical usage.
400 [i._buffer
for i
in inputs], [o._buffer
for o
in outputs], shape, cache_size
405 @brief Development-only API for measuring pure NPU inference speed.
407 Runs NPU inference without uploading inputs and without retrieving outputs.
413 inputs: Union[np.ndarray, List[np.ndarray]],
416 @brief Asynchronous Inference
418 Performs inference asynchronously.
420 To use asynchronous inference, the model must be created using a `ModelConfig`
421 object with the async pipeline configured to be enabled. This is done by calling
422 @ref ModelConfig.set_async_pipeline_enabled
423 "ModelConfig.set_async_pipeline_enabled(True)" before passing the configuration to
430 mc = qbruntime.ModelConfig()
431 mc.set_async_pipeline_enabled(True)
433 model = qbruntime.Model(MXQ_PATH, mc)
434 acc = qbruntime.Accelerator()
438 future = model.infer_async(inputs)
443 @note Currently, only CNN-based models are supported, as asynchronous execution is
444 particularly effective for this type of workload.
447 - RNN/LSTM and LLM models are not supported yet.
448 - Models requiring CPU offloading are not supported yet.
449 - Currently, only single-batch inference is supported (i.e., N = 1).
450 - Currently, Buffer inference is not supported. The following types
451 are supported in the synchronous API for advanced use cases, but are not
452 yet available for asynchronous inference:
456 if not isinstance(inputs, list):
458 _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
459 inputs = [np.ascontiguousarray(i)
for i
in inputs]
461 self.
_model.infer_async
if is_hwc
else self.
_model.infer_async_chw
463 return Future.from_cpp(infer_async_func(inputs), inputs)
467 inputs: Union[np.ndarray, List[np.ndarray]],
470 @brief This method supports int8_t-to-float asynchronous inference.
472 @param[in] inputs Input data as a single numpy.ndarray or a list
475 @return A future that can be used to retrieve the inference result.
477 if not isinstance(inputs, list):
479 _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
480 inputs = [np.ascontiguousarray(i)
for i
in inputs]
482 self.
_model.infer_async_to_float
484 else self.
_model.infer_async_chw_to_float
486 return Future.from_cpp(infer_async_func(inputs), inputs)
490 inputs: List[np.ndarray],
491 input_bufs: List[Buffer],
492 seqlens: List[List[int]] = [],
494 """Reposition input"""
495 inputs = [np.ascontiguousarray(i)
for i
in inputs]
497 inputs, [buf._buffer
for buf
in input_bufs], seqlens
502 output_bufs: List[Buffer],
503 outputs: List[np.ndarray],
504 seqlens: List[List[int]] = [],
506 """Reposition output"""
510 outputs.append(np.empty(shape=shape, dtype=np.float32))
512 for oi
in range(len(outputs)):
513 outputs[oi] = np.ascontiguousarray(outputs[oi])
515 [buf._buffer
for buf
in output_bufs], outputs, seqlens
520 @brief Returns the total number of model variants available in this model.
522 The `variant_idx` parameter passed to `Model.get_model_variant_handle()` must be
523 in the range [0, return value of this function).
525 @return The total number of model variants.
531 @brief Retrieves a handle to the specified model variant.
533 Use the returned `ModelVariantHandle` to query details such as input and output
534 shapes for the selected variant.
536 @param[in] variant_idx Index of the model variant to retrieve.
537 Must be in the range [0, getNumModelVariants()).
539 @return A `ModelVariantHandle` object if successful;
540 otherwise, raise qbruntime.QbRuntimeError "Model_InvalidVariantIdx".
542 return ModelVariantHandle.from_cpp(
548 @brief Returns the input shape of the model.
550 @return A list of input shape of the model.
556 @brief Returns the output shape of the model.
558 @return A list of output shape of the model.
564 @brief Returns the input quantization scale(s) of the model.
566 @return A list of input scales.
572 @brief Returns the output quantization scale(s) of the model.
574 @return A list of output scales.
580 @brief Returns the input buffer information for the model.
582 @return A list of input buffer information.
588 @brief Returns the output buffer information of the model.
590 @return A list of output buffer information.
596 @brief Returns a data type for model inputs.
598 @return An input data type.
604 @brief Returns a data type for model outputs.
606 @return An output data type.
612 @brief Buffer Management API
614 Acquires list of `Buffer` for input.
615 This API is required when calling `Model.infer_buffer()`.
617 @note These APIs are intended for advanced use rather than typical usage.
623 @brief Buffer Management API
625 Acquires list of `Buffer` for output.
626 This API is required when calling `Model.infer_buffer()`.
628 @note These APIs are intended for advanced use rather than typical usage.
634 @brief Buffer Management API
636 Deallocate acquired Input/Output buffer
638 @note These APIs are intended for advanced use rather than typical usage.
644 @brief Returns the model's unique identifier.
646 This identifier distinguishes multiple models within a single user program.
647 It is assigned incrementally, starting from 0 (e.g., 0, 1, 2, 3, ...).
649 @return The model identifier.
655 @brief Returns the path to the MXQ model file associated with the Model.
657 @return The MXQ file path.
663 @brief Returns information about the KV cache of the model.
665 @return A list of CacheInfo objects.
679 @brief Dumps the KV cache memory into buffers.
681 Writes the current KV cache data into provided buffers.
683 @param[in] cache_id Index of target cache.
685 @return A list of bytes containing the KV cache data.
688 return [np.asarray(buf, np.int8).tobytes()
for buf
in bufs]
692 @brief Loads the KV cache memory from buffers.
694 Restores the KV cache from the provided buffers.
696 @param[in] bufs A list of bytes containing the KV cache
699 [np.frombuffer(buf, dtype=np.int8)
for buf
in bufs], cache_id
704 @brief Dumps KV cache memory to files in the specified directory.
706 Writes the KV cache data to binary files within the given directory.
707 Each file is named using the format: `cache_<layer_hash>.bin`.
709 @param[in] cache_dir Path to the directory where KV cache files will be saved.
710 @param[in] cache_id Index of target cache.
716 @brief Loads the KV cache memory from files in the specified directory.
718 Reads KV cache data from files within the given directory and restores them.
719 Each file is named using the format: `cache_<layer_hash>.bin`.
721 @param[in] cache_dir Path to the directory where KV cache files are saved.
726 self, cache_size: int, tail_size: int, mask: List[bool]
729 @brief Filter the tail of the KV cache memory
731 Retains the desired caches in the tail of the KV cache memory, excludes the others,
732 and shifts the remaining caches forward.
734 @param[in] cache_size The number of tokens accumulated in the KV cache so far.
735 @param[in] tail_size The tail size of the KV cache to filter (<=32).
736 @param[in] mask A mask indicating tokens to retain or exclude at the tail of the KV
739 @return New cache size after tail filtering.
745 @brief Moves the tail of the KV cache memory to the end of the head.
747 Slice the tail of the KV cache memory up to the specified size
748 and moves it to the designated cache position.
750 @param[in] num_head The size of the KV cache head where the tail is appended.
751 @param[in] num_tail The size of the KV cache tail to be moved.
752 @param[in] cache_size The total number of tokens accumulated in the KV cache so
755 @return The updated cache size after moving the tail.
def load(path: str, model_config: Optional["ModelConfig"] = None) -> "Model":
    """
    @brief Single-step inference API. Creates model and uploads the model
           into NPU immediately.

    This operation performs the Accelerator declaration, Model declaration,
    and launch in a single step.

    @param[in] path The path to the MXQ model file.
    @param[in] model_config Optional configuration settings for the Model.
    @return The launched Model.

    NOTE(review): reconstructed from a garbled fragment — the Accelerator
    creation and launch lines were inferred from this function's own
    docstring; confirm against the original file.
    """
    model = Model(path, model_config)
    acc = Accelerator()
    model.launch(acc)
    return model
Represents an accelerator, i.e., an NPU, used for executing models.
Represents an AI model loaded from an MXQ file.
DataType get_model_input_data_type(self)
Returns a data type for model inputs.
None launch(self, Accelerator acc)
Launches the model on the specified Accelerator, which represents the actual NPU.
None reposition_outputs(self, List[Buffer] output_bufs, List[np.ndarray] outputs, List[List[int]] seqlens=[])
Reposition output.
List[Scale] get_input_scale(self)
Returns the input quantization scale(s) of the model.
CoreMode get_core_mode(self)
Retrieves the core mode of the model.
List[CoreId] get_target_cores(self)
Returns the NPU cores the model is configured to use.
List[_Shape] _input_shape
List[CoreId] target_cores(self)
List[_Shape] _output_shape
List[Buffer] acquire_input_buffer(self, List[List[int]] seqlens=[])
Buffer Management API.
List[BufferInfo] get_output_buffer_info(self)
Returns the output buffer information of the model.
str get_model_path(self)
Returns the path to the MXQ model file associated with the Model.
Future infer_async(self, Union[np.ndarray, List[np.ndarray]] inputs)
Asynchronous Inference.
DataType get_model_output_data_type(self)
Returns a data type for model outputs.
List[Buffer] acquire_output_buffer(self, List[List[int]] seqlens=[])
Buffer Management API.
int filter_cache_tail(self, int cache_size, int tail_size, List[bool] mask)
Filter the tail of the KV cache memory.
List[np.ndarray] _infer_to_float(self, Union[np.ndarray, List[np.ndarray],] inputs, int cache_size, Optional[bool] is_target_hwc=None)
int8_t-to-float inference Performs inference with input and output elements of type int8_t
int get_num_model_variants(self)
Returns the total number of model variants available in this model.
None load_cache_memory_from(self, str cache_dir, int cache_id=0)
Loads the KV cache memory from files in the specified directory.
bool is_target(self, CoreId core_id)
Checks if the NPU core specified by CoreId is the target of the model.
None infer_speedrun(self)
Development-only API for measuring pure NPU inference speed.
int get_latency_consumed(self)
None dump_cache_memory_to(self, str cache_dir, int cache_id=0)
Dumps KV cache memory to files in the specified directory.
Future infer_async_to_float(self, Union[np.ndarray, List[np.ndarray]] inputs)
This method supports int8_t-to-float asynchronous inference.
ModelVariantHandle get_model_variant_handle(self, variant_idx)
Retrieves a handle to the specified model variant.
None load_cache_memory(self, List[bytes] bufs, int cache_id=0)
Loads the KV cache memory from buffers.
List[_Shape] get_model_output_shape(self)
Returns the output shape of the model.
List[CacheInfo] get_cache_infos(self)
Returns information about the KV cache of the model.
int get_latency_finished(self)
None release_buffer(self, List[Buffer] buffer)
Buffer Management API.
List[bytes] dump_cache_memory(self, int cache_id=0)
Dumps the KV cache memory into buffers.
None dispose(self)
Disposes of the model loaded onto the NPU.
List[_Shape] get_model_input_shape(self)
Returns the input shape of the model.
List[Scale] get_output_scale(self)
Returns the output quantization scale(s) of the model.
None infer_buffer(self, List[Buffer] inputs, List[Buffer] outputs, List[List[int]] shape=[], int cache_size=0)
Buffer-to-Buffer inference.
List[np.ndarray] infer_to_float(self, Union[np.ndarray, List[np.ndarray],] inputs, int cache_size=0)
int8_t-to-float inference Performs inference with input and output elements of type int8_t
__init__(self, str path, Optional[ModelConfig] model_config=None)
Creates a Model object from the specified MXQ model file and configuration.
Optional[List[np.ndarray]] _infer(self, Union[np.ndarray, List[np.ndarray]] inputs, Optional[List[np.ndarray]] outputs, int cache_size, Optional[bool] is_target_hwc=None, Optional[List[BatchParam]] params=None)
Optional[List[np.ndarray]] infer(self, Union[np.ndarray, List[np.ndarray]] inputs, Optional[List[np.ndarray]] outputs=None, int cache_size=0, Optional[List[BatchParam]] params=None)
Performs inference.
None reposition_inputs(self, List[np.ndarray] inputs, List[Buffer] input_bufs, List[List[int]] seqlens=[])
Reposition input.
List[BufferInfo] get_input_buffer_info(self)
Returns the input buffer information for the model.
int get_identifier(self)
Returns the model's unique identifier.
int move_cache_tail(self, int num_head, int num_tail, int cache_size)
Moves the tail of the KV cache memory to the end of the head.
A simple byte-sized buffer.
Defines the core mode for NPU execution.
Model load(str path, Optional[ModelConfig] model_config=None)
Single-step inference API.