from typing import List, Optional, Tuple, Union

import numpy as np

import maccel.maccel as _cMaccel

from .accelerator import Accelerator
from .model_variant_handle import *

# NOTE: the numpy import above was elided in this excerpt but is required
# throughout. Imports of ModelConfig, CoreId, CoreMode, Buffer, BufferInfo,
# Scale, CacheInfo, and Future were also elided; they are likely provided by
# sibling maccel modules or the wildcard import above.

_Shape = Tuple[int, ...]

__all__ = ["Model", "load"]
def _is_valid_shape(input_shape: _Shape, shape: _Shape) -> bool:
    if (len(input_shape) < len(shape)) or (len(input_shape) > len(shape) + 1):
        return False
    # One extra leading (batch) dimension is allowed.
    offset = 1 if len(input_shape) > len(shape) else 0
    for s1, s2 in zip(input_shape[offset:], shape):
        if s1 % s2 != 0 or (s2 > 0 and s1 != s2):
            return False
    return True
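
# A quick sketch of how _is_valid_shape behaves (illustrative values, not from
# the original source): a model shape of (224, 224, 3) accepts an input of the
# same rank, or one extra leading batch dimension.
#
#   _is_valid_shape((224, 224, 3), (224, 224, 3))     # True
#   _is_valid_shape((1, 224, 224, 3), (224, 224, 3))  # True
#   _is_valid_shape((224, 224), (224, 224, 3))        # False: rank too low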
def _is_shape_hwc(inputs: List[np.ndarray], shapes: List[_Shape]) -> Optional[bool]:
    # True -> inputs are HWC, False -> inputs are CHW, None -> no match.
    # (The initializations and early returns were elided in the source; they
    # follow from the Optional[bool] return type and the call site below.)
    if len(inputs) != len(shapes):
        return None
    is_hwc = True
    is_chw = True
    for arr, shape in zip(inputs, shapes):
        shape_hwc = (shape[0], shape[1], shape[2])
        shape_chw = (shape[2], shape[0], shape[1])
        is_hwc = is_hwc and _is_valid_shape(arr.shape, shape_hwc)
        is_chw = is_chw and _is_valid_shape(arr.shape, shape_chw)
        if not is_hwc and not is_chw:
            return None
    return is_hwc
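
# For example (illustrative): with a model input shape of (H, W, C) =
# (224, 224, 3), an input array of shape (224, 224, 3) yields True (HWC),
# an array of shape (3, 224, 224) yields False (CHW), and any other shape
# yields None.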
58def _find_matching_variant_idx_and_is_hwc(
59 model, inputs: List[np.ndarray]
63 for i
in range(model.get_num_model_variants()):
64 is_hwc = _is_shape_hwc(
65 inputs, model.get_model_variant_handle(i).get_model_input_shape()
67 if is_hwc
is not None:
72 raise ValueError(
"Input shape is invalid.")
73 assert variant_idx
is not None
74 return variant_idx, is_hwc
def _build_outputs(
    shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> List[np.ndarray]:
    outputs = []
    for shape in shapes:
        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        outputs.append(np.empty(shape, dtype=dtype))
    return outputs
def _check_output_shapes(
    outputs: List[np.ndarray], shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> None:
    if len(outputs) != len(shapes):
        raise ValueError("The number of outputs is different.")
    for output, shape in zip(outputs, shapes):
        if output.dtype != dtype:
            raise ValueError("Output dtype mismatch.")
        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        if output.shape != shape:
            raise ValueError("Output shape mismatch.")
class Model:
    """
    @brief Represents an AI model loaded from an MXQ file.

    This class loads an AI model from an MXQ file and provides functions to launch it
    on the NPU and perform inference.
    """

    def __init__(self, path: str, model_config: Optional[ModelConfig] = None):
        """
        @brief Creates a Model object from the specified MXQ model file and configuration.

        Parses the MXQ file and constructs a Model object using the provided configuration,
        initializing the model with the given settings.

        @note The created Model object must be launched before performing inference.
        See Model.launch for more details.

        @param[in] path The path to the MXQ model file.
        @param[in] model_config The configuration settings to initialize the Model.
        """
        if model_config is None:
            self._model = _cMaccel.Model(path)
        else:
            self._model = _cMaccel.Model(path, model_config._model_config)
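
    # Typical construction (sketch; "model.mxq" is a placeholder path):
    #
    #   model = Model("model.mxq")                # default configuration
    #   model = Model("model.mxq", model_config)  # with an explicit ModelConfig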
    def launch(self, acc: Accelerator) -> None:
        """
        @brief Launches the model on the specified Accelerator, which represents
        the actual NPU.

        @param[in] acc The accelerator on which to launch the model.
        """
        ...  # implementation elided in this excerpt
    def dispose(self) -> None:
        """
        @brief Disposes of the model loaded onto the NPU.

        Releases any resources associated with the model on the NPU.
        """
        ...
    def is_target(self, core_id: CoreId) -> bool:
        """
        @brief Checks if the NPU core specified by CoreId is the target of the model.
        In other words, whether the model is configured to use the given NPU core.

        @param[in] core_id The CoreId to check.
        @return True if the model is configured to use the specified CoreId, false otherwise.
        """
        ...
    def get_core_mode(self) -> CoreMode:
        """
        @brief Retrieves the core mode of the model.

        @return The CoreMode of the model.
        """
        ...
    def get_target_cores(self) -> List[CoreId]:
        """
        @brief Returns the NPU cores the model is configured to use.

        @return A list of CoreIds representing the target NPU cores.
        """
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    def target_cores(self) -> List[CoreId]:
        return [CoreId.from_cpp(target) for target in self._model.target_cores]
    def infer(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        """
        @brief Performs inference.

        The following types of inference are supported:
        1. infer(in: List[numpy]) -> List[numpy] (float / int)
        2. infer(in: numpy) -> List[numpy] (float / int)
        3. infer(in: List[numpy], out: List[numpy]) (float / int)
        4. infer(in: List[numpy], out: List[]) (float / int)
        5. infer(in: numpy, out: List[numpy]) (float / int)
        6. infer(in: numpy, out: List[]) (float / int)

        @param[in] inputs Input data as a single numpy.ndarray or a list
        of numpy.ndarray's.
        @param[out] outputs Optional pre-allocated list of numpy.ndarray's
        to store inference results.
        @return Inference results as a list of numpy.ndarray.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        variant_idx, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        if outputs is None:
            infer_func = self._model.infer if is_hwc else self._model.infer_chw
            return [np.asarray(o) for o in infer_func(inputs, cache_size)]
        # Caller supplied an output list: validate pre-allocated arrays, or
        # build them when the list is empty. (Parts of this branch were elided
        # in the source; the shape lookup and the float32 dtype below are a
        # best-effort reconstruction.)
        shapes = self.get_model_variant_handle(variant_idx).get_model_output_shape()
        if outputs:
            _check_output_shapes(outputs, shapes, is_hwc, np.float32)
            for oi in range(len(outputs)):
                outputs[oi] = np.ascontiguousarray(outputs[oi])
        else:
            outputs[:] = _build_outputs(shapes, is_hwc, np.float32)
        if is_hwc:
            self._model.infer(inputs, outputs, cache_size)
        else:
            self._model.infer_chw(inputs, outputs, cache_size)
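
    # End-to-end synchronous usage (sketch; the file name and input shape are
    # illustrative):
    #
    #   acc = Accelerator()
    #   model = Model("model.mxq")
    #   model.launch(acc)
    #   x = np.random.rand(224, 224, 3).astype(np.float32)
    #   outputs = model.infer(x)   # list of numpy.ndarray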
    def infer_to_float(
        self, inputs: Union[np.ndarray, List[np.ndarray]], cache_size: int = 0
    ) -> List[np.ndarray]:
        """
        @brief int8_t-to-float inference

        Performs inference with input and output elements of type `int8_t`.

        Using these inference APIs requires manual scaling (quantization)
        of float values to `int8_t` for input.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        # The HWC branch was elided in the source; it is reconstructed here by
        # analogy with infer() above.
        if is_hwc:
            outputs = self._model.infer_to_float(inputs, cache_size)
        else:
            outputs = self._model.infer_chw_to_float(inputs, cache_size)
        return [np.asarray(o) for o in outputs]

    infer_chw_to_float = infer_to_float
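
    # Manual input quantization sketch (the 0.0157 scale is illustrative; real
    # scale values come from get_input_scale(), and `model` is assumed to be a
    # launched Model as in the earlier example):
    #
    #   x_float = np.random.rand(224, 224, 3).astype(np.float32)
    #   x_int8 = np.clip(np.round(x_float / 0.0157), -128, 127).astype(np.int8)
    #   outputs = model.infer_to_float(x_int8)  # float results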
    def infer_buffer(
        self,
        inputs: List[Buffer],
        outputs: List[Buffer],
        shape: List[List[int]] = [],
        cache_size: int = 0,
    ) -> None:
        """
        @brief Buffer-to-Buffer inference

        Performs inference using input and output elements in the NPU's internal data type.
        The inference operates on buffers allocated via the following APIs:

        - `Model.acquire_input_buffer()`
        - `Model.acquire_output_buffer()`
        - `ModelVariantHandle.acquire_input_buffer()`
        - `ModelVariantHandle.acquire_output_buffer()`

        Additionally, `Model.reposition_inputs()`, `Model.reposition_outputs()`,
        `ModelVariantHandle.reposition_inputs()`, and `ModelVariantHandle.reposition_outputs()`
        must be used properly.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        self._model.infer_buffer(
            [i._buffer for i in inputs], [o._buffer for o in outputs], shape, cache_size
        )
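
    # Buffer-to-buffer flow (sketch assembled from the APIs named above; all
    # method names and signatures appear in this module's member summary):
    #
    #   in_bufs = model.acquire_input_buffer()
    #   out_bufs = model.acquire_output_buffer()
    #   model.reposition_inputs(inputs, in_bufs)     # numpy -> Buffer
    #   model.infer_buffer(in_bufs, out_bufs)
    #   results: List[np.ndarray] = []
    #   model.reposition_outputs(out_bufs, results)  # Buffer -> numpy
    #   model.release_buffer(in_bufs)
    #   model.release_buffer(out_bufs)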
    def infer_speedrun(self) -> None:
        """
        @brief Development-only API for measuring pure NPU inference speed.

        Runs NPU inference without uploading inputs and without retrieving outputs.
        """
        ...
    def infer_async(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief Asynchronous Inference

        Performs inference asynchronously.

        To use asynchronous inference, the model must be created using a `ModelConfig`
        object with the async pipeline configured to be enabled. This is done by calling
        @ref ModelConfig.set_async_pipeline_enabled
        "ModelConfig.set_async_pipeline_enabled(True)" before passing the configuration to
        the Model constructor. For example:

            mc = maccel.ModelConfig()
            mc.set_async_pipeline_enabled(True)

            model = maccel.Model(MXQ_PATH, mc)
            acc = maccel.Accelerator()
            model.launch(acc)

            future = model.infer_async(inputs)

        @note Currently, only CNN-based models are supported, as asynchronous execution is
        particularly effective for this type of workload.

        Limitations:
        - RNN/LSTM and LLM models are not supported yet.
        - Models requiring CPU offloading are not supported yet.
        - Currently, only single-batch inference is supported (i.e., N = 1).
        - Currently, Buffer inference is not supported. The following types
          are supported in the synchronous API for advanced use cases, but are not
          yet available for asynchronous inference.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async if is_hwc else self._model.infer_async_chw
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)
    def infer_async_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief This method supports int8_t-to-float asynchronous inference.

        @param[in] inputs Input data as a single numpy.ndarray or a list
        of numpy.ndarray's.
        @return A future that can be used to retrieve the inference result.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async_to_float
            if is_hwc
            else self._model.infer_async_chw_to_float
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)
    def reposition_inputs(
        self,
        inputs: List[np.ndarray],
        input_bufs: List[Buffer],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Reposition input"""
        inputs = [np.ascontiguousarray(i) for i in inputs]
        self._model.reposition_inputs(
            inputs, [buf._buffer for buf in input_bufs], seqlens
        )
    def reposition_outputs(
        self,
        output_bufs: List[Buffer],
        outputs: List[np.ndarray],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Reposition output"""
        # If no output arrays were supplied, allocate them from the model's
        # output shapes. (The lines setting up this loop were elided in the
        # source; this is a best-effort reconstruction.)
        if not outputs:
            for shape in self.get_model_output_shape():
                outputs.append(np.empty(shape=shape, dtype=np.float32))
        for oi in range(len(outputs)):
            outputs[oi] = np.ascontiguousarray(outputs[oi])
        self._model.reposition_outputs(
            [buf._buffer for buf in output_bufs], outputs, seqlens
        )
    def get_num_model_variants(self) -> int:
        """
        @brief Returns the total number of model variants available in this model.

        The `variant_idx` parameter passed to `Model.get_model_variant_handle()` must be
        in the range [0, return value of this function).

        @return The total number of model variants.
        """
        ...
    def get_model_variant_handle(self, variant_idx) -> ModelVariantHandle:
        """
        @brief Retrieves a handle to the specified model variant.

        Use the returned `ModelVariantHandle` to query details such as input and output
        shapes for the selected variant.

        @param[in] variant_idx Index of the model variant to retrieve.
        Must be in the range [0, get_num_model_variants()).

        @return A `ModelVariantHandle` object if successful;
        otherwise, raises maccel.MAccelError "Model_InvalidVariantIdx".
        """
        return ModelVariantHandle.from_cpp(
            self._model.get_model_variant_handle(variant_idx)
        )
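
    # Enumerating variants (sketch; both methods are defined in this class,
    # and get_model_input_shape() on the handle is used the same way by the
    # module-level helpers above):
    #
    #   for i in range(model.get_num_model_variants()):
    #       handle = model.get_model_variant_handle(i)
    #       print(i, handle.get_model_input_shape())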
    def get_model_input_shape(self) -> List[_Shape]:
        """
        @brief Returns the input shape of the model.

        @return A list of input shapes of the model.
        """
        ...

    def get_model_output_shape(self) -> List[_Shape]:
        """
        @brief Returns the output shape of the model.

        @return A list of output shapes of the model.
        """
        ...
    def get_input_scale(self) -> List[Scale]:
        """
        @brief Returns the input quantization scale(s) of the model.

        @return A list of input scales.
        """
        ...

    def get_output_scale(self) -> List[Scale]:
        """
        @brief Returns the output quantization scale(s) of the model.

        @return A list of output scales.
        """
        ...
    def get_input_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the input buffer information for the model.

        @return A list of input buffer information.
        """
        ...

    def get_output_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the output buffer information of the model.

        @return A list of output buffer information.
        """
        ...
    def acquire_input_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API

        Acquires a list of `Buffer` objects for input.
        This API is required when calling `Model.infer_buffer()`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        ...

    def acquire_output_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API

        Acquires a list of `Buffer` objects for output.
        This API is required when calling `Model.infer_buffer()`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        ...

    def release_buffer(self, buffer: List[Buffer]) -> None:
        """
        @brief Buffer Management API

        Deallocates acquired input/output buffers.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        ...
    def get_identifier(self) -> int:
        """
        @brief Returns the model's unique identifier.

        This identifier distinguishes multiple models within a single user program.
        It is assigned incrementally, starting from 0 (e.g., 0, 1, 2, 3, ...).

        @return The model identifier.
        """
        ...

    def get_model_path(self) -> str:
        """
        @brief Returns the path to the MXQ model file associated with the Model.

        @return The MXQ file path.
        """
        ...
    def get_cache_infos(self) -> List[CacheInfo]:
        """
        @brief Returns information about the KV cache of the model.

        @return A list of CacheInfo objects.
        """
        ...
    def reset_cache_memory(self) -> None:
        """
        @brief Resets the KV cache memory.

        Clears the stored KV cache, restoring it to its initial state.
        """
        ...
    def dump_cache_memory(self) -> List[bytes]:
        """
        @brief Dumps the KV cache memory into buffers.

        Writes the current KV cache data into provided buffers.

        @return A list of bytes containing the KV cache data.
        """
        # The line producing `bufs` was elided in the source; a direct
        # delegation to the C++ binding is assumed here.
        bufs = self._model.dump_cache_memory()
        return [np.asarray(buf, np.int8).tobytes() for buf in bufs]

    def load_cache_memory(self, bufs: List[bytes]) -> None:
        """
        @brief Loads the KV cache memory from buffers.

        Restores the KV cache from the provided buffers.

        @param[in] bufs A list of bytes containing the KV cache data.
        """
        self._model.load_cache_memory(
            [np.frombuffer(buf, dtype=np.int8) for buf in bufs]
        )
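
    # KV cache round trip (sketch):
    #
    #   snapshot = model.dump_cache_memory()  # List[bytes]
    #   model.reset_cache_memory()
    #   model.load_cache_memory(snapshot)     # restore the saved state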
    def dump_cache_memory_to(self, cache_dir: str) -> None:
        """
        @brief Dumps KV cache memory to files in the specified directory.

        Writes the KV cache data to binary files within the given directory.
        Each file is named using the format: `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files will be saved.
        """
        ...

    def load_cache_memory_from(self, cache_dir: str) -> None:
        """
        @brief Loads the KV cache memory from files in the specified directory.

        Reads KV cache data from files within the given directory and restores them.
        Each file is named using the format: `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files are saved.
        """
        ...
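
    # File-based persistence (sketch; the directory path is illustrative):
    #
    #   model.dump_cache_memory_to("/tmp/kv_cache")    # writes cache_<layer_hash>.bin files
    #   model.load_cache_memory_from("/tmp/kv_cache")  # restores them later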
    def filter_cache_tail(
        self, cache_size: int, tail_size: int, mask: List[bool]
    ) -> int:
        """
        @brief Filters the tail of the KV cache memory.

        Retains the desired caches in the tail of the KV cache memory, excludes the others,
        and shifts the remaining caches forward.

        @param[in] cache_size The number of tokens accumulated in the KV cache so far.
        @param[in] tail_size The tail size of the KV cache to filter (<= 32).
        @param[in] mask A mask indicating tokens to retain or exclude at the tail of the KV
        cache.
        @return New cache size after tail filtering.
        """
        ...
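
    # Worked example (assumed semantics, inferred from the description above):
    # with cache_size=100, tail_size=4, and mask=[True, False, True, False],
    # 2 of the last 4 tokens are retained and shifted forward, so the returned
    # cache size would be 98.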
    def move_cache_tail(self, num_head: int, num_tail: int, cache_size: int) -> int:
        """
        @brief Moves the tail of the KV cache memory to the end of the head.

        Slices the tail of the KV cache memory up to the specified size
        and moves it to the designated cache position.

        @param[in] num_head The size of the KV cache head where the tail is appended.
        @param[in] num_tail The size of the KV cache tail to be moved.
        @param[in] cache_size The total number of tokens accumulated in the KV cache so far.
        @return The updated cache size after moving the tail.
        """
        ...
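
    # Worked example (assumed semantics, inferred from the description above):
    # with num_head=10, num_tail=4, and cache_size=100, the last 4 cached
    # tokens are appended right after the 10-token head, giving an updated
    # cache size of 14.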
def load(path: str, model_config: Optional[ModelConfig] = None) -> Model:
    """
    @brief Single-step inference API. Creates the model and uploads it
    onto the NPU immediately.

    This operation performs the Accelerator declaration, Model declaration,
    and launch in a single step.
    """
    model = Model(path, model_config)
    # The remaining lines were elided in the source; per the docstring, the
    # model is launched on a fresh Accelerator before being returned.
    model.launch(Accelerator())
    return model
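
# One-step usage (sketch; the path and input are illustrative):
#
#   model = load("model.mxq")   # create, upload, and launch in one call
#   outputs = model.infer(x)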
Model member summary:
__init__(self, str path, Optional[ModelConfig] model_config=None)
    Creates a Model object from the specified MXQ model file and configuration.
List[_Shape] _input_shape
List[_Shape] _output_shape
List[Buffer] acquire_input_buffer(self, List[List[int]] seqlens=[])
    Buffer Management API.
List[Buffer] acquire_output_buffer(self, List[List[int]] seqlens=[])
    Buffer Management API.
None dispose(self)
    Disposes of the model loaded onto the NPU.
List[bytes] dump_cache_memory(self)
    Dumps the KV cache memory into buffers.
None dump_cache_memory_to(self, str cache_dir)
    Dumps KV cache memory to files in the specified directory.
int filter_cache_tail(self, int cache_size, int tail_size, List[bool] mask)
    Filters the tail of the KV cache memory.
List[CacheInfo] get_cache_infos(self)
    Returns information about the KV cache of the model.
CoreMode get_core_mode(self)
    Retrieves the core mode of the model.
int get_identifier(self)
    Returns the model's unique identifier.
List[BufferInfo] get_input_buffer_info(self)
    Returns the input buffer information for the model.
List[Scale] get_input_scale(self)
    Returns the input quantization scale(s) of the model.
int get_latency_consumed(self)
int get_latency_finished(self)
LatencySetPolicy get_latency_set_policy(self)
MaintenancePolicy get_maintenance_policy(self)
List[_Shape] get_model_input_shape(self)
    Returns the input shape of the model.
List[_Shape] get_model_output_shape(self)
    Returns the output shape of the model.
str get_model_path(self)
    Returns the path to the MXQ model file associated with the Model.
ModelVariantHandle get_model_variant_handle(self, variant_idx)
    Retrieves a handle to the specified model variant.
int get_num_model_variants(self)
    Returns the total number of model variants available in this model.
List[BufferInfo] get_output_buffer_info(self)
    Returns the output buffer information of the model.
List[Scale] get_output_scale(self)
    Returns the output quantization scale(s) of the model.
SchedulePolicy get_schedule_policy(self)
List[CoreId] get_target_cores(self)
    Returns the NPU cores the model is configured to use.
Optional[List[np.ndarray]] infer(self, Union[np.ndarray, List[np.ndarray]] inputs, Optional[List[np.ndarray]] outputs=None, int cache_size=0)
    Performs inference.
Future infer_async(self, Union[np.ndarray, List[np.ndarray]] inputs)
    Asynchronous inference.
Future infer_async_to_float(self, Union[np.ndarray, List[np.ndarray]] inputs)
    int8_t-to-float asynchronous inference.
None infer_buffer(self, List[Buffer] inputs, List[Buffer] outputs, List[List[int]] shape=[], int cache_size=0)
    Buffer-to-Buffer inference.
None infer_speedrun(self)
    Development-only API for measuring pure NPU inference speed.
List[np.ndarray] infer_to_float(self, Union[np.ndarray, List[np.ndarray]] inputs, int cache_size=0)
    int8_t-to-float inference.
bool is_target(self, CoreId core_id)
    Checks if the NPU core specified by CoreId is the target of the model.
None launch(self, Accelerator acc)
    Launches the model on the specified Accelerator, which represents the actual NPU.
None load_cache_memory(self, List[bytes] bufs)
    Loads the KV cache memory from buffers.
None load_cache_memory_from(self, str cache_dir)
    Loads the KV cache memory from files in the specified directory.
int move_cache_tail(self, int num_head, int num_tail, int cache_size)
    Moves the tail of the KV cache memory to the end of the head.
None release_buffer(self, List[Buffer] buffer)
    Buffer Management API.
None reposition_inputs(self, List[np.ndarray] inputs, List[Buffer] input_bufs, List[List[int]] seqlens=[])
    Repositions input.
None reposition_outputs(self, List[Buffer] output_bufs, List[np.ndarray] outputs, List[List[int]] seqlens=[])
    Repositions output.
None reset_cache_memory(self)
    Resets the KV cache memory.
List[CoreId] target_cores(self)
Module function summary:

Model load(str path, Optional[ModelConfig] model_config=None)
    Single-step inference API. Creates the model and uploads it onto the NPU immediately.