from enum import Enum
from typing import List, Optional, Tuple

import numpy as np

import qbruntime.qbruntime as _cQbRuntime
class Cluster(Enum):
    """
    @brief Enumerates clusters in the ARIES NPU.

    @note The ARIES NPU consists of two clusters, each containing one global core and
    four local cores, totaling eight local cores. REGULUS has only a single cluster
    (Cluster0) with one local core (Core0).
    """

    Cluster0 = _cQbRuntime.Cluster.Cluster0
    Cluster1 = _cQbRuntime.Cluster.Cluster1
    Error = _cQbRuntime.Cluster.Error
class Core(Enum):
    """
    @brief Enumerates cores within a cluster in the ARIES NPU.

    @note The ARIES NPU consists of two clusters, each containing one global core and
    four local cores, totaling eight local cores. REGULUS has only a single cluster
    (Cluster0) with one local core (Core0).
    """

    Core0 = _cQbRuntime.Core.Core0
    Core1 = _cQbRuntime.Core.Core1
    Core2 = _cQbRuntime.Core.Core2
    Core3 = _cQbRuntime.Core.Core3
    All = _cQbRuntime.Core.All
    GlobalCore = _cQbRuntime.Core.GlobalCore
    Error = _cQbRuntime.Core.Error
class CoreAllocationPolicy(Enum):
    """@brief Core allocation policy."""

    Auto = _cQbRuntime.CoreAllocationPolicy.Auto
    Manual = _cQbRuntime.CoreAllocationPolicy.Manual
class Scale:
    """@brief Struct for scale values."""

    def __init__(self, scale: float, is_uniform: bool, scale_list: List[float]):
        self._scale = _cQbRuntime.Scale()
        self._scale.scale = scale
        self._scale.is_uniform = is_uniform
        self._scale.scale_list = scale_list

    @classmethod
    def from_cpp(cls, _scale: _cQbRuntime.Scale):
        return cls(_scale.scale, _scale.is_uniform, _scale.scale_list)

    @property
    def scale_list(self) -> List[float]:
        return self._scale.scale_list

    @property
    def scale(self) -> float:
        return self._scale.scale

    @property
    def is_uniform(self) -> bool:
        return self._scale.is_uniform

    @scale_list.setter
    def scale_list(self, value: List[float]):
        self._scale.scale_list = value

    @scale.setter
    def scale(self, value: float):
        self._scale.scale = value

    @is_uniform.setter
    def is_uniform(self, value: bool):
        self._scale.is_uniform = value

    def __getitem__(self, i: int) -> float:
        """
        @brief Returns the scale value at the specified index.
        """
        return self._scale.scale_list[i]

    def __repr__(self):
        d = {
            "scale": self.scale,
            "is_uniform": self.is_uniform,
            "scale_list": self.scale_list,
        }
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )
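# A minimal usage sketch for `Scale` (illustrative only): a uniform scale carries a
# single value, while a per-channel scale carries one value per channel in
# `scale_list`. The concrete numbers below are hypothetical.
#
#   uniform = Scale(scale=0.5, is_uniform=True, scale_list=[0.5])
#   per_channel = Scale(scale=0.0, is_uniform=False, scale_list=[0.25, 0.5, 1.0])
#   per_channel[1]  # -> 0.5, via __getitem__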
class CoreId:
    """
    @brief Represents a unique identifier for an NPU core.

    A CoreId consists of a Cluster and a Core, identifying a specific core
    in the NPU.
    """

    def __init__(self, cluster: Cluster, core: Core):
        self._core_id = _cQbRuntime.CoreId()
        self._core_id.cluster = cluster.value
        self._core_id.core = core.value

    @classmethod
    def from_cpp(cls, _core_id: _cQbRuntime.CoreId):
        return cls(Cluster(_core_id.cluster), Core(_core_id.core))

    @property
    def cluster(self) -> Cluster:
        return Cluster(self._core_id.cluster)

    @property
    def core(self) -> Core:
        return Core(self._core_id.core)

    @cluster.setter
    def cluster(self, value: Cluster):
        self._core_id.cluster = value.value

    @core.setter
    def core(self, value: Core):
        self._core_id.core = value.value

    def __eq__(self, other) -> bool:
        """
        @brief Checks if two CoreId objects are equal.

        @return True if both CoreId objects are identical, False otherwise.
        """
        return self._core_id == other._core_id

    def __lt__(self, other) -> bool:
        """
        @brief Compares two CoreId objects for ordering.

        @return True if this CoreId is less than the given CoreId, False otherwise.
        """
        return self._core_id < other._core_id

    def __repr__(self):
        d = {"cluster": self.cluster, "core": self.core}
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )
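# Illustrative sketch: `CoreId` pairs a Cluster with a Core and supports equality
# and ordering, so CoreIds can be sorted or deduplicated. Values are hypothetical.
#
#   a = CoreId(Cluster.Cluster0, Core.Core0)
#   b = CoreId(Cluster.Cluster1, Core.Core0)
#   a == b          # False
#   sorted([b, a])  # ordering via __lt__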
class Buffer:
    """
    @brief A simple byte-sized buffer.

    This struct represents a contiguous block of memory for storing byte-sized data.
    """

    def __init__(self, _buffer: Optional[_cQbRuntime.Buffer] = None):
        self._buffer = _cQbRuntime.Buffer() if _buffer is None else _buffer

    @property
    def size(self) -> int:
        return self._buffer.size

    @size.setter
    def size(self, value: int):
        self._buffer.size = value

    def set_buffer(self, arr: np.ndarray):
        self._buffer.set_buffer(np.ascontiguousarray(arr))

    def __repr__(self):
        return f"{self.__class__.__name__}(size={self._buffer.size})"
class CoreMode(Enum):
    """
    @brief Defines the core mode for NPU execution.

    Supported core modes include single-core, multi-core, global4-core, and global8-core.
    For detailed explanations of each mode, refer to the following functions:

    - `ModelConfig.set_single_core_mode()`
    - `ModelConfig.set_multi_core_mode()`
    - `ModelConfig.set_global4_core_mode()`
    - `ModelConfig.set_global8_core_mode()`
    """

    Single = _cQbRuntime.CoreMode.Single
    Multi = _cQbRuntime.CoreMode.Multi
    Global = _cQbRuntime.CoreMode.Global
    Global4 = _cQbRuntime.CoreMode.Global4
    Global8 = _cQbRuntime.CoreMode.Global8
    Error = _cQbRuntime.CoreMode.Error
class BufferInfo:
    """@brief Struct representing input/output buffer information."""

    def __init__(
        self,
        original_height: int = 0,
        original_width: int = 0,
        original_channel: int = 0,
        reshaped_height: int = 0,
        reshaped_width: int = 0,
        reshaped_channel: int = 0,
        height: int = 0,
        width: int = 0,
        channel: int = 0,
        max_height: int = 0,
        max_width: int = 0,
        max_channel: int = 0,
        max_cache_size: int = 0,
    ):
        self._buffer_info = _cQbRuntime.BufferInfo()
        self._buffer_info.original_height = original_height
        self._buffer_info.original_width = original_width
        self._buffer_info.original_channel = original_channel
        self._buffer_info.reshaped_height = reshaped_height
        self._buffer_info.reshaped_width = reshaped_width
        self._buffer_info.reshaped_channel = reshaped_channel
        self._buffer_info.height = height
        self._buffer_info.width = width
        self._buffer_info.channel = channel
        self._buffer_info.max_height = max_height
        self._buffer_info.max_width = max_width
        self._buffer_info.max_channel = max_channel
        self._buffer_info.max_cache_size = max_cache_size

    @classmethod
    def from_cpp(cls, _buffer_info: _cQbRuntime.BufferInfo):
        return cls(
            _buffer_info.original_height,
            _buffer_info.original_width,
            _buffer_info.original_channel,
            _buffer_info.reshaped_height,
            _buffer_info.reshaped_width,
            _buffer_info.reshaped_channel,
            _buffer_info.height,
            _buffer_info.width,
            _buffer_info.channel,
            _buffer_info.max_height,
            _buffer_info.max_width,
            _buffer_info.max_channel,
            _buffer_info.max_cache_size,
        )

    @property
    def original_height(self) -> int:
        """Height of original input/output."""
        return self._buffer_info.original_height

    @property
    def original_width(self) -> int:
        """Width of original input/output."""
        return self._buffer_info.original_width

    @property
    def original_channel(self) -> int:
        """Channel of original input/output."""
        return self._buffer_info.original_channel

    @property
    def reshaped_height(self) -> int:
        """Height of reshaped input/output."""
        return self._buffer_info.reshaped_height

    @property
    def reshaped_width(self) -> int:
        """Width of reshaped input/output."""
        return self._buffer_info.reshaped_width

    @property
    def reshaped_channel(self) -> int:
        """Channel of reshaped input/output."""
        return self._buffer_info.reshaped_channel

    @property
    def height(self) -> int:
        """Height of NPU input/output."""
        return self._buffer_info.height

    @property
    def width(self) -> int:
        """Width of NPU input/output."""
        return self._buffer_info.width

    @property
    def channel(self) -> int:
        """Channel of NPU input/output."""
        return self._buffer_info.channel

    @property
    def max_height(self) -> int:
        """Maximum height of original input/output if data is sequential."""
        return self._buffer_info.max_height

    @property
    def max_width(self) -> int:
        """Maximum width of original input/output if data is sequential."""
        return self._buffer_info.max_width

    @property
    def max_channel(self) -> int:
        """Maximum channel of original input/output if data is sequential."""
        return self._buffer_info.max_channel

    @property
    def max_cache_size(self) -> int:
        """Maximum KV-cache size, relevant for LLM models using KV cache."""
        return self._buffer_info.max_cache_size

    @original_height.setter
    def original_height(self, value: int):
        self._buffer_info.original_height = value

    @original_width.setter
    def original_width(self, value: int):
        self._buffer_info.original_width = value

    @original_channel.setter
    def original_channel(self, value: int):
        self._buffer_info.original_channel = value

    @reshaped_height.setter
    def reshaped_height(self, value: int):
        self._buffer_info.reshaped_height = value

    @reshaped_width.setter
    def reshaped_width(self, value: int):
        self._buffer_info.reshaped_width = value

    @reshaped_channel.setter
    def reshaped_channel(self, value: int):
        self._buffer_info.reshaped_channel = value

    @height.setter
    def height(self, value: int):
        self._buffer_info.height = value

    @width.setter
    def width(self, value: int):
        self._buffer_info.width = value

    @channel.setter
    def channel(self, value: int):
        self._buffer_info.channel = value

    @max_height.setter
    def max_height(self, value: int):
        self._buffer_info.max_height = value

    @max_width.setter
    def max_width(self, value: int):
        self._buffer_info.max_width = value

    @max_channel.setter
    def max_channel(self, value: int):
        self._buffer_info.max_channel = value

    @max_cache_size.setter
    def max_cache_size(self, value: int):
        self._buffer_info.max_cache_size = value

    def original_size(self) -> int:
        """
        @brief Returns the total size of the original input/output.

        @return The data size.
        """
        return self.original_height * self.original_width * self.original_channel

    def reshaped_size(self) -> int:
        """
        @brief Returns the total size of the reshaped input/output.

        @return The data size.
        """
        return self.reshaped_height * self.reshaped_width * self.reshaped_channel

    def size(self) -> int:
        """
        @brief Returns the total size of the NPU input/output.

        @return The data size.
        """
        return self.height * self.width * self.channel

    def original_shape(self) -> Tuple[int, int, int]:
        return (self.original_height, self.original_width, self.original_channel)

    def original_shape_chw(self) -> Tuple[int, int, int]:
        return (self.original_channel, self.original_height, self.original_width)

    def reshaped_shape(self) -> Tuple[int, int, int]:
        return (self.reshaped_height, self.reshaped_width, self.reshaped_channel)

    def reshaped_shape_chw(self) -> Tuple[int, int, int]:
        return (self.reshaped_channel, self.reshaped_height, self.reshaped_width)

    def shape(self) -> Tuple[int, int, int]:
        return (self.height, self.width, self.channel)

    def shape_chw(self) -> Tuple[int, int, int]:
        return (self.channel, self.height, self.width)

    def __repr__(self):
        d = {
            "original_shape": self.original_shape(),
            "reshaped_shape": self.reshaped_shape(),
            "shape": self.shape(),
            "max_shape": (self.max_height, self.max_width, self.max_channel),
            "max_cache_size": self.max_cache_size,
        }
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )
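# Illustrative usage sketch for `Buffer` and `BufferInfo` above (all values are
# hypothetical):
#
#   buf = Buffer()
#   buf.set_buffer(np.zeros((224, 224, 3), dtype=np.uint8))  # contiguous copy
#
#   info = BufferInfo(original_height=224, original_width=224, original_channel=3)
#   info.original_shape()      # (224, 224, 3), HWC order
#   info.original_shape_chw()  # (3, 224, 224), CHW order
#   info.original_size()       # 224 * 224 * 3 = 150528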
class ModelConfig:
    """
    @brief Configures a core mode and core allocation of a model for NPU inference.

    The `ModelConfig` class provides methods for setting a core mode and allocating
    cores for NPU inference. Supported core modes are single-core, multi-core,
    global4-core, and global8-core. Users can also specify which cores to allocate for
    the model. Additionally, the configuration offers an option to enforce the use of a
    specific NPU bundle.

    @note Deprecated functions are included for backward compatibility, but it is
    recommended to use the newer core mode configuration methods.
    """

    def __init__(self, num_cores: Optional[int] = None):
        """
        @brief Default constructor. This default-constructed object is initially set to
        single-core mode with all NPU local cores included.
        """
        self._model_config = (
            _cQbRuntime.ModelConfig()
            if num_cores is None
            else _cQbRuntime.ModelConfig(num_cores)
        )
    def set_single_core_mode(
        self,
        num_cores: Optional[int] = None,
        core_ids: Optional[List[CoreId]] = None,
    ) -> bool:
        """
        @brief Sets the model to use single-core mode for inference with a specified
        number of local cores or an explicit list of CoreIds.

        In single-core mode, each local core executes model inference independently.
        The number of cores used is specified by the `num_cores` parameter, and the core
        allocation policy is set to `CoreAllocationPolicy.Auto`, meaning the model will be
        automatically allocated to available local cores when the model is launched to the
        NPU, specifically when the `Model.launch()` function is called. Alternatively, the
        user can specify a list of CoreIds to determine which cores to use for inference.

        @note Use exactly one of `num_cores` or `core_ids`, not both.

        @param[in] num_cores The number of local cores to use for inference.
        @param[in] core_ids A list of CoreIds to be used for model inference.

        @return True if the mode was successfully set, False otherwise.
        """
        if num_cores is not None and core_ids is None:
            return self._model_config.set_single_core_mode(num_cores)
        elif core_ids is not None and num_cores is None:
            return self._model_config.set_single_core_mode(
                [core_id._core_id for core_id in core_ids]
            )
        else:
            raise ValueError(
                "`set_single_core_mode` needs exactly one of `num_cores` or `core_ids`."
            )
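    # Illustrative sketch of the two mutually exclusive call forms (values are
    # hypothetical):
    #
    #   config = ModelConfig()
    #   config.set_single_core_mode(num_cores=2)  # CoreAllocationPolicy.Auto
    #   config.set_single_core_mode(
    #       core_ids=[CoreId(Cluster.Cluster0, Core.Core0)])   # explicit cores
    #   config.set_single_core_mode(num_cores=2, core_ids=[])  # raises ValueError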
    def set_global4_core_mode(
        self, clusters: List[Cluster] = [Cluster.Cluster0, Cluster.Cluster1]
    ) -> bool:
        """
        @brief Sets the model to use global4-core mode for inference with a specified
        set of NPU clusters.

        For Aries NPU, there are two clusters, each consisting of four local cores. In
        global4-core mode, four local cores within the same cluster work together to
        execute the model inference.

        @param[in] clusters A list of clusters to be used for model inference.

        @return True if the mode was successfully set, False otherwise.
        """
        return self._model_config.set_global4_core_mode(
            [cluster.value for cluster in clusters]
        )
    def set_global8_core_mode(self) -> bool:
        """
        @brief Sets the model to use global8-core mode for inference.

        For Aries NPU, there are two clusters, each consisting of four local cores. In
        global8-core mode, all eight local cores across the two clusters work together to
        execute the model inference.

        @return True if the mode was successfully set, False otherwise.
        """
        return self._model_config.set_global8_core_mode()

    def get_core_mode(self) -> CoreMode:
        """
        @brief Gets the core mode to be applied to the model.

        This reflects the core mode that will be used when the model is created.

        @return The `CoreMode` to be applied to the model.
        """
        return CoreMode(self._model_config.get_core_mode())
    def set_multi_core_mode(
        self, clusters: List[Cluster] = [Cluster.Cluster0, Cluster.Cluster1]
    ) -> bool:
        """
        @brief Sets the model to use multi-core mode for batch inference.

        In multi-core mode, on Aries NPU, the four local cores within a cluster work
        together to process batch inference tasks efficiently. This mode is optimized
        for high-throughput batch processing.

        @param[in] clusters A list of clusters to be used for multi-core batch inference.

        @return True if the mode was successfully set, False otherwise.
        """
        return self._model_config.set_multi_core_mode(
            [cluster.value for cluster in clusters]
        )
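    # Illustrative sketch of the cluster-level modes (hypothetical choices):
    #
    #   config.set_global4_core_mode([Cluster.Cluster0])  # 4 cores of one cluster
    #   config.set_global8_core_mode()                    # all 8 cores, both clusters
    #   config.set_multi_core_mode([Cluster.Cluster0])    # batched inference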
    def get_core_allocation_policy(self) -> CoreAllocationPolicy:
        """
        @brief Gets the core allocation policy to be applied to the model.

        This reflects the core allocation policy that will be used when the model is
        created.

        @return The `CoreAllocationPolicy` to be applied to the model.
        """
        return CoreAllocationPolicy(self._model_config.get_core_allocation_policy())

    def get_num_cores(self) -> int:
        """
        @brief Gets the number of cores to be allocated for the model.

        This represents the number of cores that will be allocated for inference
        when the model is launched to the NPU.

        @return The number of cores to be allocated for the model.
        """
        return self._model_config.get_num_cores()
    def force_single_npu_bundle(self, npu_bundle_index: int) -> bool:
        """
        @brief Forces the use of a specific NPU bundle.

        This function forces the selection of a specific NPU bundle. If a non-negative
        index is provided, the corresponding NPU bundle is selected and runs without CPU
        offloading. If -1 is provided, all NPU bundles are used with CPU offloading
        enabled.

        @param[in] npu_bundle_index The index of the NPU bundle to force. A non-negative
                                    integer selects a specific NPU bundle (runs without CPU
                                    offloading), or -1 to enable all NPU bundles with CPU
                                    offloading.

        @return True if the index is valid and the NPU bundle is successfully set,
                False if the index is invalid (less than -1).
        """
        return self._model_config.force_single_npu_bundle(npu_bundle_index)

    def get_forced_npu_bundle_index(self) -> int:
        """
        @brief Retrieves the index of the forced NPU bundle.

        This function returns the index of the NPU bundle that has been forced using the
        `force_single_npu_bundle` function. If no NPU bundle is forced, the returned
        value is -1.

        @return The index of the forced NPU bundle, or -1 if no bundle is forced.
        """
        return self._model_config.get_forced_npu_bundle_index()
    def set_async_pipeline_enabled(self, enable: bool) -> None:
        """
        @brief Enables or disables the asynchronous pipeline required for asynchronous
        inference.

        Call this function with `enable` set to `True` if you intend to use
        `Model.infer_async()`, as the asynchronous pipeline is necessary for its
        operation.

        If you are only using synchronous inference, such as `Model.infer()` or
        `Model.infer_to_float()`, it is recommended to keep the asynchronous pipeline
        disabled to avoid unnecessary overhead.

        @param[in] enable Set to `True` to enable the asynchronous pipeline; set to
                          `False` to disable it.
        """
        self._model_config.set_async_pipeline_enabled(enable)

    def get_async_pipeline_enabled(self) -> bool:
        """
        @brief Returns whether the asynchronous pipeline is enabled in this configuration.

        @return `True` if the asynchronous pipeline is enabled; `False` otherwise.
        """
        return self._model_config.get_async_pipeline_enabled()
    def set_activation_slots(self, count: int) -> None:
        """
        @brief Sets activation buffer slots for a model that supports multi-activation.

        Call this function if you want to set the number of activation buffer slots
        manually.

        If you do not call this function, the default number of activation buffer slots
        is set differently depending on the CoreMode:

        - `CoreMode.Single` : 2 * (the number of target core ids)
        - `CoreMode.Multi`  : 2 * (the number of target clusters)
        - `CoreMode.Global4`: 2 * (the number of target clusters)
        - `CoreMode.Global8`: 2

        @note This function has no effect on MXQ files with a version earlier than MXQv7.

        @note Currently, an LLM model's activation slot count is fixed to 1, and `count`
        is ignored.

        @param[in] count The number of activation buffer slots. Must be >= 1.
        """
        self._model_config.set_activation_slots(count)

    def get_activation_slots(self) -> int:
        """
        @brief Returns the activation buffer slot count.

        @note This value is meaningless for MXQ files with a version earlier than MXQv7.

        @return Activation buffer slot count.
        """
        return self._model_config.get_activation_slots()
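    # Worked example of the default slot counts described above (hypothetical
    # configuration): in CoreMode.Single with 3 target core ids, the default is
    # 2 * 3 = 6 slots; calling set_activation_slots(4) would override the default.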
    @property
    def early_latencies(self) -> List[int]:
        """@deprecated Unused; kept for backward compatibility."""
        return []

    @property
    def finish_latencies(self) -> List[int]:
        """@deprecated Unused; kept for backward compatibility."""
        return []

    @early_latencies.setter
    def early_latencies(self, latencies: List[int]):
        """@deprecated This setting has no effect."""
        pass

    @finish_latencies.setter
    def finish_latencies(self, latencies: List[int]):
        """@deprecated This setting has no effect."""
        pass
    def get_core_ids(self) -> List[CoreId]:
        """
        @brief Returns the list of NPU CoreIds to be used for model inference.

        This function returns a list of NPU CoreIds that the model will use for
        inference. When `set_single_core_mode(num_cores)` is called and the
        core allocation policy is set to CoreAllocationPolicy.Auto, it will return an
        empty list, since the cores are not determined until the model is launched.

        @return A list of NPU CoreIds.
        """
        return [
            CoreId.from_cpp(core_id) for core_id in self._model_config.get_core_ids()
        ]

    def __repr__(self):
        d = {
            "core_mode": self.get_core_mode(),
            "core_allocation_policy": self.get_core_allocation_policy(),
            "num_cores": self.get_num_cores(),
            "core_ids": self.get_core_ids(),
        }
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )
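# A minimal end-to-end configuration sketch, assuming a `Model` class with
# `launch()`/`infer_async()` methods exists elsewhere in this package (it is not
# defined in this module):
#
#   config = ModelConfig()
#   config.set_single_core_mode(num_cores=2)
#   config.set_async_pipeline_enabled(True)  # required for Model.infer_async()
#   config.force_single_npu_bundle(0)        # pin bundle 0, no CPU offloading
#   assert config.get_core_mode() == CoreMode.Single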
class LogLevel(Enum):
    """@brief LogLevel."""

    DEBUG = _cQbRuntime.LogLevel.DEBUG
    INFO = _cQbRuntime.LogLevel.INFO
    WARN = _cQbRuntime.LogLevel.WARN
    ERR = _cQbRuntime.LogLevel.ERR
    FATAL = _cQbRuntime.LogLevel.FATAL
    OFF = _cQbRuntime.LogLevel.OFF


def set_log_level(level: LogLevel):
    _cQbRuntime.set_log_level(level.value)
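# Illustrative sketch: raise verbosity while debugging, silence in production:
#
#   set_log_level(LogLevel.DEBUG)
#   set_log_level(LogLevel.OFF)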
class CacheType(Enum):
    """@brief CacheType."""

    Default = _cQbRuntime.CacheType.Default
    Batch = _cQbRuntime.CacheType.Batch
    Error = _cQbRuntime.CacheType.Error
class CacheInfo:
    """@brief Struct representing KV-cache information."""

    def __init__(
        self,
        cache_type: CacheType = CacheType.Error,
        name: str = "",
        layer_hash: str = "",
        size: int = 0,
        num_batches: int = 0,
    ):
        self._cache_info = _cQbRuntime.CacheInfo()
        self._cache_info.cache_type = cache_type.value
        self._cache_info.name = name
        self._cache_info.layer_hash = layer_hash
        self._cache_info.size = size
        self._cache_info.num_batches = num_batches

    @classmethod
    def from_cpp(cls, _cache_info: _cQbRuntime.CacheInfo):
        return cls(
            CacheType(_cache_info.cache_type),
            _cache_info.name,
            _cache_info.layer_hash,
            _cache_info.size,
            _cache_info.num_batches,
        )

    @property
    def cache_type(self) -> CacheType:
        return CacheType(self._cache_info.cache_type)

    @property
    def name(self) -> str:
        return self._cache_info.name

    @property
    def layer_hash(self) -> str:
        return self._cache_info.layer_hash

    @property
    def size(self) -> int:
        return self._cache_info.size

    @property
    def num_batches(self) -> int:
        return self._cache_info.num_batches

    @cache_type.setter
    def cache_type(self, value: CacheType):
        self._cache_info.cache_type = value.value

    @name.setter
    def name(self, value: str):
        self._cache_info.name = value

    @layer_hash.setter
    def layer_hash(self, value: str):
        self._cache_info.layer_hash = value

    @size.setter
    def size(self, value: int):
        self._cache_info.size = value

    @num_batches.setter
    def num_batches(self, value: int):
        self._cache_info.num_batches = value
def start_tracing_events(path: str) -> bool:
    """
    @brief Starts event tracing and prepares to save the trace log to a specified file.

    The trace log is recorded in "Chrome Tracing JSON format," which can be
    viewed at https://ui.perfetto.dev/.

    The trace log is not written immediately; it is saved only when
    stop_tracing_events() is called.

    @param[in] path The file path where the trace log should be stored.
    @return True if tracing starts successfully, False otherwise.
    """
    return _cQbRuntime.start_tracing_events(path)


def stop_tracing_events():
    """
    @brief Stops event tracing and writes the recorded trace log.

    This function finalizes tracing and saves the collected trace data
    to the file specified when start_tracing_events() was called.
    """
    _cQbRuntime.stop_tracing_events()
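# Illustrative sketch: bracket a workload with tracing; the JSON written by
# stop_tracing_events() can be opened at https://ui.perfetto.dev/. The path and
# workload are hypothetical.
#
#   if start_tracing_events("trace.json"):
#       run_inference()        # hypothetical workload
#       stop_tracing_events()  # trace file is written at this point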
def get_model_summary(mxq_path: str) -> str:
    """
    @brief Generates a structured summary of the specified MXQ model.

    Returns an overview of the model contained in the MXQ file, including:
    - Target NPU hardware
    - Supported core modes and their associated cores
    - The total number of model variants
    - Input and output tensor shapes
    - A list of layers with their types, output shapes, and input layer indices

    The summary is returned as a human-readable string in a table and is useful for
    inspecting model compatibility, structure, and input/output shapes.

    @param[in] mxq_path Path to the MXQ model file.
    @return A formatted string containing the model summary.
    """
    return _cQbRuntime.get_model_summary(mxq_path)
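# Illustrative sketch (hypothetical file path):
#
#   print(get_model_summary("model.mxq"))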