5from typing
import List, Optional, Tuple
10import qbruntime.qbruntime
as _cQbRuntime
19 @brief Enumerates clusters in the ARIES NPU.
21 @note The ARIES NPU consists of two clusters, each containing one global core and
22 four local cores, totaling eight local cores. REGULUS has only a single cluster
23 (Cluster0) with one local core (Core0).
26 Cluster0 = _cQbRuntime.Cluster.Cluster0
27 Cluster1 = _cQbRuntime.Cluster.Cluster1
28 Error = _cQbRuntime.Cluster.Error
33 @brief Enumerates cores within a cluster in the ARIES NPU.
35 @note The ARIES NPU consists of two clusters, each containing one global core and
36 four local cores, totaling eight local cores. REGULUS has only a single cluster
37 (Cluster0) with one local core (Core0).
40 Core0 = _cQbRuntime.Core.Core0
41 Core1 = _cQbRuntime.Core.Core1
42 Core2 = _cQbRuntime.Core.Core2
43 Core3 = _cQbRuntime.Core.Core3
44 All = _cQbRuntime.Core.All
45 GlobalCore = _cQbRuntime.Core.GlobalCore
46 Error = _cQbRuntime.Core.Error
50 """@brief Core allocation policy"""
52 Auto = _cQbRuntime.CoreAllocationPolicy.Auto
53 Manual = _cQbRuntime.CoreAllocationPolicy.Manual
57 """@brief Struct for scale values."""
59 def __init__(self, scale: float, is_uniform: bool, scale_list: List[float]):
60 self.
_scale = _cQbRuntime.Scale()
62 self.
_scale.is_uniform = is_uniform
63 self.
_scale.scale_list = scale_list
66 def from_cpp(cls, _scale: _cQbRuntime.Scale):
67 return cls(_scale.scale, _scale.is_uniform, _scale.scale_list)
70 def scale_list(self) -> List[float]:
71 return self.
_scale.scale_list
74 def scale(self) -> float:
78 def is_uniform(self) -> bool:
79 return self.
_scale.is_uniform
82 def scale_list(self, value: List[float]):
83 self.
_scale.scale_list = value
86 def scale(self, value: float):
90 def is_uniform(self, value: bool):
91 self.
_scale.is_uniform = value
95 @brief Returns the scale value at the specified index.
108 return "{}({})".format(
109 self.__class__.__name__,
110 ", ".join(
"{}={}".format(k, v)
for k, v
in d.items()),
116 @brief Represents a unique identifier for an NPU core.
118 A CoreId consists of a Cluster and a Core, identifying a specific core
122 def __init__(self, cluster: Cluster, core: Core):
123 self.
_core_id = _cQbRuntime.CoreId()
124 self.
_core_id.cluster = cluster.value
128 def from_cpp(cls, _core_id: _cQbRuntime.CoreId):
129 return cls(
Cluster(_core_id.cluster),
Core(_core_id.core))
132 def cluster(self) -> Cluster:
136 def core(self) -> Core:
140 def cluster(self, value: Cluster):
144 def core(self, value: Core):
149 @brief Checks if two CoreId objects are equal.
151 @return True if both CoreId objects are identical, False otherwise.
153 return self.
_core_id == other._core_id
157 @brief Compares two CoreId objects for ordering.
159 @return True if this CoreId is less than the given CoreId, False otherwise.
161 return self.
_core_id < other._core_id
165 return "{}({})".format(
166 self.__class__.__name__,
167 ", ".join(
"{}={}".format(k, v)
for k, v
in d.items()),
173 @brief A simple byte-sized buffer.
175 This struct represents a contiguous block of memory for storing byte-sized data.
178 def __init__(self, _buffer: Optional[_cQbRuntime.Buffer] =
None):
179 self.
_buffer = _cQbRuntime.Buffer()
if _buffer
is None else _buffer
182 def size(self) -> int:
186 def size(self, value: int):
189 def set_buffer(self, arr: np.ndarray):
190 self.
_buffer.set_buffer(np.ascontiguousarray(arr))
193 return f
"{self.__class__.__name__}(size={self._buffer.size})"
198 @brief Defines the core mode for NPU execution.
200 Supported core modes include single-core, multi-core, global4-core, and global8-core.
201 For detailed explanations of each mode, refer to the following functions:
203 - `ModelConfig.set_auto_core_mode()`
204 - `ModelConfig.set_single_core_mode()`
205 - `ModelConfig.set_multi_core_mode()`
206 - `ModelConfig.set_global4_core_mode()`
207 - `ModelConfig.set_global8_core_mode()`
210 Single = _cQbRuntime.CoreMode.Single
211 Multi = _cQbRuntime.CoreMode.Multi
212 Global = _cQbRuntime.CoreMode.Global
213 Global4 = _cQbRuntime.CoreMode.Global4
214 Global8 = _cQbRuntime.CoreMode.Global8
215 Auto = _cQbRuntime.CoreMode.Auto
216 Error = _cQbRuntime.CoreMode.Error
220 """@brief Struct representing input/output buffer information."""
224 original_height: int = 0,
225 original_width: int = 0,
226 original_channel: int = 0,
227 reshaped_height: int = 0,
228 reshaped_width: int = 0,
229 reshaped_channel: int = 0,
235 max_channel: int = 0,
236 max_cache_size: int = 0,
254 def from_cpp(cls, _buffer_info: _cQbRuntime.BufferInfo):
256 _buffer_info.original_height,
257 _buffer_info.original_width,
258 _buffer_info.original_channel,
259 _buffer_info.reshaped_height,
260 _buffer_info.reshaped_width,
261 _buffer_info.reshaped_channel,
264 _buffer_info.channel,
265 _buffer_info.max_height,
266 _buffer_info.max_width,
267 _buffer_info.max_channel,
268 _buffer_info.max_cache_size,
273 """Height of original input/output"""
278 """Width of original input/output"""
283 """Channel of original input/output"""
288 """Height of reshaped input/output"""
293 """Width of reshaped input/output"""
298 """Channel of reshaped input/output"""
303 """Height of NPU input/output"""
308 """Width of NPU input/output"""
313 """Channel of NPU input/output"""
318 """Maximum height of original input/output if data is sequential."""
323 """Maximum width of original input/output if data is sequential."""
328 """Maximum channel of original input/output if data is sequential."""
333 """Maximum KV-cache size, relevant for LLM models using KV cache."""
336 @original_height.setter
340 @original_width.setter
344 @original_channel.setter
348 @reshaped_height.setter
352 @reshaped_width.setter
356 @reshaped_channel.setter
361 def height(self, value: int):
365 def width(self, value: int):
384 @max_cache_size.setter
390 @brief Returns the total size of the original input/output.
392 @return The data size.
398 @brief Returns the total size of the reshaped input/output.
400 @return The data size.
406 @brief Returns the total size of the NPU input/output.
408 @return The data size.
412 def original_shape(self) -> Tuple[int, int, int]:
415 def original_shape_chw(self) -> Tuple[int, int, int]:
418 def reshaped_shape(self) -> Tuple[int, int, int]:
421 def reshaped_shape_chw(self) -> Tuple[int, int, int]:
424 def shape(self) -> Tuple[int, int, int]:
427 def shape_chw(self) -> Tuple[int, int, int]:
446 return "{}({})".format(
447 self.__class__.__name__,
448 ", ".join(
"{}={}".format(k, v)
for k, v
in d.items()),
454 @brief Configures a core mode and core allocation of a model for NPU inference.
455 The `ModelConfig` class provides methods for setting a core mode and allocating
456 cores for NPU inference. Supported core modes are single-core, multi-core,
457 global4-core, and global8-core. Users can also specify which cores to allocate for
458 the model. Additionally, the configuration offers an option to enforce the use of a
461 @note Deprecated functions are included for backward compatibility, but it is
462 recommended to use the newer core mode configuration methods.
465 def __init__(self, num_cores: Optional[int] =
None):
467 @brief Default constructor. This default-constructed object is initially set to
471 _cQbRuntime.ModelConfig()
473 else _cQbRuntime.ModelConfig(num_cores)
478 @brief Sets the model to detect CoreMode automatically.
480 In auto-core mode, the model automatically detects a supported CoreMode
481 while using all available NPU cores.
483 @note If the model has more than one CoreMode, `CoreMode.Auto` is not supported.
485 @note activation buffer slots will be reset after `set_auto_core_mode` is called.
487 @return True if the mode was successfully set, False otherwise.
492 self, num_cores: Optional[int] =
None, core_ids: Optional[List[CoreId]] =
None
495 @brief Sets the model to use single-core mode for inference with a specified number
498 In single-core mode, each local core executes model inference independently.
499 The number of cores used is specified by the `num_cores` parameter, and the core
500 allocation policy is set to `CoreAllocationPolicy.Auto`, meaning the model will be
501 automatically allocated to available local cores when the model is launched to the
502 NPU, specifically when the `Model.launch()` function is called. Alternatively, the user can
503 specify a list of CoreIds to determine which cores to use for inference.
505 @note Use exactly one of `num_cores` or `core_ids`, not both.
507 @param[in] num_cores The number of local cores to use for inference.
508 @param[in] core_ids A list of CoreIds to be used for model inference.
510 @return True if the mode was successfully set, False otherwise.
512 if num_cores
is not None and core_ids
is None:
514 elif core_ids
is not None and num_cores
is None:
516 [core_id._core_id
for core_id
in core_ids]
519 "`set_single_core_mode` needs either `num_cores` and `core_ids`."
527 self, clusters: List[Cluster] = [Cluster.Cluster0, Cluster.Cluster1]
530 @brief Sets the model to use global4-core mode for inference with a specified set
533 For Aries NPU, there are two clusters, each consisting of four local cores. In
534 global4-core mode, four local cores within the same cluster work together to
535 execute the model inference.
537 @param[in] clusters A list of clusters to be used for model inference.
539 @return True if the mode was successfully set, False otherwise.
545 @brief Sets the model to use global8-core mode for inference.
547 For Aries NPU, there are two clusters, each consisting of four local cores. In
548 global8-core mode, all eight local cores across the two clusters work together to
549 execute the model inference.
551 @return True if the mode was successfully set, False otherwise.
557 @brief Gets the core mode to be applied to the model.
559 This reflects the core mode that will be used when the model is created.
561 @return The `CoreMode` to be applied to the model.
566 self, clusters: List[Cluster] = [Cluster.Cluster0, Cluster.Cluster1]
569 @brief Sets the model to use multi-core mode for batch inference.
571 In multi-core mode, on Aries NPU, the four local cores within a cluster work
572 together to process batch inference tasks efficiently. This mode is optimized for
575 @param[in] clusters A list of clusters to be used for multi-core batch inference.
577 @return True if the mode was successfully set, False otherwise.
583 @brief Gets the core allocation policy to be applied to the model.
585 This reflects the core allocation policy that will be used when the model is
588 @return The `CoreAllocationPolicy` to be applied to the model.
594 @brief Gets the number of cores to be allocated for the model.
596 This represents the number of cores that will be allocated for inference
597 when the model is launched to the NPU.
599 @return The number of cores to be allocated for the model.
605 @brief Forces the use of a specific NPU bundle.
607 This function forces the selection of a specific NPU bundle. If a non-negative
608 index is provided, the corresponding NPU bundle is selected and runs without CPU
609 offloading. If -1 is provided, all NPU bundles are used with CPU offloading
612 @param[in] npu_bundle_index The index of the NPU bundle to force. A non-negative
613 integer selects a specific NPU bundle (runs without CPU
614 offloading), or -1 to enable all NPU bundles with CPU
617 @return True if the index is valid and the NPU bundle is successfully set,
618 False if the index is invalid (less than -1).
624 @brief Retrieves the index of the forced NPU bundle.
626 This function returns the index of the NPU bundle that has been forced using the
627 `force_single_npu_bundle` function. If no NPU bundle is forced, the returned value
630 @return The index of the forced NPU bundle, or -1 if no bundle is forced.
636 @brief Enables or disables the asynchronous pipeline required for asynchronous
639 Call this function with `enable` set to `True` if you intend to use
640 `Model.infer_async()`, as the asynchronous pipeline is necessary for their operation.
642 If you are only using synchronous inference, such as `Model.infer()` or
643 `Model.infer_to_float()`, it is recommended to keep the asynchronous pipeline disabled
644 to avoid unnecessary overhead.
646 @param[in] enable Set to `True` to enable the asynchronous pipeline; set to `False`
653 @brief Returns whether the asynchronous pipeline is enabled in this configuration.
655 @return `True` if the asynchronous pipeline is enabled; `False` otherwise.
661 @brief Sets activation buffer slots for multi-activation supported model.
663 Call this function if you want to set the number of activation buffer slots manually.
665 If you do not call this function, the default number of activation buffer slots
666 is set differently depending on the CoreMode.
668 - `CoreMode.Single` : 2 * (the number of target core ids)
669 - `CoreMode.Multi` : 2 * (the number of target clusters)
670 - `CoreMode.Global4` : 2 * (the number of target clusters)
671 - `CoreMode.Global8` : 2
673 @note This function has no effect on MXQ file in version earlier than MXQv7.
675 @note Currently, an LLM model's activation slot count is fixed to 1 and `count` is ignored.
677 @param[in] count Multi activation counts. Must be >= 1.
683 @brief Returns activation buffer slot count.
685 @note This function has no meaning on MXQ file in version earlier than MXQv7.
687 @return Activation buffer slot count.
692 def early_latencies(self) -> List[int]:
696 def finish_latencies(self) -> List[int]:
699 @early_latencies.setter
700 def early_latencies(self, latencies: List[int]):
701 """@deprecated This setting has no effect."""
704 @finish_latencies.setter
705 def finish_latencies(self, latencies: List[int]):
706 """@deprecated This setting has no effect."""
711 @brief Returns the list of NPU CoreIds to be used for model inference.
713 This function returns a list of NPU CoreIds that the model will use for
714 inference. When `set_single_core_mode(num_cores)` is called and the
715 core allocation policy is set to CoreAllocationPolicy.Auto, it will return an
718 @return A list of NPU CoreIds.
733 return "{}({})".format(
734 self.__class__.__name__,
735 ", ".join(
"{}={}".format(k, v)
for k, v
in d.items()),
740 """@brief LogLevel"""
742 DEBUG = _cQbRuntime.LogLevel.DEBUG
743 INFO = _cQbRuntime.LogLevel.INFO
744 WARN = _cQbRuntime.LogLevel.WARN
745 ERR = _cQbRuntime.LogLevel.ERR
746 FATAL = _cQbRuntime.LogLevel.FATAL
747 OFF = _cQbRuntime.LogLevel.OFF
def set_log_level(level: LogLevel):
    """@brief Sets the runtime log level.

    @param[in] level The `LogLevel` to apply to the underlying runtime.
    """
    raw_level = level.value
    _cQbRuntime.set_log_level(raw_level)
755 """@brief CacheType"""
757 Default = _cQbRuntime.CacheType.Default
758 Batch = _cQbRuntime.CacheType.Batch
759 Error = _cQbRuntime.CacheType.Error
763 """@brief Struct representing KV-cache information."""
767 cache_type: CacheType = CacheType.Error,
769 layer_hash: str =
"",
771 num_batches: int = 0,
781 def from_cpp(cls, _cache_info: _cQbRuntime.CacheInfo):
785 _cache_info.layer_hash,
787 _cache_info.num_batches,
791 def cache_type(self) -> CacheType:
795 def name(self) -> str:
799 def layer_hash(self) -> str:
803 def size(self) -> int:
807 def num_batches(self) -> int:
811 def cache_type(self, value: CacheType):
815 def name(self, value: str):
819 def layer_hash(self, value: str):
823 def size(self, value: int):
827 def num_batches(self, value: int):
832 """@brief DataType"""
834 Float32 = _cQbRuntime.DataType.Float32
835 Float16 = _cQbRuntime.DataType.Float16
836 Int8 = _cQbRuntime.DataType.Int8
837 Uint8 = _cQbRuntime.DataType.Uint8
838 Error = _cQbRuntime.DataType.Error
843 @brief Starts event tracing and prepares to save the trace log to a specified file.
845 The trace log is recorded in "Chrome Tracing JSON format," which can be
846 viewed at https://ui.perfetto.dev/.
848 The trace log is not written immediately; it is saved only when
849 stop_tracing_events() is called.
851 @param[in] path The file path where the trace log should be stored.
852 @return True if tracing starts successfully, False otherwise.
854 return _cQbRuntime.start_tracing_events(path)
859 @brief Stops event tracing and writes the recorded trace log.
861 This function finalizes tracing and saves the collected trace data
862 to the file specified when start_tracing_events() was called.
864 _cQbRuntime.stop_tracing_events()
869 @brief Generates a structured summary of the specified MXQ model.
871 Returns an overview of the model contained in the MXQ file, including:
872 - Target NPU hardware
873 - Supported core modes and their associated cores
874 - The total number of model variants
876 - Input and output tensor shapes
877 - A list of layers with their types, output shapes, and input layer indices
879 The summary is returned as a human-readable string in a table and is useful for
880 inspecting model compatibility, structure, and input/output shapes.
882 @param[in] mxq_path Path to the MXQ model file.
883 @return A formatted string containing the model summary.
885 return _cQbRuntime.get_model_summary(mxq_path)
890 @brief Get the number of available NPU devices.
892 @return The number of available NPU devices.
894 return _cQbRuntime.get_available_device_numbers()
Struct representing input/output buffer information.
int reshaped_width(self)
Width of reshaped input/output.
int max_channel(self)
Maximum channel of original input/output if data is sequential.
int original_height(self)
Height of original input/output.
int width(self)
Width of NPU input/output.
int max_cache_size(self)
Maximum KV-cache size, relevant for LLM models using KV cache.
int max_height(self)
Maximum height of original input/output if data is sequential.
int original_size(self)
Returns the total size of the original input/output.
int height(self)
Height of NPU input/output.
int original_width(self)
Width of original input/output.
int channel(self)
Channel of NPU input/output.
int size(self)
Returns the total size of the NPU input/output.
int max_width(self)
Maximum width of original input/output if data is sequential.
int reshaped_channel(self)
Channel of reshaped input/output.
int original_channel(self)
Channel of original input/output.
int reshaped_size(self)
Returns the total size of the reshaped input/output.
int reshaped_height(self)
Height of reshaped input/output.
A simple byte-sized buffer.
Struct representing KV-cache information.
Enumerates clusters in the ARIES NPU.
Represents a unique identifier for an NPU core.
bool __eq__(self, other)
Checks if two CoreId objects are equal.
bool __lt__(self, other)
Compares two CoreId objects for ordering.
Defines the core mode for NPU execution.
Enumerates cores within a cluster in the ARIES NPU.
Configures a core mode and core allocation of a model for NPU inference.
List[CoreId] get_core_ids(self)
Returns the list of NPU CoreIds to be used for model inference.
int get_forced_npu_bundle_index(self)
Retrieves the index of the forced NPU bundle.
bool set_global8_core_mode(self)
Sets the model to use global8-core mode for inference.
bool set_multi_core_mode(self, List[Cluster] clusters=[Cluster.Cluster0, Cluster.Cluster1])
Sets the model to use multi-core mode for batch inference.
bool set_global_core_mode(self, List[Cluster] clusters)
bool force_single_npu_bundle(self, int npu_bundle_index)
Forces the use of a specific NPU bundle.
__init__(self, Optional[int] num_cores=None)
Default constructor.
CoreMode get_core_mode(self)
Gets the core mode to be applied to the model.
None set_async_pipeline_enabled(self, bool enable)
Enables or disables the asynchronous pipeline required for asynchronous inference.
bool get_async_pipeline_enabled(self)
Returns whether the asynchronous pipeline is enabled in this configuration.
int get_activation_slots(self)
Returns activation buffer slot count.
bool set_auto_core_mode(self)
Sets the model to detect CoreMode automatically.
None set_activation_slots(self, int num)
Sets activation buffer slots for multi-activation supported model.
int get_num_cores(self)
Gets the number of cores to be allocated for the model.
bool set_global4_core_mode(self, List[Cluster] clusters=[Cluster.Cluster0, Cluster.Cluster1])
Sets the model to use global4-core mode for inference with a specified set of NPU clusters.
CoreAllocationPolicy get_core_allocation_policy(self)
Gets the core allocation policy to be applied to the model.
bool set_single_core_mode(self, Optional[int] num_cores=None, Optional[List[CoreId]] core_ids=None)
Sets the model to use single-core mode for inference with a specified number of local cores.
float __getitem__(self, int i)
Returns the scale value at the specified index.
List[float] scale_list(self)
List[int] get_available_device_numbers()
Get the number of available NPU devices.
str get_model_summary(str mxq_path)
Generates a structured summary of the specified MXQ model.
bool start_tracing_events(str path)
Starts event tracing and prepares to save the trace log to a specified file.
stop_tracing_events()
Stops event tracing and writes the recorded trace log.