from enum import Enum
from typing import List, Optional, Tuple

import numpy as np

import maccel.maccel as _cMaccel

class Cluster(Enum):
    """
    @brief Enumerates clusters in the ARIES NPU.

    @note The ARIES NPU consists of two clusters, each containing one global core and
    four local cores, totaling eight local cores. REGULUS has only a single cluster
    (Cluster0) with one local core (Core0).
    """

    Cluster0 = _cMaccel.Cluster.Cluster0
    Cluster1 = _cMaccel.Cluster.Cluster1
    Error = _cMaccel.Cluster.Error

class Core(Enum):
    """
    @brief Enumerates cores within a cluster in the ARIES NPU.

    @note The ARIES NPU consists of two clusters, each containing one global core and
    four local cores, totaling eight local cores. REGULUS has only a single cluster
    (Cluster0) with one local core (Core0).
    """

    Core0 = _cMaccel.Core.Core0
    Core1 = _cMaccel.Core.Core1
    Core2 = _cMaccel.Core.Core2
    Core3 = _cMaccel.Core.Core3
    All = _cMaccel.Core.All
    GlobalCore = _cMaccel.Core.GlobalCore
    Error = _cMaccel.Core.Error

50 """@brief Core allocation policy"""
52 Auto = _cMaccel.CoreAllocationPolicy.Auto
53 Manual = _cMaccel.CoreAllocationPolicy.Manual
57 """@deprecated This enum is deprecated."""
59 Auto = _cMaccel.LatencySetPolicy.Auto
60 Manual = _cMaccel.LatencySetPolicy.Manual
64 """@deprecated This enum is deprecated."""
66 Maintain = _cMaccel.MaintenancePolicy.Maintain
67 DropExpired = _cMaccel.MaintenancePolicy.DropExpired
68 Undefined = _cMaccel.MaintenancePolicy.Undefined
72 """@deprecated This enum is deprecated."""
74 FIFO = _cMaccel.SchedulePolicy.FIFO
75 LIFO = _cMaccel.SchedulePolicy.LIFO
76 ByPriority = _cMaccel.SchedulePolicy.ByPriority
77 Undefined = _cMaccel.SchedulePolicy.Undefined
81 """@brief Struct for scale values."""
83 def __init__(self, scale: float, is_uniform: bool, scale_list: List[float]):
84 self.
_scale = _cMaccel.Scale()
86 self.
_scale.is_uniform = is_uniform
87 self.
_scale.scale_list = scale_list
90 def from_cpp(cls, _scale: _cMaccel.Scale):
91 return cls(_scale.scale, _scale.is_uniform, _scale.scale_list)
    @property
    def scale_list(self) -> List[float]:
        return self._scale.scale_list

    @property
    def scale(self) -> float:
        return self._scale.scale

    @property
    def is_uniform(self) -> bool:
        return self._scale.is_uniform

    @scale_list.setter
    def scale_list(self, value: List[float]):
        self._scale.scale_list = value

    @scale.setter
    def scale(self, value: float):
        self._scale.scale = value

    @is_uniform.setter
    def is_uniform(self, value: bool):
        self._scale.is_uniform = value

    def __getitem__(self, i: int) -> float:
        """
        @brief Returns the scale value at the specified index.
        """
        return self._scale.scale_list[i]

    def __repr__(self):
        d = {
            "scale": self.scale,
            "is_uniform": self.is_uniform,
            "scale_list": self.scale_list,
        }
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )

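
# Illustrative sketch (not part of the API): building and indexing a
# per-channel Scale. The numeric values are made up.
def _example_scale_usage():
    s = Scale(scale=0.5, is_uniform=False, scale_list=[0.5, 0.25, 0.125])
    per_channel = s[1]   # __getitem__ reads from scale_list
    s.is_uniform = True  # every field is exposed as a read/write property
    return per_channel
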
class CoreId:
    """
    @brief Represents a unique identifier for an NPU core.

    A CoreId consists of a Cluster and a Core, identifying a specific core
    within the NPU.
    """

    def __init__(self, cluster: Cluster, core: Core):
        self._core_id = _cMaccel.CoreId()
        self._core_id.cluster = cluster.value
        self._core_id.core = core.value

    @classmethod
    def from_cpp(cls, _core_id: _cMaccel.CoreId):
        return cls(Cluster(_core_id.cluster), Core(_core_id.core))

    @property
    def cluster(self) -> Cluster:
        return Cluster(self._core_id.cluster)

    @property
    def core(self) -> Core:
        return Core(self._core_id.core)

    @cluster.setter
    def cluster(self, value: Cluster):
        self._core_id.cluster = value.value

    @core.setter
    def core(self, value: Core):
        self._core_id.core = value.value

    def __eq__(self, other) -> bool:
        """
        @brief Checks if two CoreId objects are equal.

        @return True if both CoreId objects are identical, False otherwise.
        """
        return self._core_id == other._core_id

    def __lt__(self, other) -> bool:
        """
        @brief Compares two CoreId objects for ordering.

        @return True if this CoreId is less than the given CoreId, False otherwise.
        """
        return self._core_id < other._core_id

    def __repr__(self):
        d = {"cluster": self.cluster, "core": self.core}
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )

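
# Illustrative sketch (not part of the API): constructing and ordering CoreIds.
# On ARIES, (Cluster0, Cluster1) x (Core0..Core3) addresses the eight local cores.
def _example_core_id_usage():
    a = CoreId(Cluster.Cluster0, Core.Core0)
    b = CoreId(Cluster.Cluster1, Core.Core2)
    return sorted([b, a])  # __lt__/__eq__ delegate to the underlying C++ CoreId
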
class Buffer:
    """
    @brief A simple byte-sized buffer.

    This struct represents a contiguous block of memory for storing byte-sized data.
    """

    def __init__(self, _buffer: Optional[_cMaccel.Buffer] = None):
        self._buffer = _cMaccel.Buffer() if _buffer is None else _buffer

    @property
    def size(self) -> int:
        return self._buffer.size

    @size.setter
    def size(self, value: int):
        self._buffer.size = value

    def set_buffer(self, arr: np.ndarray):
        self._buffer.set_buffer(np.ascontiguousarray(arr))

    def __repr__(self):
        return f"{self.__class__.__name__}(size={self._buffer.size})"

class CoreMode(Enum):
    """
    @brief Defines the core mode for NPU execution.

    Supported core modes include single-core, multi-core, global4-core, and global8-core.
    For detailed explanations of each mode, refer to the following functions:

    - `ModelConfig.set_single_core_mode()`
    - `ModelConfig.set_multi_core_mode()`
    - `ModelConfig.set_global4_core_mode()`
    - `ModelConfig.set_global8_core_mode()`
    """

    Single = _cMaccel.CoreMode.Single
    Multi = _cMaccel.CoreMode.Multi
    Global = _cMaccel.CoreMode.Global
    Global4 = _cMaccel.CoreMode.Global4
    Global8 = _cMaccel.CoreMode.Global8
    Error = _cMaccel.CoreMode.Error

242 """@brief Struct representing input/output buffer information."""
246 original_height: int = 0,
247 original_width: int = 0,
248 original_channel: int = 0,
249 reshaped_height: int = 0,
250 reshaped_width: int = 0,
251 reshaped_channel: int = 0,
257 max_channel: int = 0,
258 max_cache_size: int = 0,
    @classmethod
    def from_cpp(cls, _buffer_info: _cMaccel.BufferInfo):
        return cls(
            _buffer_info.original_height,
            _buffer_info.original_width,
            _buffer_info.original_channel,
            _buffer_info.reshaped_height,
            _buffer_info.reshaped_width,
            _buffer_info.reshaped_channel,
            _buffer_info.height,
            _buffer_info.width,
            _buffer_info.channel,
            _buffer_info.max_height,
            _buffer_info.max_width,
            _buffer_info.max_channel,
            _buffer_info.max_cache_size,
        )

295 """Height of original input/output"""
300 """Width of original input/output"""
305 """Channel of original input/output"""
310 """Height of reshaped input/output"""
315 """Width of reshaped input/output"""
320 """Channel of reshaped input/output"""
325 """Height of NPU input/output"""
330 """Width of NPU input/output"""
335 """Channel of NPU input/output"""
340 """Maximum height of original input/output if data is sequential."""
345 """Maximum width of original input/output if data is sequential."""
350 """Maximum channel of original input/output if data is sequential."""
355 """Maximum KV-cache size, relevant for LLM models using KV cache."""
    @original_height.setter
    def original_height(self, value: int):
        self._buffer_info.original_height = value

    @original_width.setter
    def original_width(self, value: int):
        self._buffer_info.original_width = value

    @original_channel.setter
    def original_channel(self, value: int):
        self._buffer_info.original_channel = value

    @reshaped_height.setter
    def reshaped_height(self, value: int):
        self._buffer_info.reshaped_height = value

    @reshaped_width.setter
    def reshaped_width(self, value: int):
        self._buffer_info.reshaped_width = value

    @reshaped_channel.setter
    def reshaped_channel(self, value: int):
        self._buffer_info.reshaped_channel = value

    @height.setter
    def height(self, value: int):
        self._buffer_info.height = value

    @width.setter
    def width(self, value: int):
        self._buffer_info.width = value

    @channel.setter
    def channel(self, value: int):
        self._buffer_info.channel = value

    @max_height.setter
    def max_height(self, value: int):
        self._buffer_info.max_height = value

    @max_width.setter
    def max_width(self, value: int):
        self._buffer_info.max_width = value

    @max_channel.setter
    def max_channel(self, value: int):
        self._buffer_info.max_channel = value

    @max_cache_size.setter
    def max_cache_size(self, value: int):
        self._buffer_info.max_cache_size = value

    def original_size(self) -> int:
        """
        @brief Returns the total size of the original input/output.

        @return The data size.
        """
        return self.original_height * self.original_width * self.original_channel

    def reshaped_size(self) -> int:
        """
        @brief Returns the total size of the reshaped input/output.

        @return The data size.
        """
        return self.reshaped_height * self.reshaped_width * self.reshaped_channel

    def size(self) -> int:
        """
        @brief Returns the total size of the NPU input/output.

        @return The data size.
        """
        return self.height * self.width * self.channel

    def original_shape(self) -> Tuple[int, int, int]:
        return (self.original_height, self.original_width, self.original_channel)

    def original_shape_chw(self) -> Tuple[int, int, int]:
        return (self.original_channel, self.original_height, self.original_width)

    def reshaped_shape(self) -> Tuple[int, int, int]:
        return (self.reshaped_height, self.reshaped_width, self.reshaped_channel)

    def reshaped_shape_chw(self) -> Tuple[int, int, int]:
        return (self.reshaped_channel, self.reshaped_height, self.reshaped_width)

    def shape(self) -> Tuple[int, int, int]:
        return (self.height, self.width, self.channel)

    def shape_chw(self) -> Tuple[int, int, int]:
        return (self.channel, self.height, self.width)

    def __repr__(self):
        d = {
            "original_shape": self.original_shape(),
            "reshaped_shape": self.reshaped_shape(),
            "shape": self.shape(),
            "max_height": self.max_height,
            "max_width": self.max_width,
            "max_channel": self.max_channel,
            "max_cache_size": self.max_cache_size,
        }
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )

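
# Illustrative sketch (not part of the API): reading shapes and sizes from a
# BufferInfo. Real instances come from the runtime; this one is built by hand,
# and the expected values assume the HWC/CHW helpers shown above.
def _example_buffer_info_usage():
    info = BufferInfo(
        original_height=224, original_width=224, original_channel=3,
        height=224, width=224, channel=4,
    )
    assert info.original_shape() == (224, 224, 3)  # HWC order
    assert info.shape_chw() == (4, 224, 224)       # CHW order of the NPU-side layout
    return info.size()  # height * width * channel of the NPU input/output
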
class ModelConfig:
    """
    @brief Configures a core mode and core allocation of a model for NPU inference.

    The `ModelConfig` class provides methods for setting a core mode and allocating
    cores for NPU inference. Supported core modes are single-core, multi-core,
    global4-core, and global8-core. Users can also specify which cores to allocate for
    the model. Additionally, the configuration offers an option to enforce the use of a
    specific NPU bundle.

    @note Deprecated functions are included for backward compatibility, but it is
    recommended to use the newer core mode configuration methods.
    """
    def __init__(self, num_cores: Optional[int] = None):
        """
        @brief Default constructor. This default-constructed object is initially set to
        single-core mode with all NPU local cores included.
        """
        self._model_config = (
            _cMaccel.ModelConfig()
            if num_cores is None
            else _cMaccel.ModelConfig(num_cores)
        )

    def include_all_cores(self) -> bool:
        return self._model_config.include_all_cores()

    def exclude_all_cores(self) -> bool:
        return self._model_config.exclude_all_cores()

    def include(self, cluster: Cluster, core: Optional[Core] = None) -> bool:
        if core is None:
            return self._model_config.include(cluster.value)
        return self._model_config.include(cluster.value, core.value)

    def exclude(self, cluster: Cluster, core: Optional[Core] = None) -> bool:
        if core is None:
            return self._model_config.exclude(cluster.value)
        return self._model_config.exclude(cluster.value, core.value)

    def set_single_core_mode(
        self, num_cores: Optional[int] = None, core_ids: Optional[List[CoreId]] = None
    ) -> bool:
        """
        @brief Sets the model to use single-core mode for inference with a specified number
        of local cores or a specified list of cores.

        In single-core mode, each local core executes model inference independently.
        The number of cores used is specified by the `num_cores` parameter, and the core
        allocation policy is set to `CoreAllocationPolicy.Auto`, meaning the model will be
        automatically allocated to available local cores when the model is launched to the
        NPU, specifically when the `Model.launch()` function is called. Alternatively, the
        user can specify a list of CoreIds to determine which cores to use for inference.

        @note Use exactly one of `num_cores` or `core_ids`, not both.

        @param[in] num_cores The number of local cores to use for inference.
        @param[in] core_ids A list of CoreIds to be used for model inference.

        @return True if the mode was successfully set, False otherwise.
        """
        if num_cores is not None and core_ids is None:
            return self._model_config.set_single_core_mode(num_cores)
        elif core_ids is not None and num_cores is None:
            return self._model_config.set_single_core_mode(
                [core_id._core_id for core_id in core_ids]
            )
        raise ValueError(
            "`set_single_core_mode` needs either `num_cores` or `core_ids`."
        )

    def set_global4_core_mode(self, clusters: List[Cluster]) -> bool:
        """
        @brief Sets the model to use global4-core mode for inference with a specified set
        of NPU clusters.

        For Aries NPU, there are two clusters, each consisting of four local cores. In
        global4-core mode, four local cores within the same cluster work together to
        execute the model inference.

        @param[in] clusters A list of clusters to be used for model inference.

        @return True if the mode was successfully set, False otherwise.
        """
        return self._model_config.set_global4_core_mode(
            [cluster.value for cluster in clusters]
        )

    def set_global8_core_mode(self) -> bool:
        """
        @brief Sets the model to use global8-core mode for inference.

        For Aries NPU, there are two clusters, each consisting of four local cores. In
        global8-core mode, all eight local cores across the two clusters work together to
        execute the model inference.

        @return True if the mode was successfully set, False otherwise.
        """
        return self._model_config.set_global8_core_mode()

    def get_core_mode(self) -> CoreMode:
        """
        @brief Gets the core mode to be applied to the model.

        This reflects the core mode that will be used when the model is created.

        @return The `CoreMode` to be applied to the model.
        """
        return CoreMode(self._model_config.get_core_mode())

    def set_multi_core_mode(self, clusters: List[Cluster]) -> bool:
        """
        @brief Sets the model to use multi-core mode for batch inference.

        In multi-core mode, on Aries NPU, the four local cores within a cluster work
        together to process batch inference tasks efficiently. This mode is optimized for
        batch processing workloads.

        @param[in] clusters A list of clusters to be used for multi-core batch inference.

        @return True if the mode was successfully set, False otherwise.
        """
        return self._model_config.set_multi_core_mode(
            [cluster.value for cluster in clusters]
        )

    def get_core_allocation_policy(self) -> CoreAllocationPolicy:
        """
        @brief Gets the core allocation policy to be applied to the model.

        This reflects the core allocation policy that will be used when the model is
        launched to the NPU.

        @return The `CoreAllocationPolicy` to be applied to the model.
        """
        return CoreAllocationPolicy(self._model_config.get_core_allocation_policy())

    def get_num_cores(self) -> int:
        """
        @brief Gets the number of cores to be allocated for the model.

        This represents the number of cores that will be allocated for inference
        when the model is launched to the NPU.

        @return The number of cores to be allocated for the model.
        """
        return self._model_config.get_num_cores()

    def force_single_npu_bundle(self, npu_bundle_index: int) -> bool:
        """
        @brief Forces the use of a specific NPU bundle.

        This function forces the selection of a specific NPU bundle. If a non-negative
        index is provided, the corresponding NPU bundle is selected and runs without CPU
        offloading. If -1 is provided, all NPU bundles are used with CPU offloading
        enabled.

        @param[in] npu_bundle_index The index of the NPU bundle to force. A non-negative
                                    integer selects a specific NPU bundle (runs without CPU
                                    offloading), or -1 to enable all NPU bundles with CPU
                                    offloading.

        @return True if the index is valid and the NPU bundle is successfully set,
                False if the index is invalid (less than -1).
        """
        return self._model_config.force_single_npu_bundle(npu_bundle_index)

    def get_forced_npu_bundle_index(self) -> int:
        """
        @brief Retrieves the index of the forced NPU bundle.

        This function returns the index of the NPU bundle that has been forced using the
        `force_single_npu_bundle` function. If no NPU bundle is forced, the returned value
        is -1.

        @return The index of the forced NPU bundle, or -1 if no bundle is forced.
        """
        return self._model_config.get_forced_npu_bundle_index()

    def set_async_pipeline_enabled(self, enable: bool):
        """
        @brief Enables or disables the asynchronous pipeline required for asynchronous
        inference.

        Call this function with `enable` set to `True` if you intend to use
        `Model.infer_async()`, as the asynchronous pipeline is necessary for its operation.

        If you are only using synchronous inference, such as `Model.infer()` or
        `Model.infer_to_float()`, it is recommended to keep the asynchronous pipeline
        disabled to avoid unnecessary overhead.

        @param[in] enable Set to `True` to enable the asynchronous pipeline; set to `False`
                          to disable it.
        """
        self._model_config.set_async_pipeline_enabled(enable)

    def get_async_pipeline_enabled(self) -> bool:
        """
        @brief Returns whether the asynchronous pipeline is enabled in this configuration.

        @return `True` if the asynchronous pipeline is enabled; `False` otherwise.
        """
        return self._model_config.get_async_pipeline_enabled()

    @property
    def schedule_policy(self) -> SchedulePolicy:
        """@deprecated This property is deprecated."""
        return SchedulePolicy.Undefined

    @property
    def latency_set_policy(self) -> LatencySetPolicy:
        """@deprecated This property is deprecated."""
        return LatencySetPolicy.Auto

    @property
    def maintenance_policy(self) -> MaintenancePolicy:
        """@deprecated This property is deprecated."""
        return MaintenancePolicy.Undefined

    @property
    def early_latencies(self) -> List[int]:
        """@deprecated This property is deprecated."""
        return []

    @property
    def finish_latencies(self) -> List[int]:
        """@deprecated This property is deprecated."""
        return []

    @schedule_policy.setter
    def schedule_policy(self, policy: SchedulePolicy):
        """@deprecated This setting has no effect."""
        pass

    @latency_set_policy.setter
    def latency_set_policy(self, policy: LatencySetPolicy):
        """@deprecated This setting has no effect."""
        pass

    @maintenance_policy.setter
    def maintenance_policy(self, policy: MaintenancePolicy):
        """@deprecated This setting has no effect."""
        pass

    @early_latencies.setter
    def early_latencies(self, latencies: List[int]):
        """@deprecated This setting has no effect."""
        pass

    @finish_latencies.setter
    def finish_latencies(self, latencies: List[int]):
        """@deprecated This setting has no effect."""
        pass

    def get_core_ids(self) -> List[CoreId]:
        """
        @brief Returns the list of NPU CoreIds to be used for model inference.

        This function returns a list of NPU CoreIds that the model will use for
        inference. When `set_single_core_mode(num_cores)` is called and the
        core allocation policy is set to CoreAllocationPolicy.Auto, it will return an
        empty list, because the actual cores are assigned automatically when the model
        is launched.

        @return A list of NPU CoreIds.
        """
        return [CoreId.from_cpp(core_id) for core_id in self._model_config.get_core_ids()]

    def __repr__(self):
        d = {
            "core_mode": self.get_core_mode(),
            "num_cores": self.get_num_cores(),
            "core_ids": self.get_core_ids(),
        }
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join("{}={}".format(k, v) for k, v in d.items()),
        )

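
# Illustrative sketch (not part of the API): common ModelConfig setups. `Model`
# and its `launch()` / `infer_async()` methods referenced in the docstrings are
# assumed to live elsewhere in the package and are not shown here.
def _example_model_config_usage():
    # Single-core mode with automatic allocation of two local cores.
    cfg = ModelConfig()
    cfg.set_single_core_mode(num_cores=2)

    # Alternatively, pin inference to explicit cores (pass exactly one of
    # `num_cores` or `core_ids`, never both).
    pinned = ModelConfig()
    pinned.set_single_core_mode(core_ids=[
        CoreId(Cluster.Cluster0, Core.Core0),
        CoreId(Cluster.Cluster0, Core.Core1),
    ])

    # Opt into the async pipeline only when asynchronous inference will be used.
    cfg.set_async_pipeline_enabled(True)
    return cfg.get_core_mode(), cfg.get_num_cores()
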
764 """@brief LogLevel"""
766 DEBUG = _cMaccel.LogLevel.DEBUG
767 INFO = _cMaccel.LogLevel.INFO
768 WARN = _cMaccel.LogLevel.WARN
769 ERR = _cMaccel.LogLevel.ERR
770 FATAL = _cMaccel.LogLevel.FATAL
771 OFF = _cMaccel.LogLevel.OFF
def set_log_level(level: LogLevel):
    _cMaccel.set_log_level(level.value)

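
# Illustrative sketch (not part of the API): raising the log threshold so only
# warnings and errors are emitted.
def _example_logging_setup():
    set_log_level(LogLevel.WARN)  # suppresses DEBUG and INFO output
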
779 """@brief CacheType"""
781 Default = _cMaccel.CacheType.Default
782 Batch = _cMaccel.CacheType.Batch
783 Error = _cMaccel.CacheType.Error
787 """@brief Struct representing KV-cache information."""
791 cache_type: CacheType = CacheType.Error,
793 layer_hash: str =
"",
795 num_batches: int = 0,
805 def from_cpp(cls, _cache_info: _cMaccel.CacheInfo):
809 _cache_info.layer_hash,
811 _cache_info.num_batches,
    @property
    def cache_type(self) -> CacheType:
        return CacheType(self._cache_info.cache_type)

    @property
    def name(self) -> str:
        return self._cache_info.name

    @property
    def layer_hash(self) -> str:
        return self._cache_info.layer_hash

    @property
    def size(self) -> int:
        return self._cache_info.size

    @property
    def num_batches(self) -> int:
        return self._cache_info.num_batches

    @cache_type.setter
    def cache_type(self, value: CacheType):
        self._cache_info.cache_type = value.value

    @name.setter
    def name(self, value: str):
        self._cache_info.name = value

    @layer_hash.setter
    def layer_hash(self, value: str):
        self._cache_info.layer_hash = value

    @size.setter
    def size(self, value: int):
        self._cache_info.size = value

    @num_batches.setter
    def num_batches(self, value: int):
        self._cache_info.num_batches = value

def start_tracing_events(path: str) -> bool:
    """
    @brief Starts event tracing and prepares to save the trace log to a specified file.

    The trace log is recorded in "Chrome Tracing JSON format," which can be
    viewed at https://ui.perfetto.dev/.

    The trace log is not written immediately; it is saved only when
    stop_tracing_events() is called.

    @param[in] path The file path where the trace log should be stored.
    @return True if tracing starts successfully, False otherwise.
    """
    return _cMaccel.start_tracing_events(path)

def stop_tracing_events():
    """
    @brief Stops event tracing and writes the recorded trace log.

    This function finalizes tracing and saves the collected trace data
    to the file specified when start_tracing_events() was called.
    """
    _cMaccel.stop_tracing_events()

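
# Illustrative sketch (not part of the API): bracketing a workload with event
# tracing. "trace.json" is an arbitrary example path; the log becomes viewable
# at https://ui.perfetto.dev/ once stop_tracing_events() has written it.
def _example_tracing(run_workload):
    if start_tracing_events("trace.json"):
        try:
            run_workload()
        finally:
            stop_tracing_events()  # the trace file is only written at this point
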
def get_model_summary(mxq_path: str) -> str:
    """
    @brief Generates a structured summary of the specified MXQ model.

    Returns an overview of the model contained in the MXQ file, including:
    - Target NPU hardware
    - Supported core modes and their associated cores
    - The total number of model variants
    - Input and output tensor shapes
    - A list of layers with their types, output shapes, and input layer indices

    The summary is returned as a human-readable table string and is useful for
    inspecting model compatibility, structure, and input/output shapes.

    @param[in] mxq_path Path to the MXQ model file.
    @return A formatted string containing the model summary.
    """
    return _cMaccel.get_model_summary(mxq_path)

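
# Illustrative sketch (not part of the API): inspecting an MXQ file before
# deployment. "model.mxq" is an arbitrary example path.
def _example_model_summary():
    summary = get_model_summary("model.mxq")
    print(summary)  # human-readable table: hardware, core modes, shapes, layers
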