model.py Source File

Runtime Library v0.30 (Mobilint SDK qb)
from typing import List, Optional, Tuple, Union

import numpy as np

import maccel.maccel as _cMaccel
from .accelerator import Accelerator
from .future import *
from .model_variant_handle import *
from .type import *

_Shape = Tuple[int, ...]

__all__ = ["Model", "load"]


# Check whether the shape of an input ndarray is valid against the given model shape.
def _is_valid_shape(input_shape: _Shape, shape: _Shape) -> bool:
    if (len(input_shape) < len(shape)) or (len(input_shape) > len(shape) + 1):
        return False
    # Allow both [h, w, c] and [batch, h, w, c], since the input may be batched.
    offset = 1 if len(input_shape) > len(shape) else 0
    for s1, s2 in zip(input_shape[offset:], shape):
        # Dimensions that allow variable lengths are represented by negative values.
        # A variable-length dimension only permits multiples of the original value.
        if s1 % s2 != 0 or (s2 > 0 and s1 != s2):
            return False
    return True
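
# Example (illustrative): against a fixed model shape (224, 224, 3), only an
# exact match (optionally with a leading batch dimension) passes; against a
# variable-length shape (-64, 64, 3), the first dimension accepts any positive
# multiple of 64, e.g. (128, 64, 3).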


# Inspect the shapes of the input ndarrays to determine whether they are HWC or CHW.
def _is_shape_hwc(inputs: List[np.ndarray], shapes: List[_Shape]) -> Optional[bool]:
    if len(inputs) != len(shapes):
        return None

    is_hwc = True
    is_chw = True
    for arr, shape in zip(inputs, shapes):
        shape_hwc = (shape[0], shape[1], shape[2])
        shape_chw = (shape[2], shape[0], shape[1])
        is_hwc = is_hwc and _is_valid_shape(arr.shape, shape_hwc)
        is_chw = is_chw and _is_valid_shape(arr.shape, shape_chw)

    if not is_hwc and not is_chw:
        return None
    # If both `is_hwc` and `is_chw` are `True`, the memory format is assumed to be HWC.
    return is_hwc
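
# Example (illustrative): for a model shape of (224, 224, 3), an input of
# shape (224, 224, 3) is detected as HWC, while (3, 224, 224) is detected
# as CHW.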


# Find the model variant index and memory format matching the input ndarrays.
def _find_matching_variant_idx_and_is_hwc(
    model, inputs: List[np.ndarray]
) -> Tuple[int, bool]:
    variant_idx = None
    is_hwc = None
    for i in range(model.get_num_model_variants()):
        is_hwc = _is_shape_hwc(
            inputs, model.get_model_variant_handle(i).get_model_input_shape()
        )
        if is_hwc is not None:
            variant_idx = i
            break

    if is_hwc is None:
        raise ValueError("Input shape is invalid.")
    assert variant_idx is not None
    return variant_idx, is_hwc


# Create numpy ndarrays matching the given shapes.
def _build_outputs(
    shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> List[np.ndarray]:
    outputs = []
    for shape in shapes:
        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        outputs.append(np.empty(shape, dtype=dtype))
    return outputs


# Check that the numpy ndarrays in `outputs` have the correct shapes.
def _check_output_shapes(
    outputs: List[np.ndarray], shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> None:
    if len(outputs) != len(shapes):
        raise ValueError("The number of outputs does not match.")

    for output, shape in zip(outputs, shapes):
        if output.dtype != dtype:
            raise ValueError("Output dtype mismatch.")

        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        if output.shape != shape:
            raise ValueError("Output shape mismatch.")
109
110class Model:
111 """
112 @brief Represents an AI model loaded from an MXQ file.
113
114 This class loads an AI model from an MXQ file and provides functions to launch it
115 on the NPU and perform inference.
116 """
117
118 def __init__(self, path: str, model_config: Optional[ModelConfig] = None):
119 """
120 @brief Creates a Model object from the specified MXQ model file and configuration.
121
122 Parses the MXQ file and constructs a Model object using the provided configuration,
123 initializing the model with the given settings.
124
125 @note The created Model object must be launched before performing inference.
126 See Model.launch for more details.
127
128 @param[in] path The path to the MXQ model file.
129 @param[in] model_config The configuration settings to initialize the Model.
130 """
131 if model_config is None:
132 self._model = _cMaccel.Model(path)
133 else:
134 self._model = _cMaccel.Model(path, model_config._model_config)
135
136 # 기존 BufferInfo 대신에 ModelShape를 사용한다.
137 # Model {input,output} shape는 batch를 포함한 4D이다.
140

    def launch(self, acc: Accelerator) -> None:
        """
        @brief Launches the model on the specified Accelerator, which represents
        the actual NPU.

        @param[in] acc The accelerator on which to launch the model.
        """
        self._model.launch(acc._accelerator)
        self._acc = acc

    def dispose(self) -> None:
        """
        @brief Disposes of the model loaded onto the NPU.

        Releases any resources associated with the model on the NPU.
        """
        self._model.dispose()
        self._acc = None

    def is_target(self, core_id: CoreId) -> bool:
        """
        @brief Checks whether the NPU core specified by CoreId is a target of the model,
        i.e., whether the model is configured to use the given NPU core.

        @param[in] core_id The CoreId to check.
        @return True if the model is configured to use the specified CoreId, False
        otherwise.
        """
        return self._model.is_target(core_id._core_id)

    def get_core_mode(self) -> CoreMode:
        """
        @brief Retrieves the core mode of the model.

        @return The CoreMode of the model.
        """
        return CoreMode(self._model.get_core_mode())

    def get_target_cores(self) -> List[CoreId]:
        """
        @brief Returns the NPU cores the model is configured to use.

        @return A list of CoreIds representing the target NPU cores.
        """
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    @property
    def target_cores(self) -> List[CoreId]:
        """@deprecated"""
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    def infer(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        """
        @brief Performs inference.

        The following forms of inference are supported:
        1. infer(in: List[numpy]) -> List[numpy] (float / int)
        2. infer(in: numpy) -> List[numpy] (float / int)
        3. infer(in: List[numpy], out: List[numpy]) (float / int)
        4. infer(in: List[numpy], out: List[]) (float / int)
        5. infer(in: numpy, out: List[numpy]) (float / int)
        6. infer(in: numpy, out: List[]) (float / int)

        @param[in] inputs Input data as a single numpy.ndarray or a list
        of numpy.ndarray's.
        @param[out] outputs Optional pre-allocated list of numpy.ndarray's
        to store inference results.
        @return Inference results as a list of numpy.ndarray, or None when
        `outputs` is provided.
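
        Example (a minimal sketch; `MXQ_PATH` is an assumption, and the input
        is assumed to have fixed, positive dimensions):
        @code
        import numpy as np
        import maccel

        model = maccel.Model(MXQ_PATH)
        acc = maccel.Accelerator()
        model.launch(acc)

        # First input shape of the first model variant (HWC).
        shape = model.get_model_variant_handle(0).get_model_input_shape()[0]
        x = np.zeros(shape, dtype=np.float32)

        outputs = model.infer(x)  # returns List[np.ndarray]
        model.dispose()
        @endcode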
214 """
        if not isinstance(inputs, list):
            inputs = [inputs]

        variant_idx, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]

        if outputs is None:
            # No output parameter.
            infer_func = self._model.infer if is_hwc else self._model.infer_chw
            return [np.asarray(o) for o in infer_func(inputs, cache_size)]
        else:
            if outputs:
                _check_output_shapes(
                    outputs,
                    self.get_model_variant_handle(variant_idx).get_model_output_shape(),
                    is_hwc,
                    inputs[0].dtype,
                )
                for oi in range(len(outputs)):
                    outputs[oi] = np.ascontiguousarray(outputs[oi])
            else:
                outputs[:] = _build_outputs(
                    self.get_model_variant_handle(variant_idx).get_model_output_shape(),
                    is_hwc,
                    inputs[0].dtype,
                )

            if is_hwc:
                self._model.infer(inputs, outputs, cache_size)
            else:
                self._model.infer_chw(inputs, outputs, cache_size)

    def infer_to_float(
        self,
        inputs: Union[
            np.ndarray,
            List[np.ndarray],
        ],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        """
        @brief int8_t-to-float inference.

        Performs inference with input elements of type `int8_t` and returns
        float output elements.

        Using this API requires manually scaling (quantizing) float input
        values to `int8_t`.
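
        Example (a minimal sketch; the `quantize` helper is hypothetical and
        shown only to illustrate the manual scaling step):
        @code
        import numpy as np

        scale = model.get_input_scale()[0]    # per-input quantization scale
        x_i8 = quantize(x_f32, scale)         # hypothetical float -> int8 helper
        outputs = model.infer_to_float(x_i8)  # outputs are float ndarrays
        @endcode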

        @note This API is intended for advanced use rather than typical usage.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]

        _, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]

        if is_hwc:
            outputs = self._model.infer_to_float(inputs, cache_size)
        else:
            outputs = self._model.infer_chw_to_float(inputs, cache_size)

        return [np.asarray(o) for o in outputs]

    # For backward compatibility.
    infer_chw = infer
    infer_chw_to_float = infer_to_float

    def infer_buffer(
        self,
        inputs: List[Buffer],
        outputs: List[Buffer],
        shape: List[List[int]] = [],
        cache_size: int = 0,
    ) -> None:
        """
        @brief Buffer-to-Buffer inference.

        Performs inference using input and output elements in the NPU's internal data type.
        The inference operates on buffers allocated via the following APIs:

        - `Model.acquire_input_buffer()`
        - `Model.acquire_output_buffer()`
        - `ModelVariantHandle.acquire_input_buffer()`
        - `ModelVariantHandle.acquire_output_buffer()`

        Additionally, `Model.reposition_inputs()`, `Model.reposition_outputs()`,
        `ModelVariantHandle.reposition_inputs()`, and
        `ModelVariantHandle.reposition_outputs()` must be used appropriately,
        as shown in the sketch below.
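
        Example (a minimal sketch of the buffer workflow; the int8 input
        `x_i8` is an assumption):
        @code
        in_bufs = model.acquire_input_buffer()
        out_bufs = model.acquire_output_buffer()

        # Pack the raw input into the acquired input buffers.
        model.reposition_inputs([x_i8], in_bufs)

        model.infer_buffer(in_bufs, out_bufs)

        # Unpack the output buffers into float ndarrays.
        outputs = []
        model.reposition_outputs(out_bufs, outputs)

        model.release_buffer(in_bufs)
        model.release_buffer(out_bufs)
        @endcode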

        @note This API is intended for advanced use rather than typical usage.
        """
        self._model.infer_buffer(
            [i._buffer for i in inputs], [o._buffer for o in outputs], shape, cache_size
        )

    def infer_speedrun(self) -> None:
        """
        @brief Development-only API for measuring pure NPU inference speed.

        Runs NPU inference without uploading inputs and without retrieving outputs.
        """
        self._model.infer_speedrun()

    def infer_async(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief Asynchronous inference.

        Performs inference asynchronously.

        To use asynchronous inference, the model must be created using a `ModelConfig`
        object with the async pipeline enabled. This is done by calling
        @ref ModelConfig.set_async_pipeline_enabled
        "ModelConfig.set_async_pipeline_enabled(True)" before passing the configuration to
        `Model()`.

        Example:
        @code
        import maccel

        mc = maccel.ModelConfig()
        mc.set_async_pipeline_enabled(True)

        model = maccel.Model(MXQ_PATH, mc)
        acc = maccel.Accelerator()

        model.launch(acc)

        future = model.infer_async(inputs)

        ret = future.get()
        @endcode

        @note Currently, only CNN-based models are supported, as asynchronous execution is
        particularly effective for this type of workload.

        @note Limitations:
        - RNN/LSTM and LLM models are not supported yet.
        - Models requiring CPU offloading are not supported yet.
        - Currently, only single-batch inference is supported (i.e., N = 1).
        - Currently, Buffer inference is not supported. The following types
          are supported in the synchronous API for advanced use cases, but are not
          yet available for asynchronous inference:
          - Buffer to Buffer
          - Buffer to float
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async if is_hwc else self._model.infer_async_chw
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)

    def infer_async_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief Performs int8_t-to-float asynchronous inference.

        @param[in] inputs Input data as a single numpy.ndarray or a list
        of numpy.ndarray's.

        @return A Future that can be used to retrieve the inference results.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, is_hwc = _find_matching_variant_idx_and_is_hwc(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async_to_float
            if is_hwc
            else self._model.infer_async_chw_to_float
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)

    def reposition_inputs(
        self,
        inputs: List[np.ndarray],
        input_bufs: List[Buffer],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Repositions inputs into the acquired input buffers."""
        inputs = [np.ascontiguousarray(i) for i in inputs]
        self._model.reposition_inputs(
            inputs, [buf._buffer for buf in input_bufs], seqlens
        )

    def reposition_outputs(
        self,
        output_bufs: List[Buffer],
        outputs: List[np.ndarray],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Repositions the acquired output buffers into output ndarrays."""
        if len(outputs) != len(self._output_shape):
            outputs.clear()
            for shape in self._output_shape:
                outputs.append(np.empty(shape=shape, dtype=np.float32))
        else:
            for oi in range(len(outputs)):
                outputs[oi] = np.ascontiguousarray(outputs[oi])
        self._model.reposition_outputs(
            [buf._buffer for buf in output_bufs], outputs, seqlens
        )

    def get_num_model_variants(self) -> int:
        """
        @brief Returns the total number of model variants available in this model.

        The `variant_idx` parameter passed to `Model.get_model_variant_handle()` must be
        in the range [0, return value of this function).

        @return The total number of model variants.
        """
        return self._model.get_num_model_variants()

    def get_model_variant_handle(self, variant_idx) -> ModelVariantHandle:
        """
        @brief Retrieves a handle to the specified model variant.

        Use the returned `ModelVariantHandle` to query details such as input and output
        shapes for the selected variant.

        @param[in] variant_idx Index of the model variant to retrieve.
        Must be in the range [0, get_num_model_variants()).

        @return A `ModelVariantHandle` object if successful;
        otherwise, raises maccel.MAccelError "Model_InvalidVariantIdx".
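
        Example (a minimal sketch):
        @code
        for i in range(model.get_num_model_variants()):
            handle = model.get_model_variant_handle(i)
            print(handle.get_model_input_shape())
        @endcode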
448 """
449 return ModelVariantHandle.from_cpp(
450 self._model.get_model_variant_handle(variant_idx)
451 )

    def get_model_input_shape(self) -> List[_Shape]:
        """
        @brief Returns the input shape of the model.

        @return A list of input shapes of the model.
        """
        return self._model.get_model_input_shape()

    def get_model_output_shape(self) -> List[_Shape]:
        """
        @brief Returns the output shape of the model.

        @return A list of output shapes of the model.
        """
        return self._model.get_model_output_shape()

    def get_input_scale(self) -> List[Scale]:
        """
        @brief Returns the input quantization scale(s) of the model.

        @return A list of input scales.
        """
        return [Scale.from_cpp(s) for s in self._model.get_input_scale()]

    def get_output_scale(self) -> List[Scale]:
        """
        @brief Returns the output quantization scale(s) of the model.

        @return A list of output scales.
        """
        return [Scale.from_cpp(s) for s in self._model.get_output_scale()]

    def get_input_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the input buffer information for the model.

        @return A list of input buffer information.
        """
        return [BufferInfo.from_cpp(bi) for bi in self._model.get_input_buffer_info()]

    def get_output_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the output buffer information of the model.

        @return A list of output buffer information.
        """
        return [BufferInfo.from_cpp(bi) for bi in self._model.get_output_buffer_info()]

    def acquire_input_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API.

        Acquires a list of `Buffer` objects for input.
        This API is required when calling `Model.infer_buffer()`.

        @note This API is intended for advanced use rather than typical usage.
        """
        return [Buffer(b) for b in self._model.acquire_input_buffer(seqlens)]

    def acquire_output_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API.

        Acquires a list of `Buffer` objects for output.
        This API is required when calling `Model.infer_buffer()`.

        @note This API is intended for advanced use rather than typical usage.
        """
        return [Buffer(b) for b in self._model.acquire_output_buffer(seqlens)]

    def release_buffer(self, buffer: List[Buffer]) -> None:
        """
        @brief Buffer Management API.

        Deallocates an acquired input/output buffer.

        @note This API is intended for advanced use rather than typical usage.
        """
        self._model.release_buffer([b._buffer for b in buffer])

    def get_identifier(self) -> int:
        """
        @brief Returns the model's unique identifier.

        This identifier distinguishes multiple models within a single user program.
        It is assigned incrementally, starting from 0 (e.g., 0, 1, 2, 3, ...).

        @return The model identifier.
        """
        return self._model.get_identifier()

    def get_model_path(self) -> str:
        """
        @brief Returns the path to the MXQ model file associated with the Model.

        @return The MXQ file path.
        """
        return self._model.get_model_path()

    def get_cache_infos(self) -> List[CacheInfo]:
        """
        @brief Returns information about the model's KV cache.

        @return A list of CacheInfo objects.
        """
        return [CacheInfo.from_cpp(c) for c in self._model.get_cache_infos()]

    def get_schedule_policy(self) -> SchedulePolicy:
        """@deprecated"""
        return SchedulePolicy(self._model.get_schedule_policy())

    def get_latency_set_policy(self) -> LatencySetPolicy:
        """@deprecated"""
        return LatencySetPolicy(self._model.get_latency_set_policy())

    def get_maintenance_policy(self) -> MaintenancePolicy:
        """@deprecated"""
        return MaintenancePolicy(self._model.get_maintenance_policy())

    def get_latency_consumed(self) -> int:
        """@deprecated"""
        return self._model.get_latency_consumed()

    def get_latency_finished(self) -> int:
        """@deprecated"""
        return self._model.get_latency_finished()

    def reset_cache_memory(self) -> None:
        """
        @brief Resets the KV cache memory.

        Clears the stored KV cache, restoring it to its initial state.
        """
        self._model.reset_cache_memory()

    def dump_cache_memory(self) -> List[bytes]:
        """
        @brief Dumps the KV cache memory into buffers.

        Writes the current KV cache data into the returned buffers.

        @return A list of bytes objects containing the KV cache data.
        """
        bufs = self._model.dump_cache_memory()
        return [np.asarray(buf, np.int8).tobytes() for buf in bufs]

    def load_cache_memory(self, bufs: List[bytes]) -> None:
        """
        @brief Loads the KV cache memory from buffers.

        Restores the KV cache from the provided buffers.

        @param[in] bufs A list of bytes objects containing the KV cache data.
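
        Example (a minimal sketch of a dump/restore round trip):
        @code
        snapshot = model.dump_cache_memory()  # List[bytes]
        model.reset_cache_memory()
        model.load_cache_memory(snapshot)     # restore the saved KV cache
        @endcode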
606 """
608 [np.frombuffer(buf, dtype=np.int8) for buf in bufs]
609 )

    def dump_cache_memory_to(self, cache_dir: str) -> None:
        """
        @brief Dumps KV cache memory to files in the specified directory.

        Writes the KV cache data to binary files within the given directory.
        Each file is named using the format `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files will be saved.
        """
        self._model.dump_cache_memory(cache_dir)

    def load_cache_memory_from(self, cache_dir: str) -> None:
        """
        @brief Loads the KV cache memory from files in the specified directory.

        Reads KV cache data from files within the given directory and restores it.
        Each file is named using the format `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files are saved.
        """
        self._model.load_cache_memory(cache_dir)

    def filter_cache_tail(
        self, cache_size: int, tail_size: int, mask: List[bool]
    ) -> int:
        """
        @brief Filters the tail of the KV cache memory.

        Retains the desired caches in the tail of the KV cache memory, excludes the
        others, and shifts the remaining caches forward.

        @param[in] cache_size The number of tokens accumulated in the KV cache so far.
        @param[in] tail_size The tail size of the KV cache to filter (<= 32).
        @param[in] mask A mask indicating tokens to retain or exclude at the tail of the KV
        cache.

        @return The new cache size after tail filtering.
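
        Example (illustrative; the exact mask-to-token mapping is an
        assumption): with `cache_size=100` and `tail_size=4`, a mask that
        excludes one of the four tail tokens would shift the remaining caches
        forward and return 99.
        @code
        new_size = model.filter_cache_tail(100, 4, [True, False, True, True])
        @endcode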
648 """
649 return self._model.filter_cache_tail(cache_size, tail_size, mask)
650
    def move_cache_tail(self, num_head: int, num_tail: int, cache_size: int) -> int:
        """
        @brief Moves the tail of the KV cache memory to the end of the head.

        Slices the tail of the KV cache memory up to the specified size
        and moves it to the designated cache position.

        @param[in] num_head The size of the KV cache head where the tail is appended.
        @param[in] num_tail The size of the KV cache tail to be moved.
        @param[in] cache_size The total number of tokens accumulated in the KV cache so
        far.

        @return The updated cache size after moving the tail.
        """
        return self._model.move_cache_tail(num_head, num_tail, cache_size)


def load(path: str, model_config: Optional[ModelConfig] = None) -> Model:
    """
    @brief Single-step inference API. Creates a model and uploads it
    onto the NPU immediately.

    This operation performs the Accelerator declaration, Model declaration,
    and launch in a single step.
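
    Example (a minimal sketch; `MXQ_PATH` and `inputs` are assumptions):
    @code
    import maccel

    model = maccel.load(MXQ_PATH)
    outputs = model.infer(inputs)
    model.dispose()
    @endcode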
675 """
676 acc = Accelerator()
677 model = Model(path, model_config)
678 model.launch(acc)
679 return model
680