SDK qb Runtime Library v1.0: model.py
from typing import List, Optional, Tuple, Union

import numpy as np

import qbruntime.qbruntime as _cQbRuntime
from .accelerator import Accelerator
from .future import *
from .model_variant_handle import *
from .type import *

_Shape = Tuple[int, ...]

__all__ = ["Model", "load"]

# Determines whether the shape of an input ndarray is valid.
def _is_valid_shape(input_shape: _Shape, shape: _Shape) -> bool:
    if (len(input_shape) < len(shape)) or (len(input_shape) > len(shape) + 1):
        return False
    # Accept both [h, w, c] and [batch, h, w, c], since the input may be batched.
    offset = 1 if len(input_shape) > len(shape) else 0
    for s1, s2 in zip(input_shape[offset:], shape):
        # Dimensions that allow variable lengths are represented by negative values.
        # A variable-length dimension only permits multiples of the original value.
        if s1 % s2 != 0 or (s2 > 0 and s1 != s2):
            return False
    return True
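
# For example (illustrative values): against the spec (-16, 80), an input of
# shape (32, 80) is valid because 32 is a multiple of 16, while (33, 80) is not.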


# Inspects the shapes of the input ndarrays to determine whether they are HWC
# or CHW. If the HWC and CHW shapes are identical, both `is_hwc` and `is_chw`
# are returned as true.
def _find_memory_format(
    inputs: List[np.ndarray], shapes: List[_Shape]
) -> Optional[Tuple[bool, bool]]:
    if len(inputs) != len(shapes):
        return None

    is_hwc = True
    is_chw = True
    for arr, shape in zip(inputs, shapes):
        shape_hwc = (shape[0], shape[1], shape[2])
        shape_chw = (shape[2], shape[0], shape[1])
        is_hwc = is_hwc and _is_valid_shape(arr.shape, shape_hwc)
        is_chw = is_chw and _is_valid_shape(arr.shape, shape_chw)

    if not is_hwc and not is_chw:
        return None
    return is_hwc, is_chw


# Determines the model variant index and memory format that match the input
# ndarrays.
def _find_matching_variant_idx_and_memory_format(
    model, inputs: List[np.ndarray]
) -> Tuple[int, Tuple[bool, bool]]:
    variant_idx = None
    is_hwc = None
    is_chw = None
    for i in range(model.get_num_model_variants()):
        res = _find_memory_format(
            inputs, model.get_model_variant_handle(i).get_model_input_shape()
        )
        if res is not None:
            variant_idx = i
            is_hwc, is_chw = res
            break

    if variant_idx is None:
        raise ValueError("Input shape is invalid.")
    return variant_idx, (is_hwc, is_chw)


# Builds numpy ndarrays matching the given shapes.
def _build_outputs(
    shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> List[np.ndarray]:
    outputs = []
    for shape in shapes:
        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        outputs.append(np.empty(shape, dtype=dtype))
    return outputs


# Checks that the numpy ndarrays in `outputs` have the correct shapes.
def _check_output_shapes(
    outputs: List[np.ndarray], shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
) -> None:
    if len(outputs) != len(shapes):
        raise ValueError("The number of outputs is different.")

    for output, shape in zip(outputs, shapes):
        if output.dtype != dtype:
            raise ValueError("Output dtype mismatch.")

        if is_hwc:
            shape = (shape[0], shape[1], shape[2])
        else:
            shape = (shape[2], shape[0], shape[1])
        if output.shape != shape:
            raise ValueError("Output shape mismatch.")


class Model:
    """
    @brief Represents an AI model loaded from an MXQ file.

    This class loads an AI model from an MXQ file and provides functions to launch it
    on the NPU and perform inference.
    """

    def __init__(self, path: str, model_config: Optional[ModelConfig] = None):
        """
        @brief Creates a Model object from the specified MXQ model file and
        configuration.

        Parses the MXQ file and constructs a Model object using the provided
        configuration, initializing the model with the given settings.

        @note The created Model object must be launched before performing inference.
        See Model.launch for more details.

        @param[in] path The path to the MXQ model file.
        @param[in] model_config The configuration settings to initialize the Model.
        """
        if model_config is None:
            self._model = _cQbRuntime.Model(path)
        else:
            self._model = _cQbRuntime.Model(path, model_config._model_config)
        # Use ModelShape instead of the legacy BufferInfo.
        # Model {input,output} shapes are 4D, including the batch dimension.
        self._input_shape = self._model.get_model_input_shape()
        self._output_shape = self._model.get_model_output_shape()

    def launch(self, acc: Accelerator) -> None:
        """
        @brief Launches the model on the specified Accelerator, which represents
        the actual NPU.

        @param[in] acc The accelerator on which to launch the model.
        """
        self._model.launch(acc._accelerator)
        self._acc = acc

    def dispose(self) -> None:
        """
        @brief Disposes of the model loaded onto the NPU.

        Releases any resources associated with the model on the NPU.
        """
        self._model.dispose()
        self._acc = None

    def is_target(self, core_id: CoreId) -> bool:
        """
        @brief Checks if the NPU core specified by CoreId is the target of the model.
        In other words, whether the model is configured to use the given NPU core.

        @param[in] core_id The CoreId to check.
        @return True if the model is configured to use the specified CoreId, false
        otherwise.
        """
        return self._model.is_target(core_id._core_id)

    def get_core_mode(self) -> CoreMode:
        """
        @brief Retrieves the core mode of the model.

        @return The CoreMode of the model.
        """
        return CoreMode(self._model.get_core_mode())

    def get_target_cores(self) -> List[CoreId]:
        """
        @brief Returns the NPU cores the model is configured to use.

        @return A list of CoreIds representing the target NPU cores.
        """
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    @property
    def target_cores(self) -> List[CoreId]:
        """@deprecated"""
        return [CoreId.from_cpp(target) for target in self._model.target_cores]

    def infer(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        """
        @brief Performs inference.

        The following forms of inference are supported:
        1. infer(in: List[numpy]) -> List[numpy] (float / int)
        2. infer(in: numpy) -> List[numpy] (float / int)
        3. infer(in: List[numpy], out: List[numpy]) (float / int)
        4. infer(in: List[numpy], out: List[]) (float / int)
        5. infer(in: numpy, out: List[numpy]) (float / int)
        6. infer(in: numpy, out: List[]) (float / int)

        @param[in] inputs Input data as a single numpy.ndarray or a list
                   of numpy.ndarray objects.
        @param[out] outputs Optional pre-allocated list of numpy.ndarray objects
                    to store inference results.
        @param[in] cache_size The number of tokens accumulated in the KV cache so
                   far (used by models with a KV cache).
        @return Inference results as a list of numpy.ndarray, or None when
                `outputs` is provided.
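
        Example (an illustrative sketch; `MXQ_PATH` and the input shape are
        placeholders for your own model file and data):
        @code
        import numpy as np
        import qbruntime

        model = qbruntime.load(MXQ_PATH)

        x = np.zeros((224, 224, 3), dtype=np.float32)
        outputs = model.infer(x)          # form 2: ndarray in, list out

        outs = [np.empty_like(o) for o in outputs]
        model.infer([x], outs)            # form 3: results written into outs
        @endcode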
217 """
218 return self._infer(inputs, outputs, cache_size)
219

    def infer_hwc(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        return self._infer(inputs, outputs, cache_size, True)

    def infer_chw(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]] = None,
        cache_size: int = 0,
    ) -> Optional[List[np.ndarray]]:
        return self._infer(inputs, outputs, cache_size, False)

    def _infer(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        outputs: Optional[List[np.ndarray]],
        cache_size: int,
        is_target_hwc: Optional[bool] = None,
    ) -> Optional[List[np.ndarray]]:
        if not isinstance(inputs, list):
            inputs = [inputs]

        variant_idx, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(
            self, inputs
        )
        if (is_target_hwc is not None) and (
            (is_target_hwc != is_hwc) and (is_target_hwc == is_chw)
        ):
            raise ValueError("Input shape is invalid.")
        elif is_target_hwc is None:
            is_target_hwc = is_hwc
        inputs = [np.ascontiguousarray(i) for i in inputs]

        if outputs is None:
            # No output parameter
            infer_func = self._model.infer if is_target_hwc else self._model.infer_chw
            return [np.asarray(o) for o in infer_func(inputs, cache_size)]

        else:
            if outputs:
                _check_output_shapes(
                    outputs,
                    self.get_model_variant_handle(variant_idx).get_model_output_shape(),
                    is_target_hwc,
                    inputs[0].dtype,
                )
                for oi in range(len(outputs)):
                    outputs[oi] = np.ascontiguousarray(outputs[oi])
            else:
                outputs[:] = _build_outputs(
                    self.get_model_variant_handle(variant_idx).get_model_output_shape(),
                    is_target_hwc,
                    inputs[0].dtype,
                )

            if is_target_hwc:
                self._model.infer(inputs, outputs, cache_size)
            else:
                self._model.infer_chw(inputs, outputs, cache_size)

    def infer_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        """
        @brief int8_t-to-float inference.

        Performs inference that takes `int8_t` input elements and returns float
        output elements.

        Using these inference APIs requires manually scaling (quantizing) float
        input values to `int8_t`.

        @note These APIs are intended for advanced use rather than typical usage.
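
        Example (an illustrative sketch; the input shape and the scale value `s`
        are placeholders, and the rounding scheme shown is an assumption rather
        than part of this API):
        @code
        import numpy as np

        x_float = np.zeros((224, 224, 3), dtype=np.float32)
        s = 0.05  # per-input quantization scale; cf. Model.get_input_scale()
        x_int8 = np.clip(np.round(x_float / s), -128, 127).astype(np.int8)

        outputs = model.infer_to_float(x_int8)  # dequantized float outputs
        @endcode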
300 """
301 return self._infer_to_float(inputs, cache_size)
302

    def infer_hwc_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        return self._infer_to_float(inputs, cache_size, True)

    def infer_chw_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int = 0,
    ) -> List[np.ndarray]:
        return self._infer_to_float(inputs, cache_size, False)

    def _infer_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
        cache_size: int,
        is_target_hwc: Optional[bool] = None,
    ) -> List[np.ndarray]:
        """
        @brief int8_t-to-float inference.

        Performs inference that takes `int8_t` input elements and returns float
        output elements.

        Using these inference APIs requires manually scaling (quantizing) float
        input values to `int8_t`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]

        _, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(self, inputs)
        if (is_target_hwc is not None) and (
            (is_target_hwc != is_hwc) and (is_target_hwc == is_chw)
        ):
            raise ValueError("Input shape is invalid.")
        elif is_target_hwc is None:
            is_target_hwc = is_hwc
        inputs = [np.ascontiguousarray(i) for i in inputs]

        if is_target_hwc:
            outputs = self._model.infer_to_float(inputs, cache_size)
        else:
            outputs = self._model.infer_chw_to_float(inputs, cache_size)

        return [np.asarray(o) for o in outputs]

    def infer_buffer(
        self,
        inputs: List[Buffer],
        outputs: List[Buffer],
        shape: List[List[int]] = [],
        cache_size: int = 0,
    ) -> None:
        """
        @brief Buffer-to-Buffer inference.

        Performs inference using input and output elements in the NPU's internal
        data type. The inference operates on buffers allocated via the following
        APIs:

        - `Model.acquire_input_buffer()`
        - `Model.acquire_output_buffer()`
        - `ModelVariantHandle.acquire_input_buffer()`
        - `ModelVariantHandle.acquire_output_buffer()`

        Additionally, `Model.reposition_inputs()`, `Model.reposition_outputs()`,
        `ModelVariantHandle.reposition_inputs()`, and
        `ModelVariantHandle.reposition_outputs()` must be used appropriately;
        see the sketch below.

        @note These APIs are intended for advanced use rather than typical usage.
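
        Example (an illustrative sketch of the intended call sequence; the
        input array `x` is a placeholder):
        @code
        in_bufs = model.acquire_input_buffer()
        out_bufs = model.acquire_output_buffer()

        model.reposition_inputs([x], in_bufs)        # pack x into NPU layout
        model.infer_buffer(in_bufs, out_bufs)
        outputs = []
        model.reposition_outputs(out_bufs, outputs)  # unpack into ndarrays

        model.release_buffer(in_bufs)
        model.release_buffer(out_bufs)
        @endcode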
383 """
384 self._model.infer_buffer(
385 [i._buffer for i in inputs], [o._buffer for o in outputs], shape, cache_size
386 )
387

    def infer_speedrun(self) -> None:
        """
        @brief Development-only API for measuring pure NPU inference speed.

        Runs NPU inference without uploading inputs and without retrieving outputs.
        """
        self._model.infer_speedrun()

    def infer_async(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief Asynchronous inference.

        Performs inference asynchronously.

        To use asynchronous inference, the model must be created with a `ModelConfig`
        object that has the async pipeline enabled. This is done by calling
        @ref ModelConfig.set_async_pipeline_enabled
        "ModelConfig.set_async_pipeline_enabled(True)" before passing the
        configuration to `Model()`.

        Example:
        @code
        import qbruntime

        mc = qbruntime.ModelConfig()
        mc.set_async_pipeline_enabled(True)

        model = qbruntime.Model(MXQ_PATH, mc)
        acc = qbruntime.Accelerator()

        model.launch(acc)

        future = model.infer_async(inputs)

        ret = future.get()
        @endcode

        @note Currently, only CNN-based models are supported, as asynchronous
        execution is particularly effective for this type of workload.

        @note Limitations:
        - RNN/LSTM and LLM models are not supported yet.
        - Models requiring CPU offloading are not supported yet.
        - Currently, only single-batch inference is supported (i.e., N = 1).
        - Currently, Buffer inference is not supported. The following types
          are supported in the synchronous API for advanced use cases, but are
          not yet available for asynchronous inference:
          - Buffer to Buffer
          - Buffer to float
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async if is_hwc else self._model.infer_async_chw
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)

    def infer_async_to_float(
        self,
        inputs: Union[np.ndarray, List[np.ndarray]],
    ) -> Future:
        """
        @brief Supports int8_t-to-float asynchronous inference.

        @param[in] inputs Input data as a single numpy.ndarray or a list
                   of numpy.ndarray objects.

        @return A Future that can be used to retrieve the inference result.
        """
        if not isinstance(inputs, list):
            inputs = [inputs]
        _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
        inputs = [np.ascontiguousarray(i) for i in inputs]
        infer_async_func = (
            self._model.infer_async_to_float
            if is_hwc
            else self._model.infer_async_chw_to_float
        )
        return Future.from_cpp(infer_async_func(inputs), inputs)

    def reposition_inputs(
        self,
        inputs: List[np.ndarray],
        input_bufs: List[Buffer],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Reposition input."""
        inputs = [np.ascontiguousarray(i) for i in inputs]
        self._model.reposition_inputs(
            inputs, [buf._buffer for buf in input_bufs], seqlens
        )

    def reposition_outputs(
        self,
        output_bufs: List[Buffer],
        outputs: List[np.ndarray],
        seqlens: List[List[int]] = [],
    ) -> None:
        """Reposition output."""
        if len(outputs) != len(self._output_shape):
            outputs.clear()
            for shape in self._output_shape:
                outputs.append(np.empty(shape=shape, dtype=np.float32))
        else:
            for oi in range(len(outputs)):
                outputs[oi] = np.ascontiguousarray(outputs[oi])
        self._model.reposition_outputs(
            [buf._buffer for buf in output_bufs], outputs, seqlens
        )

    def get_num_model_variants(self) -> int:
        """
        @brief Returns the total number of model variants available in this model.

        The `variant_idx` parameter passed to `Model.get_model_variant_handle()` must
        be in the range [0, return value of this function).

        @return The total number of model variants.
        """
        return self._model.get_num_model_variants()

    def get_model_variant_handle(self, variant_idx) -> ModelVariantHandle:
        """
        @brief Retrieves a handle to the specified model variant.

        Use the returned `ModelVariantHandle` to query details such as input and
        output shapes for the selected variant.

        @param[in] variant_idx Index of the model variant to retrieve.
                   Must be in the range [0, get_num_model_variants()).

        @return A `ModelVariantHandle` object if successful; otherwise, raises
                qbruntime.QbRuntimeError "Model_InvalidVariantIdx".
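
        Example (an illustrative sketch):
        @code
        for i in range(model.get_num_model_variants()):
            handle = model.get_model_variant_handle(i)
            print(i, handle.get_model_input_shape())
        @endcode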
526 """
527 return ModelVariantHandle.from_cpp(
528 self._model.get_model_variant_handle(variant_idx)
529 )
530

    def get_model_input_shape(self) -> List[_Shape]:
        """
        @brief Returns the input shapes of the model.

        @return A list of the model's input shapes.
        """
        return self._model.get_model_input_shape()

    def get_model_output_shape(self) -> List[_Shape]:
        """
        @brief Returns the output shapes of the model.

        @return A list of the model's output shapes.
        """
        return self._model.get_model_output_shape()

    def get_input_scale(self) -> List[Scale]:
        """
        @brief Returns the input quantization scale(s) of the model.

        @return A list of input scales.
        """
        return [Scale.from_cpp(s) for s in self._model.get_input_scale()]

    def get_output_scale(self) -> List[Scale]:
        """
        @brief Returns the output quantization scale(s) of the model.

        @return A list of output scales.
        """
        return [Scale.from_cpp(s) for s in self._model.get_output_scale()]

    def get_input_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the input buffer information of the model.

        @return A list of input buffer information.
        """
        return [BufferInfo.from_cpp(bi) for bi in self._model.get_input_buffer_info()]

    def get_output_buffer_info(self) -> List[BufferInfo]:
        """
        @brief Returns the output buffer information of the model.

        @return A list of output buffer information.
        """
        return [BufferInfo.from_cpp(bi) for bi in self._model.get_output_buffer_info()]

    def acquire_input_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API.

        Acquires a list of `Buffer` objects for input.
        This API is required when calling `Model.infer_buffer()`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        return [Buffer(b) for b in self._model.acquire_input_buffer(seqlens)]

    def acquire_output_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
        """
        @brief Buffer Management API.

        Acquires a list of `Buffer` objects for output.
        This API is required when calling `Model.infer_buffer()`.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        return [Buffer(b) for b in self._model.acquire_output_buffer(seqlens)]

    def release_buffer(self, buffer: List[Buffer]) -> None:
        """
        @brief Buffer Management API.

        Deallocates acquired input/output buffers.

        @note These APIs are intended for advanced use rather than typical usage.
        """
        self._model.release_buffer([b._buffer for b in buffer])

    def get_identifier(self) -> int:
        """
        @brief Returns the model's unique identifier.

        This identifier distinguishes multiple models within a single user program.
        It is assigned incrementally, starting from 0 (e.g., 0, 1, 2, 3, ...).

        @return The model identifier.
        """
        return self._model.get_identifier()

    def get_model_path(self) -> str:
        """
        @brief Returns the path to the MXQ model file associated with the Model.

        @return The MXQ file path.
        """
        return self._model.get_model_path()

    def get_cache_infos(self) -> List[CacheInfo]:
        """
        @brief Returns information about the model's KV cache.

        @return A list of CacheInfo objects.
        """
        return [CacheInfo.from_cpp(c) for c in self._model.get_cache_infos()]

    def get_latency_consumed(self) -> int:
        """@deprecated"""
        return self._model.get_latency_consumed()

    def get_latency_finished(self) -> int:
        """@deprecated"""
        return self._model.get_latency_finished()

    def dump_cache_memory(self) -> List[bytes]:
        """
        @brief Dumps the KV cache memory into buffers.

        Writes the current KV cache data into the returned buffers.

        @return A list of bytes objects containing the KV cache data.
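
        Example (an illustrative sketch of saving and restoring the KV cache):
        @code
        bufs = model.dump_cache_memory()  # snapshot the current KV cache
        # ... persist or transfer bufs as needed ...
        model.load_cache_memory(bufs)     # restore the snapshot
        @endcode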
653 """
654 bufs = self._model.dump_cache_memory()
655 return [np.asarray(buf, np.int8).tobytes() for buf in bufs]
656
657 def load_cache_memory(self, bufs: List[bytes]) -> None:
658 """
659 @brief Loads the KV cache memory from buffers.
660
661 Restores the KV cache from the provided buffers.
662
663 @param[in] bufs A list of bytes containing the KV cache
664 """
666 [np.frombuffer(buf, dtype=np.int8) for buf in bufs]
667 )

    def dump_cache_memory_to(self, cache_dir: str) -> None:
        """
        @brief Dumps KV cache memory to files in the specified directory.

        Writes the KV cache data to binary files within the given directory.
        Each file is named using the format: `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where KV cache files will be saved.
        """
        self._model.dump_cache_memory(cache_dir)

    def load_cache_memory_from(self, cache_dir: str) -> None:
        """
        @brief Loads the KV cache memory from files in the specified directory.

        Reads KV cache data from files within the given directory and restores it.
        Each file is named using the format: `cache_<layer_hash>.bin`.

        @param[in] cache_dir Path to the directory where the KV cache files are saved.
        """
        self._model.load_cache_memory(cache_dir)

    def filter_cache_tail(
        self, cache_size: int, tail_size: int, mask: List[bool]
    ) -> int:
        """
        @brief Filters the tail of the KV cache memory.

        Retains the desired entries in the tail of the KV cache memory, excludes the
        others, and shifts the remaining entries forward.

        @param[in] cache_size The number of tokens accumulated in the KV cache so far.
        @param[in] tail_size The tail size of the KV cache to filter (<= 32).
        @param[in] mask A mask indicating which tokens to retain or exclude at the
                   tail of the KV cache.

        @return The new cache size after tail filtering.
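
        Example (an illustrative sketch; the token counts are placeholders):
        keep the 1st and 3rd of the last four cached tokens and drop the rest.
        @code
        mask = [True, False, True, False]
        cache_size = model.filter_cache_tail(cache_size, 4, mask)
        # cache_size shrinks by the number of False entries (here, by 2).
        @endcode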
706 """
707 return self._model.filter_cache_tail(cache_size, tail_size, mask)
708

    def move_cache_tail(self, num_head: int, num_tail: int, cache_size: int) -> int:
        """
        @brief Moves the tail of the KV cache memory to the end of the head.

        Slices the tail of the KV cache memory up to the specified size
        and moves it to the designated cache position.

        @param[in] num_head The size of the KV cache head to which the tail is
                   appended.
        @param[in] num_tail The size of the KV cache tail to be moved.
        @param[in] cache_size The total number of tokens accumulated in the KV cache
                   so far.

        @return The updated cache size after moving the tail.
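
        Example (an illustrative sketch; the token counts are placeholders, and
        the resulting size reflects one plausible reading of this API): with 100
        tokens cached, move the last 8 tokens so that they follow the first 64.
        @code
        cache_size = model.move_cache_tail(num_head=64, num_tail=8, cache_size=100)
        # Under this reading, cache_size is now 72 (64 head + 8 moved tail).
        @endcode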
722 """
723 return self._model.move_cache_tail(num_head, num_tail, cache_size)
724
725

def load(path: str, model_config: Optional[ModelConfig] = None) -> Model:
    """
    @brief Single-step inference API. Creates a model and uploads it to the NPU
    immediately.

    This operation performs the Accelerator declaration, Model declaration,
    and launch in a single step.
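
    Example (an illustrative sketch; `MXQ_PATH` and `inputs` are placeholders):
    @code
    import qbruntime

    model = qbruntime.load(MXQ_PATH)
    outputs = model.infer(inputs)
    model.dispose()
    @endcode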
733 """
734 acc = Accelerator()
735 model = Model(path, model_config)
736 model.launch(acc)
737 return model
738
739
740