model.py Source File

model.py Source File#

SDK qb Runtime Library: model.py Source File
SDK qb Runtime Library v1.2
MCS001-
model.py
Go to the documentation of this file.
1
4
5from typing import List, Optional, Tuple, Union
6
7import numpy as np
8
9import qbruntime.qbruntime as _cQbRuntime
10from .accelerator import Accelerator
11from .future import *
12from .model_variant_handle import *
13from .type import *
14
15_Shape = Tuple[int, ...]
16
17__all__ = ["Model", "load"]
18
19
22
23
24# input ndarray의 shape이 유효한 shape인지 판별한다.
25def _is_valid_shape(input_shape: _Shape, shape: _Shape) -> bool:
26 if (len(input_shape) < len(shape)) or (len(input_shape) > len(shape) + 1):
27 return False
28 # input을 batch일 경우도 고려하여 [h, w, c] 및 [batch, h, w, c] 모두 고려한다
29 offset = 1 if len(input_shape) > len(shape) else 0
30 for s1, s2 in zip(input_shape[offset:], shape):
31 # Dimensions that allow variable lengths are represented by negative values.
32 # A variable-length dimension only permits multiples of the original value.
33 if s1 % s2 != 0 or (s2 > 0 and s1 != s2):
34 return False
35 return True
36
37
# Inspect the input ndarrays' shapes to decide whether they are HWC or CHW.
# When the HWC and CHW shapes coincide, both flags are returned as True.
def _find_memory_format(
    inputs: List[np.ndarray], shapes: List[Tuple[int, ...]]
) -> Optional[Tuple[bool, bool]]:
    """Return ``(is_hwc, is_chw)`` for *inputs*, or None when neither fits."""
    if len(inputs) != len(shapes):
        return None

    hwc_ok, chw_ok = True, True
    for arr, model_shape in zip(inputs, shapes):
        h, w, c = model_shape[0], model_shape[1], model_shape[2]
        hwc_ok = hwc_ok and _is_valid_shape(arr.shape, (h, w, c))
        chw_ok = chw_ok and _is_valid_shape(arr.shape, (c, h, w))

    if not (hwc_ok or chw_ok):
        return None
    return hwc_ok, chw_ok
57
58
# Determine the model-variant index and memory format matching the inputs.
def _find_matching_variant_idx_and_memory_format(
    model, inputs: List[np.ndarray]
) -> Tuple[int, Tuple[bool, bool]]:
    """Return ``(variant_idx, (is_hwc, is_chw))`` for the first model variant
    whose declared input shapes accept *inputs*.

    Raises:
        ValueError: if no variant accepts the given input shapes.
    """
    for idx in range(model.get_num_model_variants()):
        handle = model.get_model_variant_handle(idx)
        fmt = _find_memory_format(inputs, handle.get_model_input_shape())
        if fmt is not None:
            return idx, fmt

    raise ValueError("Input shape is invalid.")
78
79
80# shape에 맞게 numpy ndarray를 생성한다.
81def _build_outputs(
82 shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
83) -> List[np.ndarray]:
84 outputs = []
85 for shape in shapes:
86 if is_hwc:
87 shape = (shape[0], shape[1], shape[2])
88 else:
89 shape = (shape[2], shape[0], shape[1])
90 outputs.append(np.empty(shape, dtype=dtype))
91 return outputs
92
93
94# output에 들어있는 numpy ndarray의 shape가 올바른지 검사한다.
95def _check_output_shapes(
96 outputs: List[np.ndarray], shapes: List[_Shape], is_hwc: bool, dtype: np.dtype
97) -> None:
98 if len(outputs) != len(shapes):
99 raise ValueError("The number of outputs is different.")
100
101 for output, shape in zip(outputs, shapes):
102 if output.dtype != dtype:
103 raise ValueError("Output dtype mismatch.")
104
105 if is_hwc:
106 shape = (shape[0], shape[1], shape[2])
107 else:
108 shape = (shape[2], shape[0], shape[1])
109 if output.shape != shape:
110 raise ValueError("Output shape mismatch.")
111
112
113class Model:
114 """
115 @brief Represents an AI model loaded from an MXQ file.
116
117 This class loads an AI model from an MXQ file and provides functions to launch it
118 on the NPU and perform inference.
119 """
120
121 def __init__(self, path: str, model_config: Optional[ModelConfig] = None):
122 """
123 @brief Creates a Model object from the specified MXQ model file and configuration.
124
125 Parses the MXQ file and constructs a Model object using the provided configuration,
126 initializing the model with the given settings.
127
128 @note The created Model object must be launched before performing inference.
129 See Model.launch for more details.
130
131 @param[in] path The path to the MXQ model file.
132 @param[in] model_config The configuration settings to initialize the Model.
133 """
134 if model_config is None:
135 self._model = _cQbRuntime.Model(path)
136 else:
137 self._model = _cQbRuntime.Model(path, model_config._model_config)
138
139 # 기존 BufferInfo 대신에 ModelShape를 사용한다.
140 # Model {input,output} shape는 batch를 포함한 4D이다.
143
144 def launch(self, acc: Accelerator) -> None:
145 """
146 @brief Launches the model on the specified Accelerator, which represents
147 the actual NPU.
148
149 @param[in] acc The accelerator on which to launch the model.
150 """
151 self._model.launch(acc._accelerator)
152 self._acc = acc
153
154 def dispose(self) -> None:
155 """
156 @brief Disposes of the model loaded onto the NPU.
157
158 Releases any resources associated with the model on the NPU.
159 """
160 self._model.dispose()
161 self._acc = None
162
163 def is_target(self, core_id: CoreId) -> bool:
164 """
165 @brief Checks if the NPU core specified by CoreId is the target of the model.
166 In other words, whether the model is configured to use the given NPU core.
167
168 @param[in] core_id The CoreId to check.
169 @return True if the model is configured to use the specified CoreId, false
170 otherwise.
171 """
172 return self._model.is_target(core_id._core_id)
173
174 def get_core_mode(self) -> CoreMode:
175 """
176 @brief Retrieves the core mode of the model.
177
178 @return The CoreMode of the model.
179 """
180 return CoreMode(self._model.get_core_mode())
181
182 def get_target_cores(self) -> List[CoreId]:
183 """
184 @brief Returns the NPU cores the model is configured to use.
185
186 @return A list of CoreIds representing the target NPU cores.
187 """
188 return [CoreId.from_cpp(target) for target in self._model.target_cores]
189
190 @property
191 def target_cores(self) -> List[CoreId]:
192 """@deprecated"""
193 return [CoreId.from_cpp(target) for target in self._model.target_cores]
194
195 def infer(
196 self,
197 inputs: Union[np.ndarray, List[np.ndarray]],
198 outputs: Optional[List[np.ndarray]] = None,
199 cache_size: int = 0,
200 params: Optional[List[BatchParam]] = None,
201 ) -> Optional[List[np.ndarray]]:
202 """
203 @brief Performs inference.
204
205 Fowllowing types of inference supported.
206 1. infer(in:List[numpy]) -> List[numpy] (float / int)
207 2. infer(in:numpy) -> List[numpy] (float / int)
208 3. infer(in:List[numpy], out:List[numpy]) (float / int)
209 4. infer(in:List[numpy], out:List[]) (float / int)
210 5. infer(in:numpy, out:List[numpy]) (float / int)
211 6. infer(in:numpy, out:List[]) (float / int)
212
213 @param[in] inputs Input data as a single numpy.ndarray or a list
214 of numpy.ndarray's.
215 @param[out] outputs Optional pre-allocated list of numpy.ndarray's
216 to store inference results.
217 @param[in] cache_size The number of tokens accumulated in the KV cache so far.
218 @param[in] params A List of `BatchParam`, specifying each batch's information
219 for BatchLLM inference. If `params` is specified,
220 `cache_size` is ignored.
221 @return Inference results as a list of numpy.ndarray.
222 """
223 return self._infer(inputs, outputs, cache_size, params=params)
224
225 def infer_hwc(
226 self,
227 inputs: Union[np.ndarray, List[np.ndarray]],
228 outputs: Optional[List[np.ndarray]] = None,
229 cache_size: int = 0,
230 params: Optional[List[BatchParam]] = None,
231 ) -> Optional[List[np.ndarray]]:
232 return self._infer(inputs, outputs, cache_size, True, params)
233
234 def infer_chw(
235 self,
236 inputs: Union[np.ndarray, List[np.ndarray]],
237 outputs: Optional[List[np.ndarray]] = None,
238 cache_size: int = 0,
239 params: Optional[List[BatchParam]] = None,
240 ) -> Optional[List[np.ndarray]]:
241 return self._infer(inputs, outputs, cache_size, False, params)
242
243 def _infer(
244 self,
245 inputs: Union[np.ndarray, List[np.ndarray]],
246 outputs: Optional[List[np.ndarray]],
247 cache_size: int,
248 is_target_hwc: Optional[bool] = None,
249 params: Optional[List[BatchParam]] = None,
250 ) -> Optional[List[np.ndarray]]:
251 if not isinstance(inputs, list):
252 inputs = [inputs]
253
254 variant_idx, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(
255 self, inputs
256 )
257 if (is_target_hwc is not None) and (
258 (is_target_hwc != is_hwc) and (is_target_hwc == is_chw)
259 ):
260 raise ValueError("Input shape is invalid.")
261 elif is_target_hwc is None:
262 is_target_hwc = is_hwc
263 inputs = [np.ascontiguousarray(i) for i in inputs]
264
265 infer_func = self._model.infer if is_target_hwc else self._model.infer_chw
266 if outputs is None:
267 # No Output Parameter
268 if params == None:
269 return [np.asarray(o) for o in infer_func(inputs, cache_size)]
270 else:
271 return [
272 np.asarray(o)
273 for o in infer_func(
274 inputs, [param._batch_param for param in params]
275 )
276 ]
277 else:
278 if outputs:
279 _check_output_shapes(
280 outputs,
282 is_target_hwc,
283 inputs[0].dtype,
284 )
285 for oi in range(len(outputs)):
286 outputs[oi] = np.ascontiguousarray(outputs[oi])
287 else:
288 outputs[:] = _build_outputs(
290 is_target_hwc,
291 inputs[0].dtype,
292 )
293
294 if params == None:
295 infer_func(inputs, outputs, cache_size)
296 else:
297 infer_func(inputs, outputs, [param._batch_param for param in params])
298
300 self,
301 inputs: Union[
302 np.ndarray,
303 List[np.ndarray],
304 ],
305 cache_size: int = 0,
306 ) -> List[np.ndarray]:
307 """
308 @brief int8_t-to-float inference
309 Performs inference with input and output elements of type `int8_t`
310
311 Using these inference APIs requires manual scaling (quantization)
312 of float values to `int8_t` for input.
313
314 @note These APIs are intended for advanced use rather than typical usage.
315 """
316 return self._infer_to_float(inputs, cache_size)
317
318 def infer_hwc_to_float(
319 self,
320 inputs: Union[
321 np.ndarray,
322 List[np.ndarray],
323 ],
324 cache_size: int = 0,
325 ) -> List[np.ndarray]:
326 return self._infer_to_float(inputs, cache_size, True)
327
328 def infer_chw_to_float(
329 self,
330 inputs: Union[
331 np.ndarray,
332 List[np.ndarray],
333 ],
334 cache_size: int = 0,
335 ) -> List[np.ndarray]:
336 return self._infer_to_float(inputs, cache_size, False)
337
339 self,
340 inputs: Union[
341 np.ndarray,
342 List[np.ndarray],
343 ],
344 cache_size: int,
345 is_target_hwc: Optional[bool] = None,
346 ) -> List[np.ndarray]:
347 """
348 @brief int8_t-to-float inference
349 Performs inference with input and output elements of type `int8_t`
350
351 Using these inference APIs requires manual scaling (quantization)
352 of float values to `int8_t` for input.
353
354 @note These APIs are intended for advanced use rather than typical usage.
355 """
356 if not isinstance(inputs, list):
357 inputs = [inputs]
358
359 _, (is_hwc, is_chw) = _find_matching_variant_idx_and_memory_format(self, inputs)
360 if (is_target_hwc is not None) and (
361 (is_target_hwc != is_hwc) and (is_target_hwc == is_chw)
362 ):
363 raise ValueError("Input shape is invalid.")
364 elif is_target_hwc is None:
365 is_target_hwc = is_hwc
366 inputs = [np.ascontiguousarray(i) for i in inputs]
367
368 if is_target_hwc:
369 outputs = self._model.infer_to_float(inputs, cache_size)
370 else:
371 outputs = self._model.infer_chw_to_float(inputs, cache_size)
372
373 return [np.asarray(o) for o in outputs]
374
376 self,
377 inputs: List[Buffer],
378 outputs: List[Buffer],
379 shape: List[List[int]] = [],
380 cache_size: int = 0,
381 ) -> None:
382 """
383 @brief Buffer-to-Buffer inference
384
385 Performs inference using input and output elements in the NPU’s internal data type.
386 The inference operates on buffers allocated via the following APIs:
387
388 - `Model.acquire_input_buffer()`
389 - `Model.acquire_output_buffer()`
390 - `ModelVariantHandle.acquire_input_buffer()`
391 - `ModelVariantHandle.acquire_output_buffer()`
392
393 Additionally, `Model.reposition_inputs()`, `Model.reposition_outputs()`,
394 `ModelVariantHandle.reposition_inputs()`, `ModelVariantHandle.reposition_outputs()`
395 must be used properly.
396
397 @note These APIs are intended for advanced use rather than typical usage.
398 """
399 self._model.infer_buffer(
400 [i._buffer for i in inputs], [o._buffer for o in outputs], shape, cache_size
401 )
402
403 def infer_speedrun(self) -> None:
404 """
405 @brief Development-only API for measuring pure NPU inference speed.
406
407 Runs NPU inference without uploading inputs and without retrieving outputs.
408 """
410
412 self,
413 inputs: Union[np.ndarray, List[np.ndarray]],
414 ) -> Future:
415 """
416 @brief Asynchronous Inference
417
418 Performs inference asynchronously.
419
420 To use asynchronous inference, the model must be created using a `ModelConfig`
421 object with the async pipeline configured to be enabled. This is done by calling
422 @ref ModelConfig.set_async_pipeline_enabled
423 "ModelConfig.set_async_pipeline_enabled(True)" before passing the configuration to
424 `Model()`.
425
426 Example:
427 @code
428 import qbruntime
429
430 mc = qbruntime.ModelConfig()
431 mc.set_async_pipeline_enabled(True)
432
433 model = qbruntime.Model(MXQ_PATH, mc)
434 acc = qbruntime.Accelerator()
435
436 model.launch(acc)
437
438 future = model.infer_async(inputs)
439
440 ret = future.get()
441 @endcode
442
443 @note Currently, only CNN-based models are supported, as asynchronous execution is
444 particularly effective for this type of workload.
445
446 @note Limitations:
447 - RNN/LSTM and LLM models are not supported yet.
448 - Models requiring CPU offloading are not supported yet.
449 - Currently, only single-batch inference is supported (i.e., N = 1).
450 - Currently, Buffer inference is not supported. The following types
451 are supported in the synchronous API for advanced use cases, but are not
452 yet available for asynchronous inference:
453 - Buffer to Buffer
454 - Buffer to float
455 """
456 if not isinstance(inputs, list):
457 inputs = [inputs]
458 _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
459 inputs = [np.ascontiguousarray(i) for i in inputs]
460 infer_async_func = (
461 self._model.infer_async if is_hwc else self._model.infer_async_chw
462 )
463 return Future.from_cpp(infer_async_func(inputs), inputs)
464
466 self,
467 inputs: Union[np.ndarray, List[np.ndarray]],
468 ) -> Future:
469 """
470 @brief This method supports int8_t-to-float asynchronous inference.
471
472 @param[in] inputs Input data as a single numpy.ndarray or a list
473 of numpy.ndarray's.
474
475 @return A future that can be used to retrieve the inference result.
476 """
477 if not isinstance(inputs, list):
478 inputs = [inputs]
479 _, (is_hwc, _) = _find_matching_variant_idx_and_memory_format(self, inputs)
480 inputs = [np.ascontiguousarray(i) for i in inputs]
481 infer_async_func = (
482 self._model.infer_async_to_float
483 if is_hwc
484 else self._model.infer_async_chw_to_float
485 )
486 return Future.from_cpp(infer_async_func(inputs), inputs)
487
489 self,
490 inputs: List[np.ndarray],
491 input_bufs: List[Buffer],
492 seqlens: List[List[int]] = [],
493 ) -> None:
494 """Reposition input"""
495 inputs = [np.ascontiguousarray(i) for i in inputs]
497 inputs, [buf._buffer for buf in input_bufs], seqlens
498 )
499
501 self,
502 output_bufs: List[Buffer],
503 outputs: List[np.ndarray],
504 seqlens: List[List[int]] = [],
505 ) -> None:
506 """Reposition output"""
507 if len(outputs) != len(self._output_shape):
508 outputs.clear()
509 for shape in self._output_shape:
510 outputs.append(np.empty(shape=shape, dtype=np.float32))
511 else:
512 for oi in range(len(outputs)):
513 outputs[oi] = np.ascontiguousarray(outputs[oi])
515 [buf._buffer for buf in output_bufs], outputs, seqlens
516 )
517
518 def get_num_model_variants(self) -> int:
519 """
520 @brief Returns the total number of model variants available in this model.
521
522 The `variant_idx` parameter passed to `Model.get_model_variant_handle()` must be
523 in the range [0, return value of this function).
524
525 @return The total number of model variants.
526 """
527 return self._model.get_num_model_variants()
528
529 def get_model_variant_handle(self, variant_idx) -> ModelVariantHandle:
530 """
531 @brief Retrieves a handle to the specified model variant.
532
533 Use the returned `ModelVariantHandle` to query details such as input and output
534 shapes for the selected variant.
535
536 @param[in] variant_idx Index of the model variant to retrieve.
537 Must be in the range [0, getNumModelVariants()).
538
539 @return A `ModelVariantHandle` object if successful;
540 otherwise, raise qbruntime.QbRuntimeError "Model_InvalidVariantIdx".
541 """
542 return ModelVariantHandle.from_cpp(
543 self._model.get_model_variant_handle(variant_idx)
544 )
545
546 def get_model_input_shape(self) -> List[_Shape]:
547 """
548 @brief Returns the input shape of the model.
549
550 @return A list of input shape of the model.
551 """
552 return self._model.get_model_input_shape()
553
554 def get_model_output_shape(self) -> List[_Shape]:
555 """
556 @brief Returns the output shape of the model.
557
558 @return A list of output shape of the model.
559 """
560 return self._model.get_model_output_shape()
561
562 def get_input_scale(self) -> List[Scale]:
563 """
564 @brief Returns the input quantization scale(s) of the model.
565
566 @return A list of input scales.
567 """
568 return [Scale.from_cpp(s) for s in self._model.get_input_scale()]
569
570 def get_output_scale(self) -> List[Scale]:
571 """
572 @brief Returns the output quantization scale(s) of the model.
573
574 @return A list of output scales.
575 """
576 return [Scale.from_cpp(s) for s in self._model.get_output_scale()]
577
578 def get_input_buffer_info(self) -> List[BufferInfo]:
579 """
580 @brief Returns the input buffer information for the model.
581
582 @return A list of input buffer information.
583 """
584 return [BufferInfo.from_cpp(bi) for bi in self._model.get_input_buffer_info()]
585
586 def get_output_buffer_info(self) -> List[BufferInfo]:
587 """
588 @brief Returns the output buffer information of the model.
589
590 @return A list of output buffer information.
591 """
592 return [BufferInfo.from_cpp(bi) for bi in self._model.get_output_buffer_info()]
593
594 def get_model_input_data_type(self) -> DataType:
595 """
596 @brief Returns a data type for model inputs.
597
598 @return An input data type.
599 """
601
602 def get_model_output_data_type(self) -> DataType:
603 """
604 @brief Returns a data type for model outputs.
605
606 @return An output data type.
607 """
609
610 def acquire_input_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
611 """
612 @brief Buffer Management API
613
614 Acquires list of `Buffer` for input.
615 These API is required when calling `Model.infer_buffer()`.
616
617 @note These APIs are intended for advanced use rather than typical usage.
618 """
619 return [Buffer(b) for b in self._model.acquire_input_buffer(seqlens)]
620
621 def acquire_output_buffer(self, seqlens: List[List[int]] = []) -> List[Buffer]:
622 """
623 @brief Buffer Management API
624
625 Acquires list of `Buffer` for output.
626 These API is required when calling `Model.infer_buffer()`.
627
628 @note These APIs are intended for advanced use rather than typical usage.
629 """
630 return [Buffer(b) for b in self._model.acquire_output_buffer(seqlens)]
631
632 def release_buffer(self, buffer: List[Buffer]) -> None:
633 """
634 @brief Buffer Management API
635
636 Deallocate acquired Input/Output buffer
637
638 @note These APIs are intended for advanced use rather than typical usage.
639 """
640 self._model.release_buffer([b._buffer for b in buffer])
641
642 def get_identifier(self) -> int:
643 """
644 @brief Returns the model's unique identifier.
645
646 This identifier distinguishes multiple models within a single user program.
647 It is assigned incrementally, starting from 0 (e.g., 0, 1, 2, 3, ...).
648
649 @return The model identifier.
650 """
651 return self._model.get_identifier()
652
653 def get_model_path(self) -> str:
654 """
655 @brief Returns the path to the MXQ model file associated with the Model.
656
657 @return The MXQ file path.
658 """
659 return self._model.get_model_path()
660
661 def get_cache_infos(self) -> List[CacheInfo]:
662 """
663 @brief Returns informations of KV-cache of the model.
664
665 @return A list of CacheInfo objects.
666 """
667 return [CacheInfo.from_cpp(c) for c in self._model.get_cache_infos()]
668
669 def get_latency_consumed(self) -> int:
670 """@deprecated"""
671 return self._model.get_latency_consumed()
672
673 def get_latency_finished(self) -> int:
674 """@deprecated"""
675 return self._model.get_latency_finished()
676
677 def dump_cache_memory(self, cache_id: int = 0) -> List[bytes]:
678 """
679 @brief Dumps the KV cache memory into buffers.
680
681 Writes the current KV cache data into provided buffers.
682
683 @param[in] cache_id Index of target cache.
684
685 @return A list of bytes containing the KV cache data.
686 """
687 bufs = self._model.dump_cache_memory(cache_id)
688 return [np.asarray(buf, np.int8).tobytes() for buf in bufs]
689
690 def load_cache_memory(self, bufs: List[bytes], cache_id: int = 0) -> None:
691 """
692 @brief Loads the KV cache memory from buffers.
693
694 Restores the KV cache from the provided buffers.
695
696 @param[in] bufs A list of bytes containing the KV cache
697 """
699 [np.frombuffer(buf, dtype=np.int8) for buf in bufs], cache_id
700 )
701
702 def dump_cache_memory_to(self, cache_dir: str, cache_id: int = 0) -> None:
703 """
704 @brief Dumps KV cache memory to files in the specified directory.
705
706 Writes the KV cache data to binary files within the given directory.
707 Each file is named using the format: `cache_<layer_hash>.bin`.
708
709 @param[in] cache_dir Path to the directory where KV cache files will be saved.
710 @param[in] cache_id Index of target cache.
711 """
712 self._model.dump_cache_memory(cache_dir, cache_id)
713
714 def load_cache_memory_from(self, cache_dir: str, cache_id: int = 0) -> None:
715 """
716 @brief Loads the KV cache memory from files in the specified directory.
717
718 Reads KV cache data from files within the given directory and restores them.
719 Each file is named using the format: `cache_<layer_hash>.bin`.
720
721 @param[in] cache_dir Path to the directory where KV cache files are saved.
722 """
723 self._model.load_cache_memory(cache_dir, cache_id)
724
726 self, cache_size: int, tail_size: int, mask: List[bool]
727 ) -> int:
728 """
729 @brief Filter the tail of the KV cache memory
730
731 Retains the desired caches in the tail of the KV cache memory, excludes the others,
732 and shifts the remaining caches forward.
733
734 @param[in] cache_size The number of tokens accumulated in the KV cache so far.
735 @param[in] tail_size The tail size of the KV cache to filter (<=32).
736 @param[in] mask A mask indicating tokens to retain or exclude at the tail of the KV
737 cache.
738
739 @return New cache size after tail filtering.
740 """
741 return self._model.filter_cache_tail(cache_size, tail_size, mask)
742
743 def move_cache_tail(self, num_head: int, num_tail: int, cache_size: int) -> int:
744 """
745 @brief Moves the tail of the KV cache memory to the end of the head.
746
747 Slice the tail of the KV cache memory up to the specified size
748 and moves it to the designated cache position.
749
750 @param[in] num_head The size of the KV cache head where the tail is appended.
751 @param[in] num_tail The size of the KV cache tail to be moved.
752 @param[in] cache_size The total number of tokens accumulated in the KV cache so
753 far.
754
755 @return The updated cache size after moving the tail.
756 """
757 return self._model.move_cache_tail(num_head, num_tail, cache_size)
758
759
def load(path: str, model_config: Optional[ModelConfig] = None) -> Model:
    """
    @brief Single-step inference API: creates the model and uploads it to the
    NPU immediately.

    Performs the Accelerator declaration, Model declaration, and launch in a
    single call.
    """
    accelerator = Accelerator()
    loaded_model = Model(path, model_config)
    loaded_model.launch(accelerator)
    return loaded_model
772
773
774
Represents an accelerator, i.e., an NPU, used for executing models.
Represents an AI model loaded from an MXQ file.
Definition model.py:113
DataType get_model_input_data_type(self)
Returns a data type for model inputs.
Definition model.py:594
None launch(self, Accelerator acc)
Launches the model on the specified Accelerator, which represents the actual NPU.
Definition model.py:144
None reposition_outputs(self, List[Buffer] output_bufs, List[np.ndarray] outputs, List[List[int]] seqlens=[])
Reposition output.
Definition model.py:505
List[Scale] get_input_scale(self)
Returns the input quantization scale(s) of the model.
Definition model.py:562
CoreMode get_core_mode(self)
Retrieves the core mode of the model.
Definition model.py:174
List[CoreId] get_target_cores(self)
Returns the NPU cores the model is configured to use.
Definition model.py:182
List[_Shape] _input_shape
Definition model.py:141
List[CoreId] target_cores(self)
Definition model.py:191
List[_Shape] _output_shape
Definition model.py:142
List[Buffer] acquire_input_buffer(self, List[List[int]] seqlens=[])
Buffer Management API.
Definition model.py:610
List[BufferInfo] get_output_buffer_info(self)
Returns the output buffer information of the model.
Definition model.py:586
str get_model_path(self)
Returns the path to the MXQ model file associated with the Model.
Definition model.py:653
Future infer_async(self, Union[np.ndarray, List[np.ndarray]] inputs)
Asynchronous Inference.
Definition model.py:414
DataType get_model_output_data_type(self)
Returns a data type for model outputs.
Definition model.py:602
List[Buffer] acquire_output_buffer(self, List[List[int]] seqlens=[])
Buffer Management API.
Definition model.py:621
int filter_cache_tail(self, int cache_size, int tail_size, List[bool] mask)
Filter the tail of the KV cache memory.
Definition model.py:727
List[np.ndarray] _infer_to_float(self, Union[np.ndarray, List[np.ndarray],] inputs, int cache_size, Optional[bool] is_target_hwc=None)
int8_t-to-float inference Performs inference with input and output elements of type int8_t
Definition model.py:346
int get_num_model_variants(self)
Returns the total number of model variants available in this model.
Definition model.py:518
None load_cache_memory_from(self, str cache_dir, int cache_id=0)
Loads the KV cache memory from files in the specified directory.
Definition model.py:714
bool is_target(self, CoreId core_id)
Checks if the NPU core specified by CoreId is the target of the model.
Definition model.py:163
None infer_speedrun(self)
Development-only API for measuring pure NPU inference speed.
Definition model.py:403
int get_latency_consumed(self)
Definition model.py:669
None dump_cache_memory_to(self, str cache_dir, int cache_id=0)
Dumps KV cache memory to files in the specified directory.
Definition model.py:702
Future infer_async_to_float(self, Union[np.ndarray, List[np.ndarray]] inputs)
This method supports int8_t-to-float asynchronous inference.
Definition model.py:468
ModelVariantHandle get_model_variant_handle(self, variant_idx)
Retrieves a handle to the specified model variant.
Definition model.py:529
None load_cache_memory(self, List[bytes] bufs, int cache_id=0)
Loads the KV cache memory from buffers.
Definition model.py:690
List[_Shape] get_model_output_shape(self)
Returns the output shape of the model.
Definition model.py:554
List[CacheInfo] get_cache_infos(self)
Returns information about the KV caches of the model.
Definition model.py:661
int get_latency_finished(self)
Definition model.py:673
None release_buffer(self, List[Buffer] buffer)
Buffer Management API.
Definition model.py:632
List[bytes] dump_cache_memory(self, int cache_id=0)
Dumps the KV cache memory into buffers.
Definition model.py:677
None dispose(self)
Disposes of the model loaded onto the NPU.
Definition model.py:154
List[_Shape] get_model_input_shape(self)
Returns the input shape of the model.
Definition model.py:546
List[Scale] get_output_scale(self)
Returns the output quantization scale(s) of the model.
Definition model.py:570
None infer_buffer(self, List[Buffer] inputs, List[Buffer] outputs, List[List[int]] shape=[], int cache_size=0)
Buffer-to-Buffer inference.
Definition model.py:381
List[np.ndarray] infer_to_float(self, Union[np.ndarray, List[np.ndarray],] inputs, int cache_size=0)
int8_t-to-float inference Performs inference with input and output elements of type int8_t
Definition model.py:306
__init__(self, str path, Optional[ModelConfig] model_config=None)
Creates a Model object from the specified MXQ model file and configuration.
Definition model.py:121
Optional[List[np.ndarray]] _infer(self, Union[np.ndarray, List[np.ndarray]] inputs, Optional[List[np.ndarray]] outputs, int cache_size, Optional[bool] is_target_hwc=None, Optional[List[BatchParam]] params=None)
Definition model.py:250
Optional[List[np.ndarray]] infer(self, Union[np.ndarray, List[np.ndarray]] inputs, Optional[List[np.ndarray]] outputs=None, int cache_size=0, Optional[List[BatchParam]] params=None)
Performs inference.
Definition model.py:201
None reposition_inputs(self, List[np.ndarray] inputs, List[Buffer] input_bufs, List[List[int]] seqlens=[])
Reposition input.
Definition model.py:493
List[BufferInfo] get_input_buffer_info(self)
Returns the input buffer information for the model.
Definition model.py:578
int get_identifier(self)
Returns the model's unique identifier.
Definition model.py:642
int move_cache_tail(self, int num_head, int num_tail, int cache_size)
Moves the tail of the KV cache memory to the end of the head.
Definition model.py:743
A simple byte-sized buffer.
Definition type.py:171
Defines the core mode for NPU execution.
Definition type.py:196
Model load(str path, Optional[ModelConfig] model_config=None)
Single-step inference API.
Definition model.py:760