From 3bfef7e596f0036402050a43c6a4ce0d8b589cd6 Mon Sep 17 00:00:00 2001 From: wanglusheng Date: Thu, 5 Dec 2024 15:30:35 +0800 Subject: [PATCH] init version --- axengine/__init__.py | 8 + axengine/_capi.py | 323 +++++++++++++++++++++++++++++++++++ axengine/_chip.py | 31 ++++ axengine/_node.py | 13 ++ axengine/_types.py | 63 +++++++ axengine/session.py | 341 +++++++++++++++++++++++++++++++++++++ examples/classification.py | 86 ++++++++++ setup.py | 22 +++ 8 files changed, 887 insertions(+) create mode 100644 axengine/__init__.py create mode 100644 axengine/_capi.py create mode 100644 axengine/_chip.py create mode 100644 axengine/_node.py create mode 100644 axengine/_types.py create mode 100644 axengine/session.py create mode 100644 examples/classification.py create mode 100644 setup.py diff --git a/axengine/__init__.py b/axengine/__init__.py new file mode 100644 index 0000000..198ad7e --- /dev/null +++ b/axengine/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved. +# +# This source file is the property of Axera Semiconductor Co., Ltd. and +# may not be copied or distributed in any isomorphic form without the prior +# written consent of Axera Semiconductor Co., Ltd. +# + +from .session import InferenceSession diff --git a/axengine/_capi.py b/axengine/_capi.py new file mode 100644 index 0000000..dd40bc1 --- /dev/null +++ b/axengine/_capi.py @@ -0,0 +1,323 @@ +# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved. +# +# This source file is the property of Axera Semiconductor Co., Ltd. and +# may not be copied or distributed in any isomorphic form without the prior +# written consent of Axera Semiconductor Co., Ltd. 
+# + +import ctypes.util +import platform + +from cffi import FFI + +__all__: ["S", "M", "E", "N"] + +M = FFI() + +# ax_base_type.h +M.cdef( + """ + typedef int AX_S32; + typedef unsigned int AX_U32; + typedef unsigned long long int AX_U64; + typedef signed char AX_S8; + typedef void AX_VOID; +""" +) + +# ax_sys_api.h +M.cdef( + """ + AX_S32 AX_SYS_Init(AX_VOID); + AX_S32 AX_SYS_Deinit(AX_VOID); + AX_S32 AX_SYS_MemAllocCached(AX_U64 *phyaddr, AX_VOID **pviraddr, AX_U32 size, AX_U32 align, const AX_S8 *token); + AX_S32 AX_SYS_MemFree(AX_U64 phyaddr, AX_VOID *pviraddr); + AX_S32 AX_SYS_MflushCache(AX_U64 phyaddr, AX_VOID *pviraddr, AX_U32 size); + AX_S32 AX_SYS_MinvalidateCache(AX_U64 phyaddr, AX_VOID *pviraddr, AX_U32 size); +""" +) + +sys_name = "ax_sys" +sys_path = ctypes.util.find_library(sys_name) +assert ( + sys_path is not None +), f"Failed to find library {sys_name}. Please ensure it is installed and in the library path." + +S = M.dlopen(sys_path) +assert S is not None, f"Failed to load library {sys_path}. Please ensure it is installed and in the library path." 
+ +N = FFI() + +# ax_base_type.h +N.cdef( + """ + typedef unsigned long long int AX_U64; + typedef unsigned int AX_U32; + typedef unsigned char AX_U8; + typedef int AX_S32; + typedef signed char AX_S8; + typedef char AX_CHAR; + typedef void AX_VOID; + + typedef enum { + AX_FALSE = 0, + AX_TRUE = 1, + } AX_BOOL; +""" +) + +# ax_engine_type.h, base type +N.cdef( + """ + typedef AX_U32 AX_ENGINE_NPU_SET_T; +""" +) + +# ax_engine_type.h, enum +N.cdef( + """ + typedef enum _AX_ENGINE_TENSOR_LAYOUT_E + { + AX_ENGINE_TENSOR_LAYOUT_UNKNOWN = 0, + AX_ENGINE_TENSOR_LAYOUT_NHWC = 1, + AX_ENGINE_TENSOR_LAYOUT_NCHW = 2, + } AX_ENGINE_TENSOR_LAYOUT_T; + + typedef enum + { + AX_ENGINE_MT_PHYSICAL = 0, + AX_ENGINE_MT_VIRTUAL = 1, + AX_ENGINE_MT_OCM = 2, + } AX_ENGINE_MEMORY_TYPE_T; + + typedef enum + { + AX_ENGINE_DT_UNKNOWN = 0, + AX_ENGINE_DT_UINT8 = 1, + AX_ENGINE_DT_UINT16 = 2, + AX_ENGINE_DT_FLOAT32 = 3, + AX_ENGINE_DT_SINT16 = 4, + AX_ENGINE_DT_SINT8 = 5, + AX_ENGINE_DT_SINT32 = 6, + AX_ENGINE_DT_UINT32 = 7, + AX_ENGINE_DT_FLOAT64 = 8, + AX_ENGINE_DT_BFLOAT16 = 9, + AX_ENGINE_DT_UINT10_PACKED = 100, + AX_ENGINE_DT_UINT12_PACKED = 101, + AX_ENGINE_DT_UINT14_PACKED = 102, + AX_ENGINE_DT_UINT16_PACKED = 103, + } AX_ENGINE_DATA_TYPE_T; + + typedef enum + { + AX_ENGINE_CS_FEATUREMAP = 0, + AX_ENGINE_CS_RAW8 = 12, + AX_ENGINE_CS_RAW10 = 1, + AX_ENGINE_CS_RAW12 = 2, + AX_ENGINE_CS_RAW14 = 11, + AX_ENGINE_CS_RAW16 = 3, + AX_ENGINE_CS_NV12 = 4, + AX_ENGINE_CS_NV21 = 5, + AX_ENGINE_CS_RGB = 6, + AX_ENGINE_CS_BGR = 7, + AX_ENGINE_CS_RGBA = 8, + AX_ENGINE_CS_GRAY = 9, + AX_ENGINE_CS_YUV444 = 10, + } AX_ENGINE_COLOR_SPACE_T; +""" +) + +# ax_engine_type.h, architecturally agnostic struct +N.cdef( + """ + typedef enum { + AX_ENGINE_VIRTUAL_NPU_DISABLE = 0, + } AX_ENGINE_NPU_MODE_T; + + typedef enum { + AX_ENGINE_MODEL_TYPE0 = 0, + } AX_ENGINE_MODEL_TYPE_T; + + typedef struct { + AX_ENGINE_NPU_MODE_T eHardMode; + AX_U32 reserve[8]; + } AX_ENGINE_NPU_ATTR_T; + + typedef struct 
_AX_ENGINE_IO_META_EX_T + { + AX_ENGINE_COLOR_SPACE_T eColorSpace; + AX_U64 u64Reserved[18]; + } AX_ENGINE_IO_META_EX_T; + + typedef struct { + AX_ENGINE_NPU_SET_T nNpuSet; + AX_S8* pName; + AX_U32 reserve[8]; + } AX_ENGINE_HANDLE_EXTRA_T; + + typedef struct _AX_ENGINE_CMM_INFO_T + { + AX_U32 nCMMSize; + } AX_ENGINE_CMM_INFO_T; + + typedef struct _AX_ENGINE_IO_SETTING_T + { + AX_U32 nWbtIndex; + AX_U64 u64Reserved[7]; + }AX_ENGINE_IO_SETTING_T; +""" +) + +# check architecture, 32bit or 64bit +arch = platform.architecture()[0] + +# ax_engine_type.h, struct +if arch == "64bit": + N.cdef( + """ + typedef struct _AX_ENGINE_IO_META_T + { + AX_CHAR* pName; + AX_S32* pShape; + AX_U8 nShapeSize; + AX_ENGINE_TENSOR_LAYOUT_T eLayout; + AX_ENGINE_MEMORY_TYPE_T eMemoryType; + AX_ENGINE_DATA_TYPE_T eDataType; + AX_ENGINE_IO_META_EX_T* pExtraMeta; + AX_U32 nSize; + AX_U32 nQuantizationValue; + AX_S32* pStride; + AX_U64 u64Reserved[9]; + } AX_ENGINE_IO_META_T; + + typedef struct _AX_ENGINE_IO_INFO_T + { + AX_ENGINE_IO_META_T* pInputs; + AX_U32 nInputSize; + AX_ENGINE_IO_META_T* pOutputs; + AX_U32 nOutputSize; + AX_U32 nMaxBatchSize; + AX_BOOL bDynamicBatchSize; + AX_U64 u64Reserved[11]; + } AX_ENGINE_IO_INFO_T; + + typedef struct _AX_ENGINE_IO_BUFFER_T + { + AX_U64 phyAddr; + AX_VOID* pVirAddr; + AX_U32 nSize; + AX_S32* pStride; + AX_U8 nStrideSize; + AX_U64 u64Reserved[11]; + } AX_ENGINE_IO_BUFFER_T; + + typedef struct _AX_ENGINE_IO_T + { + AX_ENGINE_IO_BUFFER_T* pInputs; + AX_U32 nInputSize; + AX_ENGINE_IO_BUFFER_T* pOutputs; + AX_U32 nOutputSize; + AX_U32 nBatchSize; + AX_ENGINE_IO_SETTING_T* pIoSetting; + AX_U64 u64Reserved[10]; + } AX_ENGINE_IO_T; + """ + ) +else: + N.cdef( + """ + typedef struct _AX_ENGINE_IO_META_T + { + AX_CHAR* pName; + AX_S32* pShape; + AX_U8 nShapeSize; + AX_ENGINE_TENSOR_LAYOUT_T eLayout; + AX_ENGINE_MEMORY_TYPE_T eMemoryType; + AX_ENGINE_DATA_TYPE_T eDataType; + AX_ENGINE_IO_META_EX_T* pExtraMeta; + AX_U32 nSize; + AX_U32 nQuantizationValue; + 
AX_S32* pStride; + AX_U64 u64Reserved[11]; + } AX_ENGINE_IO_META_T; + + typedef struct _AX_ENGINE_IO_INFO_T + { + AX_ENGINE_IO_META_T* pInputs; + AX_U32 nInputSize; + AX_ENGINE_IO_META_T* pOutputs; + AX_U32 nOutputSize; + AX_U32 nMaxBatchSize; + AX_BOOL bDynamicBatchSize; + AX_U64 u64Reserved[13]; + } AX_ENGINE_IO_INFO_T; + + typedef struct _AX_ENGINE_IO_BUFFER_T + { + AX_U64 phyAddr; + AX_VOID* pVirAddr; + AX_U32 nSize; + AX_S32* pStride; + AX_U8 nStrideSize; + AX_U64 u64Reserved[13]; + } AX_ENGINE_IO_BUFFER_T; + + typedef struct _AX_ENGINE_IO_T + { + AX_ENGINE_IO_BUFFER_T* pInputs; + AX_U32 nInputSize; + AX_ENGINE_IO_BUFFER_T* pOutputs; + AX_U32 nOutputSize; + AX_U32 nBatchSize; + AX_ENGINE_IO_SETTING_T* pIoSetting; + AX_U64 u64Reserved[12]; + } AX_ENGINE_IO_T; + """ + ) + +# ax_engine_api.h +N.cdef( + """ + const AX_CHAR* AX_ENGINE_GetVersion(AX_VOID); + + AX_VOID AX_ENGINE_NPUReset(AX_VOID); + AX_S32 AX_ENGINE_Init(AX_ENGINE_NPU_ATTR_T* pNpuAttr); + AX_S32 AX_ENGINE_GetVNPUAttr(AX_ENGINE_NPU_ATTR_T* pNpuAttr); + AX_S32 AX_ENGINE_Deinit(AX_VOID); + + AX_S32 AX_ENGINE_GetModelType(const AX_VOID* pData, AX_U32 nDataSize, AX_ENGINE_MODEL_TYPE_T* pModelType); + + AX_S32 AX_ENGINE_CreateHandleV2(uint64_t** pHandle, const AX_VOID* pData, AX_U32 nDataSize, AX_ENGINE_HANDLE_EXTRA_T* pExtraParam); + AX_S32 AX_ENGINE_DestroyHandle(uint64_t* nHandle); + + AX_S32 AX_ENGINE_GetIOInfo(uint64_t* nHandle, AX_ENGINE_IO_INFO_T** pIO); + AX_S32 AX_ENGINE_GetGroupIOInfoCount(uint64_t* nHandle, AX_U32* pCount); + AX_S32 AX_ENGINE_GetGroupIOInfo(uint64_t* nHandle, AX_U32 nIndex, AX_ENGINE_IO_INFO_T** pIO); + + AX_S32 AX_ENGINE_GetHandleModelType(uint64_t* nHandle, AX_ENGINE_MODEL_TYPE_T* pModelType); + + AX_S32 AX_ENGINE_CreateContextV2(uint64_t* nHandle, uint64_t** pContext); + + AX_S32 AX_ENGINE_RunSyncV2(uint64_t* handle, uint64_t* context, AX_ENGINE_IO_T* pIO); + AX_S32 AX_ENGINE_RunGroupIOSync(uint64_t* handle, uint64_t* context, AX_U32 nIndex, AX_ENGINE_IO_T* pIO); + + AX_S32 
AX_ENGINE_SetAffinity(uint64_t* nHandle, AX_ENGINE_NPU_SET_T nNpuSet);
+    AX_S32 AX_ENGINE_GetAffinity(uint64_t* nHandle, AX_ENGINE_NPU_SET_T* pNpuSet);
+
+    AX_S32 AX_ENGINE_GetCMMUsage(uint64_t* nHandle, AX_ENGINE_CMM_INFO_T* pCMMInfo);
+
+    const AX_CHAR* AX_ENGINE_GetModelToolsVersion(uint64_t* nHandle);
+
+    // internal use api, remember no question
+    AX_S32 AX_ENGINE_GetTotalOps();
+"""
+)
+
+engine_name = "ax_engine"
+engine_path = ctypes.util.find_library(engine_name)
+assert (
+    engine_path is not None
+), f"Failed to find library {engine_name}. Please ensure it is installed and in the library path."
+
+E = N.dlopen(engine_path)
+assert E is not None, f"Failed to load library {engine_path}. Please ensure it is installed and in the library path."
diff --git a/axengine/_chip.py b/axengine/_chip.py
new file mode 100644
index 0000000..e770ed4
--- /dev/null
+++ b/axengine/_chip.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved.
+#
+# This source file is the property of Axera Semiconductor Co., Ltd. and
+# may not be copied or distributed in any isomorphic form without the prior
+# written consent of Axera Semiconductor Co., Ltd.
+#
+
+from . import _types
+from ._capi import E as _lib
+
+__all__ = ["T"]
+
+
+def function_exists(lib, func_name):
+    try:
+        getattr(lib, func_name)
+        return True
+    except AttributeError:
+        return False
+
+
+def check_chip_type(clib):
+    if not function_exists(clib, "AX_ENGINE_SetAffinity"):
+        return _types.ChipType.M57H
+    elif not function_exists(clib, "AX_ENGINE_GetTotalOps"):
+        return _types.ChipType.MC50
+    else:
+        return _types.ChipType.MC20E
+
+
+T = check_chip_type(_lib)
diff --git a/axengine/_node.py b/axengine/_node.py
new file mode 100644
index 0000000..cf0459e
--- /dev/null
+++ b/axengine/_node.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved.
+#
+# This source file is the property of Axera Semiconductor Co., Ltd. 
and +# may not be copied or distributed in any isomorphic form without the prior +# written consent of Axera Semiconductor Co., Ltd. +# + + +class NodeArg(object): + def __init__(self, name, dtype, shape): + self.name = name + self.dtype = dtype + self.shape = shape diff --git a/axengine/_types.py b/axengine/_types.py new file mode 100644 index 0000000..d054a92 --- /dev/null +++ b/axengine/_types.py @@ -0,0 +1,63 @@ +# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved. +# +# This source file is the property of Axera Semiconductor Co., Ltd. and +# may not be copied or distributed in any isomorphic form without the prior +# written consent of Axera Semiconductor Co., Ltd. +# + +from enum import Enum +import ml_dtypes as mldt +import numpy as np + + +class VNPUType(Enum): + DISABLED = 0 + ENABLED = 1 + BIG_LITTLE = 2 + LITTLE_BIG = 3 + + +class ModelType(Enum): + HALF = 0 # for MC20E, which means chip is AX630C(x), or AX620Q(x) + FULL = 1 # for MC20E + SINGLE = 0 # for MC50, which means chip is AX650A or AX650N, and M57H + DUAL = 1 # for MC50 + TRIPLE = 2 # for MC50 + + +class ChipType(Enum): + MC20E = 0 + MC50 = 1 + M57H = 2 + + +def get_data_type(engine_type): + if engine_type == ChipType.MC20E: + return ModelType.HALF + elif engine_type == ChipType.MC50: + return ModelType.SINGLE + elif engine_type == ChipType.M57H: + return ModelType.SINGLE + else: + raise ValueError("Invalid engine type: %s" % engine_type) + + +def _transform_dtype(ffi, lib, dtype): + if dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_UINT8): + return np.dtype(np.uint8) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_SINT8): + return np.dtype(np.int8) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_UINT16): + return np.dtype(np.uint16) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_SINT16): + return np.dtype(np.int16) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_UINT32): + return 
np.dtype(np.uint32) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_SINT32): + return np.dtype(np.int32) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_FLOAT32): + return np.dtype(np.float32) + elif dtype == ffi.cast("AX_ENGINE_DATA_TYPE_T", lib.AX_ENGINE_DT_BFLOAT16): + return np.dtype(mldt.bfloat16) + else: + raise ValueError(f"Unsupported data type '{dtype}'.") diff --git a/axengine/session.py b/axengine/session.py new file mode 100644 index 0000000..0e16142 --- /dev/null +++ b/axengine/session.py @@ -0,0 +1,341 @@ +# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved. +# +# This source file is the property of Axera Semiconductor Co., Ltd. and +# may not be copied or distributed in any isomorphic form without the prior +# written consent of Axera Semiconductor Co., Ltd. +# + +from ._types import VNPUType, ModelType, ChipType +from ._types import _transform_dtype +from ._node import NodeArg + +from . import _chip +from . import _capi + +import os +import numpy as np + +__all__: ["InferenceSession"] + + +class InferenceSession: + def __init__( + self, + path_or_bytes: str | bytes | os.PathLike, + ) -> None: + # load shared library + self._sys_lib = _capi.S + self._sys_ffi = _capi.M + self._engine_lib = _capi.E + self._engine_ffi = _capi.N + + # chip type + self._chip_type = _chip.T + print(f"[INFO] Chip type: {self._chip_type}") + + # handle, context, info, io + self._handle = self._engine_ffi.new("uint64_t **") + self._context = self._engine_ffi.new("uint64_t **") + self._io = self._engine_ffi.new("AX_ENGINE_IO_T *") + + # init ax sys & engine + ret = self._init() + if 0 != ret: + raise RuntimeError("Failed to initialize engine.") + print(f"[INFO] Engine version: {self._get_version()}") + + # get vnpu type + self._vnpu_type = self._get_vnpu_type() + print(f"[INFO] VNPU type: {self._vnpu_type}") + + # model buffer, almost copied from onnx runtime + if isinstance(path_or_bytes, (str, os.PathLike)): + 
self._model_name = os.path.splitext(os.path.basename(path_or_bytes))[0] + with open(path_or_bytes, "rb") as f: + data = f.read() + self._model_buffer = self._engine_ffi.new("char[]", data) + self._model_buffer_size = len(data) + elif isinstance(path_or_bytes, bytes): + self._model_buffer = self._engine_ffi.new("char[]", path_or_bytes) + self._model_buffer_size = len(path_or_bytes) + else: + raise TypeError(f"Unable to load model from type '{type(path_or_bytes)}'") + + # get model type + self._model_type = self._get_model_type() + if self._chip_type is ChipType.MC20E: + if self._model_type is ModelType.FULL: + print(f"[INFO] Model type: {self._model_type.value} (full core)") + if self._model_type is ModelType.HALF: + print(f"[INFO] Model type: {self._model_type.value} (half core)") + if self._chip_type is ChipType.MC50: + if self._model_type is ModelType.SINGLE: + print(f"[INFO] Model type: {self._model_type.value} (single core)") + if self._model_type is ModelType.DUAL: + print(f"[INFO] Model type: {self._model_type.value} (dual core)") + if self._model_type is ModelType.TRIPLE: + print(f"[INFO] Model type: {self._model_type.value} (triple core)") + if self._chip_type is ChipType.M57H: + print(f"[INFO] Model type: {self._model_type.value} (single core)") + + # check model type + if self._chip_type is ChipType.MC50: + # all types (single or dual or triple) of model are allowed in vnpu mode disabled + # only single core model is allowed in vnpu mode enabled + # only triple core model is NOT allowed in vnpu mode big-little or little-big + if self._vnpu_type is VNPUType.ENABLED: + if self._model_type is not ModelType.SINGLE: + raise ValueError( + f"Model type '{self._model_type}' is not allowed when vnpu is inited as {self._vnpu_type}." 
+ ) + if self._vnpu_type is VNPUType.BIG_LITTLE or self._vnpu_type is VNPUType.LITTLE_BIG: + if self._model_type is ModelType.TRIPLE: + raise ValueError( + f"Model type '{self._model_type}' is not allowed when vnpu is inited as {self._vnpu_type}." + ) + if self._chip_type is ChipType.MC20E: + # all types of full or half core model are allowed in vnpu mode disabled + # only half core model is allowed in vnpu mode enabled + if self._vnpu_type is VNPUType.ENABLED: + if self._model_type is ModelType.FULL: + raise ValueError( + f"Model type '{self._model_type}' is not allowed when vnpu is inited as {self._vnpu_type}." + ) + # if self._chip_type is ChipType.M57H: + # there only one type of model will be compiled, so no need to check + + # load model + ret = self._load() + if 0 != ret: + raise RuntimeError("Failed to load model.") + print(f"[INFO] Compiler version: {self._get_model_tool_version()}") + + # get shape group count + self._shape_count = self._get_shape_count() + + # get model shape + self._info = self._get_info() + self._inputs = self._get_inputs() + self._outputs = self._get_outputs() + + # fill model io + self._align = 128 + self._cmm_token = self._engine_ffi.new("AX_S8[]", b"PyEngine") + self._io[0].nInputSize = len(self.get_inputs()) + self._io[0].nOutputSize = len(self.get_outputs()) + self._io[0].pInputs = self._engine_ffi.new("AX_ENGINE_IO_BUFFER_T[{}]".format(self._io[0].nInputSize)) + self._io[0].pOutputs = self._engine_ffi.new("AX_ENGINE_IO_BUFFER_T[{}]".format(self._io[0].nOutputSize)) + for i in range(len(self.get_inputs())): + max_buf = 0 + for j in range(self._shape_count): + max_buf = max(max_buf, self._info[j][0].pInputs[i].nSize) + self._io[0].pInputs[i].nSize = max_buf + phy = self._engine_ffi.new("AX_U64*") + vir = self._engine_ffi.new("AX_VOID**") + ret = self._sys_lib.AX_SYS_MemAllocCached( + phy, vir, self._io[0].pInputs[i].nSize, self._align, self._cmm_token + ) + if 0 != ret: + raise RuntimeError("Failed to allocate memory for input.") 
+ self._io[0].pInputs[i].phyAddr = phy[0] + self._io[0].pInputs[i].pVirAddr = vir[0] + for i in range(len(self.get_outputs())): + max_buf = 0 + for j in range(self._shape_count): + max_buf = max(max_buf, self._info[j][0].pOutputs[i].nSize) + self._io[0].pOutputs[i].nSize = max_buf + phy = self._engine_ffi.new("AX_U64*") + vir = self._engine_ffi.new("AX_VOID**") + ret = self._sys_lib.AX_SYS_MemAllocCached( + phy, vir, self._io[0].pOutputs[i].nSize, self._align, self._cmm_token + ) + if 0 != ret: + raise RuntimeError("Failed to allocate memory for output.") + self._io[0].pOutputs[i].phyAddr = phy[0] + self._io[0].pOutputs[i].pVirAddr = vir[0] + + def __del__(self): + self._final() + + def _init(self, vnpu=VNPUType.DISABLED): # vnpu type, the default is disabled + ret = self._sys_lib.AX_SYS_Init() + if 0 != ret: + raise RuntimeError("Failed to initialize system.") + + # get vnpu type first, check if npu was initialized + vnpu_type = self._engine_ffi.new("AX_ENGINE_NPU_ATTR_T *") + ret = self._engine_lib.AX_ENGINE_GetVNPUAttr(vnpu_type) + if 0 != ret: + # this means the NPU was not initialized + vnpu_type.eHardMode = self._engine_ffi.cast("AX_ENGINE_NPU_MODE_T", vnpu.value) + + return self._engine_lib.AX_ENGINE_Init(vnpu_type) + + def _final(self): + if self._handle[0] is not None: + self._unload() + self._engine_lib.AX_ENGINE_Deinit() + return self._sys_lib.AX_SYS_Deinit() + + def _get_version(self): + engine_version = self._engine_lib.AX_ENGINE_GetVersion() + return self._engine_ffi.string(engine_version).decode("utf-8") + + def _get_vnpu_type(self) -> VNPUType: + vnpu_type = self._engine_ffi.new("AX_ENGINE_NPU_ATTR_T *") + ret = self._engine_lib.AX_ENGINE_GetVNPUAttr(vnpu_type) + if 0 != ret: + raise RuntimeError("Failed to get VNPU attribute.") + return VNPUType(vnpu_type.eHardMode) + + def _get_model_type(self) -> ModelType: + model_type = self._engine_ffi.new("AX_ENGINE_MODEL_TYPE_T *") + ret = self._engine_lib.AX_ENGINE_GetModelType(self._model_buffer, 
self._model_buffer_size, model_type) + if 0 != ret: + raise RuntimeError("Failed to get model type.") + return ModelType(model_type[0]) + + def _get_model_tool_version(self): + model_tool_version = self._engine_lib.AX_ENGINE_GetModelToolsVersion(self._handle[0]) + return self._engine_ffi.string(model_tool_version).decode("utf-8") + + def _load(self): + extra = self._engine_ffi.new("AX_ENGINE_HANDLE_EXTRA_T *") + extra_name = self._engine_ffi.new("char[]", self._model_name.encode("utf-8")) + extra.pName = extra_name + + # for onnx runtime do not support one model multiple context running in multi-thread as far as I know, so + # the engine handle and context will create only once + ret = self._engine_lib.AX_ENGINE_CreateHandleV2( + self._handle, self._model_buffer, self._model_buffer_size, extra + ) + if 0 == ret: + ret = self._engine_lib.AX_ENGINE_CreateContextV2(self._handle[0], self._context) + return ret + + def _get_info(self): + total_info = [] + if 1 == self._shape_count: + info = self._engine_ffi.new("AX_ENGINE_IO_INFO_T **") + ret = self._engine_lib.AX_ENGINE_GetIOInfo(self._handle[0], info) + if 0 != ret: + raise RuntimeError("Failed to get model shape.") + total_info.append(info) + else: + for i in range(self._shape_count): + info = self._engine_ffi.new("AX_ENGINE_IO_INFO_T **") + ret = self._engine_lib.AX_ENGINE_GetGroupIOInfo(self._handle[0], i, info) + if 0 != ret: + raise RuntimeError(f"Failed to get model the {i}th shape.") + total_info.append(info) + return total_info + + def _get_shape_count(self): + count = self._engine_ffi.new("AX_U32 *") + ret = self._engine_lib.AX_ENGINE_GetGroupIOInfoCount(self._handle[0], count) + if 0 != ret: + raise RuntimeError("Failed to get model shape group.") + return count[0] + + def _unload(self): + return self._engine_lib.AX_ENGINE_DestroyHandle(self._handle[0]) + + def _get_inputs(self): + inputs = [] + for group in range(self._shape_count): + one_group_input = [] + for index in 
range(self._info[group][0].nInputSize): + current_input = self._info[group][0].pInputs[index] + name = self._engine_ffi.string(current_input.pName).decode("utf-8") + shape = [] + for i in range(current_input.nShapeSize): + shape.append(current_input.pShape[i]) + dtype = _transform_dtype(self._engine_ffi, self._engine_lib, current_input.eDataType) + meta = NodeArg(name, dtype, shape) + one_group_input.append(meta) + inputs.append(one_group_input) + return inputs + + def _get_outputs(self): + outputs = [] + for group in range(self._shape_count): + one_group_output = [] + for index in range(self._info[group][0].nOutputSize): + current_output = self._info[group][0].pOutputs[index] + name = self._engine_ffi.string(current_output.pName).decode("utf-8") + shape = [] + for i in range(current_output.nShapeSize): + shape.append(current_output.pShape[i]) + dtype = _transform_dtype(self._engine_ffi, self._engine_lib, current_output.eDataType) + meta = NodeArg(name, dtype, shape) + one_group_output.append(meta) + outputs.append(one_group_output) + return outputs + + def get_inputs(self, shape_group=0) -> list[NodeArg]: + if shape_group > self._shape_count: + raise ValueError(f"Shape group '{shape_group}' is out of range, total {self._shape_count}.") + selected_info = self._inputs[shape_group] + return selected_info + + def get_outputs(self, shape_group=0) -> list[NodeArg]: + if shape_group > self._shape_count: + raise ValueError(f"Shape group '{shape_group}' is out of range, total {self._shape_count}.") + selected_info = self._outputs[shape_group] + return selected_info + + # copy from onnxruntime + def _validate_input(self, feed_input_names): + missing_input_names = [] + for i in self.get_inputs(): + if i.name not in feed_input_names: + missing_input_names.append(i.name) + if missing_input_names: + raise ValueError( + f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})." 
+ ) + + def _validate_output(self, output_names): + if output_names is not None: + for name in output_names: + if name not in [o.name for o in self.get_outputs()]: + raise ValueError(f"Output name '{name}' is not registered.") + + def run(self, output_names, input_feed, run_options=None): + self._validate_input(list(input_feed.keys())) + self._validate_output(output_names) + + if None is output_names: + output_names = [o.name for o in self.get_outputs()] + + # fill model io + for key, npy in input_feed.items(): + for i, one in enumerate(self.get_inputs()): + if one.name == key: + npy_ptr = self._engine_ffi.cast("void *", npy.ctypes.data) + self._engine_ffi.memmove(self._io[0].pInputs[i].pVirAddr, npy_ptr, npy.nbytes) + self._sys_lib.AX_SYS_MflushCache( + self._io[0].pInputs[i].phyAddr, self._io[0].pInputs[i].pVirAddr, self._io[0].pInputs[i].nSize + ) + break + + # execute model + ret = self._engine_lib.AX_ENGINE_RunSyncV2(self._handle[0], self._context[0], self._io) + + # flush output + outputs = [] + if 0 == ret: + for i in range(len(self.get_outputs())): + self._sys_lib.AX_SYS_MinvalidateCache( + self._io[0].pOutputs[i].phyAddr, self._io[0].pOutputs[i].pVirAddr, self._io[0].pOutputs[i].nSize + ) + npy = np.frombuffer( + self._engine_ffi.buffer(self._io[0].pOutputs[i].pVirAddr, self._io[0].pOutputs[i].nSize), + dtype=self.get_outputs()[i].dtype, + ).reshape(self.get_outputs()[i].shape) + name = self.get_outputs()[i].name + if name in output_names: + outputs.append(npy) + return outputs + else: + raise RuntimeError("Failed to run model.") diff --git a/examples/classification.py b/examples/classification.py new file mode 100644 index 0000000..a87d516 --- /dev/null +++ b/examples/classification.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019-2024 Axera Semiconductor Co., Ltd. All Rights Reserved. +# +# This source file is the property of Axera Semiconductor Co., Ltd. 
and +# may not be copied or distributed in any isomorphic form without the prior +# written consent of Axera Semiconductor Co., Ltd. +# + +import axengine as axe +import numpy as np +from PIL import Image + + +def load_model(model_path): + session = axe.InferenceSession(model_path) + return session + + +def preprocess_image(image_path, target_size=(256, 256), crop_size=(224, 224)): + # Load the image + img = Image.open(image_path).convert("RGB") + + # Get original dimensions + original_width, original_height = img.size + + # Determine the shorter side and calculate the center crop + if original_width < original_height: + crop_area = original_width + else: + crop_area = original_height + + crop_x = (original_width - crop_area) // 2 + crop_y = (original_height - crop_area) // 2 + + # Crop the center square + img = img.crop((crop_x, crop_y, crop_x + crop_area, crop_y + crop_area)) + + # Resize the image to 256x256 + img = img.resize(target_size) + + # Crop the center 224x224 + crop_x = (target_size[0] - crop_size[0]) // 2 + crop_y = (target_size[1] - crop_size[1]) // 2 + img = img.crop((crop_x, crop_y, crop_x + crop_size[0], crop_y + crop_size[1])) + + # Convert to numpy array and change dtype to int + img_array = np.array(img).astype("uint8") + # Transpose to (1, C, H, W) + # img_array = np.transpose(img_array, (2, 0, 1)) + # img_array = np.expand_dims(img_array, axis=0) # Add batch dimension + return img_array + + +def get_top_k_predictions(output, k=5): + # Get top k predictions + top_k_indices = np.argsort(output[0])[-k:][::-1] + top_k_scores = output[0][top_k_indices] + return top_k_indices, top_k_scores + + +def main(model_path, image_path, target_size, crop_size, k): + # Load the model + session = load_model(model_path) + + # Preprocess the image + input_tensor = preprocess_image(image_path, target_size, crop_size) + + # Get input name and run inference + input_name = session.get_inputs()[0].name + output = session.run(None, {input_name: input_tensor}) + + # 
Get top k predictions
+    top_k_indices, top_k_scores = get_top_k_predictions(output, k)
+
+    # Print the results
+    print(f"Top {k} Predictions:")
+    for i in range(k):
+        print(f"Class Index: {top_k_indices[i]}, Score: {top_k_scores[i]}")
+
+
+if __name__ == "__main__":
+    MODEL_PATH = "/opt/data/npu/models/mobilenetv2.axmodel"
+    IMAGE_PATH = "/opt/data/npu/images/cat.jpg"
+    TARGET_SIZE = (256, 256)  # Resize to 256x256
+    CROP_SIZE = (224, 224)  # Crop to 224x224
+    K = 5  # Top K predictions
+    main(MODEL_PATH, IMAGE_PATH, TARGET_SIZE, CROP_SIZE, K)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..609e459
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,22 @@
+from setuptools import setup
+
+setup(
+    name="axengine",
+    version="0.1",
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
+        "Programming Language :: Python :: Implementation :: PyPy",
+    ],
+    packages=["axengine"],
+    ext_modules=[],
+    install_requires=["cffi>=1.0.0", "numpy>=1.22", "ml-dtypes>=0.1.0"],
+    setup_requires=["cffi>=1.0.0", "numpy>=1.22", "ml-dtypes>=0.1.0"],
+)