From 49c64aa5baf637273ab29cbbd5a3ba78fa9974f0 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 2 Sep 2024 13:25:25 +0800 Subject: [PATCH 01/18] check in pre-commit hooks --- .gitignore | 2 +- .pre-commit-config.yaml | 58 +++++++++++ README.md | 2 +- generate.sh | 2 +- requirements.txt | 1 + requirements/build.txt | 2 + requirements/runtime.txt | 1 + requirements/test.txt | 10 ++ setup.py | 157 +++++++++++++++++++++++++++++ src/turbomind/CMakeLists.txt | 2 +- src/turbomind/kernels/gemm/diff.py | 10 +- src/turbomind/kernels/gemm/plot.py | 89 ++++++++++++---- src/turbomind/utils/CMakeLists.txt | 2 +- turbomind/__init__.py | 1 + turbomind/linear.py | 6 +- turbomind/utils.py | 3 +- turbomind/version.py | 28 +++++ 17 files changed, 344 insertions(+), 32 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 requirements.txt create mode 100644 requirements/build.txt create mode 100644 requirements/runtime.txt create mode 100644 requirements/test.txt create mode 100644 setup.py create mode 100644 turbomind/version.py diff --git a/.gitignore b/.gitignore index 1aac461..2290706 100644 --- a/.gitignore +++ b/.gitignore @@ -33,4 +33,4 @@ .cache -/build \ No newline at end of file +/build diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..61d3b10 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,58 @@ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + name: yapf + description: 'Formatter for Python code' + entry: yapf + language: python + args: ['-i', '--style={based_on_style: pep8, column_limit: 79}'] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"] + + - repo: https://github.com/myint/docformatter + rev: v1.4 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-copyright + args: ["turbomind"] diff --git a/README.md b/README.md index 4a0f313..5f671f2 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# turbomind \ No newline at end of file +# turbomind diff --git a/generate.sh b/generate.sh index 46939e9..7ed3168 100755 --- a/generate.sh +++ b/generate.sh @@ -9,4 +9,4 @@ cmake -G Ninja .. 
\ -DFETCHCONTENT_UPDATES_DISCONNECTED=ON \ -DLMDEPLOY_ASAN_ENABLE=OFF \ -DLMDEPLOY_UBSAN_ENABLE=OFF \ - -DCMAKE_CUDA_ARCHITECTURES="80-real" \ No newline at end of file + -DCMAKE_CUDA_ARCHITECTURES="80-real" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e7b244f --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +-r runtime.txt diff --git a/requirements/build.txt b/requirements/build.txt new file mode 100644 index 0000000..28c4a7a --- /dev/null +++ b/requirements/build.txt @@ -0,0 +1,2 @@ +pybind11<=2.13.1 +setuptools diff --git a/requirements/runtime.txt b/requirements/runtime.txt new file mode 100644 index 0000000..12c6d5d --- /dev/null +++ b/requirements/runtime.txt @@ -0,0 +1 @@ +torch diff --git a/requirements/test.txt b/requirements/test.txt new file mode 100644 index 0000000..d06440a --- /dev/null +++ b/requirements/test.txt @@ -0,0 +1,10 @@ +allure-pytest +coverage +pynvml +pytest +pytest-assume +pytest-order +pytest-rerunfailures +pytest-sugar +pytest-xdist +pyyaml diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4061558 --- /dev/null +++ b/setup.py @@ -0,0 +1,157 @@ +import os +import re +import sys + +from setuptools import find_packages, setup + +pwd = os.path.dirname(__file__) +version_file = 'lmdeploy/version.py' + + +def readme(): + with open(os.path.join(pwd, 'README.md'), encoding='utf-8') as f: + content = f.read() + return content + + +def get_version(): + with open(os.path.join(pwd, version_file), 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def check_ext_modules(): + if os.path.exists(os.path.join(pwd, 'turbomind', 'lib')): + return True + return False + + +def get_cuda_pkgs(): + arg_name = '--cuda=' + arg_value = None + for arg in sys.argv[1:]: + if arg.startswith(arg_name): + arg_value = arg[len(arg_name):] + sys.argv.remove(arg) + break + + cuda_pkgs = [] + if arg_value == '11': + cuda_pkgs = [ + 'nvidia-nccl-cu11', 'nvidia-cuda-runtime-cu11', + 'nvidia-cublas-cu11', 'nvidia-curand-cu11' + ] + elif arg_value == '12': + cuda_pkgs = [ + 'nvidia-nccl-cu12', 'nvidia-cuda-runtime-cu12', + 'nvidia-cublas-cu12', 'nvidia-curand-cu12' + ] + return cuda_pkgs + + +cuda_pkgs = get_cuda_pkgs() + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a file but strips specific + versioning information. 
+ + Args: + fname (str): path to the file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if os.path.exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + packages += cuda_pkgs + return packages + + +if __name__ == '__main__': + setup(name='turbomind', + version=get_version(), + description='CUDA kernels used for LLM Quantization', + long_description=readme(), + long_description_content_type='text/markdown', + author='OpenMMLab', + author_email='openmmlab@gmail.com', + packages=find_packages(exclude=()), + include_package_data=True, + setup_requires=parse_requirements('requirements/build.txt'), + tests_require=parse_requirements('requirements/test.txt'), + install_requires=parse_requirements('requirements/runtime.txt'), + extras_require={ + 'all': parse_requirements('requirements.txt'), + }, + has_ext_modules=check_ext_modules, + classifiers=[ + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + ]) diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index 289b362..ed840d6 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -2,4 +2,4 @@ add_subdirectory(utils) add_subdirectory(kernels/gemm) -add_subdirectory(api/python) \ No newline at end of file +add_subdirectory(api/python) diff --git a/src/turbomind/kernels/gemm/diff.py b/src/turbomind/kernels/gemm/diff.py index 01824cf..e233c46 100644 --- a/src/turbomind/kernels/gemm/diff.py +++ b/src/turbomind/kernels/gemm/diff.py @@ -1,7 +1,9 @@ 
-import fire import json + +import fire import prettytable + def get_tag(state, tag): for x in state['summaries']: if x['tag'] == tag: @@ -9,7 +11,7 @@ def get_tag(state, tag): def main(path_a: str, path_b: str): - + with open(path_a, 'r') as f: a = json.load(f) with open(path_b, 'r') as f: @@ -38,7 +40,7 @@ def main(path_a: str, path_b: str): x = by_bs.get(bs, [0, 0, 0]) by_bs[bs] = x[0] + 1, x[1] + flops, x[2] + diff idx_4 = int(idx[4:]) % 4 - x = by_idx_4.get(idx_4, [0,0,0]) + x = by_idx_4.get(idx_4, [0, 0, 0]) by_idx_4[idx_4] = x[0] + 1, x[1] + flops, x[2] + diff by_idx_tab = prettytable.PrettyTable(['idx', 'flops', 'diff']) @@ -67,4 +69,4 @@ def main(path_a: str, path_b: str): if __name__ == '__main__': - fire.Fire(main) \ No newline at end of file + fire.Fire(main) diff --git a/src/turbomind/kernels/gemm/plot.py b/src/turbomind/kernels/gemm/plot.py index 9c08414..c7c162e 100644 --- a/src/turbomind/kernels/gemm/plot.py +++ b/src/turbomind/kernels/gemm/plot.py @@ -4,50 +4,99 @@ a100 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [30.975, 61.083, 104.805, 134.369, 152.262, 173.313, 187.04, 210.705, 212.336, 225.996, 241.908, 246.485], - 'diff': [203.433, 200.315, 157.897, 120.865, 90.145, 68.087, 40.362, 36.08, 12.763, 6.857, 1.694, 3.056] + 'flops': [ + 30.975, 61.083, 104.805, 134.369, 152.262, 173.313, 187.04, 210.705, + 212.336, 225.996, 241.908, 246.485 + ], + 'diff': [ + 203.433, 200.315, 157.897, 120.865, 90.145, 68.087, 40.362, 36.08, + 12.763, 6.857, 1.694, 3.056 + ] } h100 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [40.723, 80.707, 145.495, 188.742, 216.633, 252.697, 282.116, 307.794, 330.382, 368.38, 394.966, 395.913], - 'diff': [151.567, 149.366, 122.284, 93.14, 68.945, 41.166, 18.074, -1.506, -15.628, -29.799, -36.413, -37.346] + 'flops': [ + 40.723, 80.707, 145.495, 188.742, 216.633, 252.697, 282.116, 307.794, + 330.382, 368.38, 394.966, 395.913 + ], + 'diff': [ + 151.567, 149.366, 122.284, 93.14, 68.945, 41.166, 18.074, -1.506, + -15.628, -29.799, -36.413, -37.346 + ] } -rtx4090 ={ +rtx4090 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [18.685, 37.152, 72.702, 104.168, 125.963, 139.206, 146.199, 154.541, 157.06, 164.551, 167.729, 169.425], - 'diff': [242.372, 247.154, 235.135, 230.773, 203.158, 131.433, 81.746, 45.646, 17.447, 10.195, 2.411, 2.135] + 'flops': [ + 18.685, 37.152, 72.702, 104.168, 125.963, 139.206, 146.199, 154.541, + 157.06, 164.551, 167.729, 169.425 + ], + 'diff': [ + 242.372, 247.154, 235.135, 230.773, 203.158, 131.433, 81.746, 45.646, + 17.447, 10.195, 2.411, 2.135 + ] } v100 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [17.179, 31.255, 46.443, 44.769, 56.847, 59.087, 62.678, 64.258, 65.164, 66.699, 69.755, 71.13], - 'diff': [260.359, 231.003, 156.902, 142.953, 134.385, 97.611, 54.962, 21.085, -5.455, -14.298, -15.295, -14.578] + 'flops': [ + 17.179, 31.255, 46.443, 44.769, 56.847, 59.087, 62.678, 64.258, 65.164, + 66.699, 69.755, 71.13 + ], + 'diff': [ + 260.359, 231.003, 156.902, 142.953, 134.385, 97.611, 54.962, 21.085, + -5.455, -14.298, -15.295, -14.578 + ] } rtx2080 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096], - 'flops': [10.958, 21.416, 26.624, 26.946, 28.604, 30.342, 31.97, 32.201, 32.424, 32.982, 34.231], - 'diff': [271.225, 265.331, 121.867, 51.776, 21.793, 40.907, 15.176, 23.986, 10.364, -0.066, -2.167] + 'flops': [ + 10.958, 21.416, 26.624, 26.946, 28.604, 30.342, 31.97, 32.201, 32.424, + 32.982, 
34.231 + ], + 'diff': [ + 271.225, 265.331, 121.867, 51.776, 21.793, 40.907, 15.176, 23.986, + 10.364, -0.066, -2.167 + ] } -fig, ax1 = plt.subplots(1, 1, figsize=(16,9), dpi=80) +fig, ax1 = plt.subplots(1, 1, figsize=(16, 9), dpi=80) ax1.set_xscale('log') ax1.set_xlabel('linear') -ax1.plot(a100['bs'], np.array(a100['diff']) * 0.01 + 1, marker='o', label='A100', color='tab:red') -ax1.plot(h100['bs'], np.array(h100['diff']) * 0.01 + 1, marker='*', label='H100', color='tab:blue') -ax1.plot(v100['bs'], np.array(v100['diff']) * 0.01 + 1, marker='p', label='V100', color='tab:orange') -ax1.plot(rtx4090['bs'], np.array(rtx4090['diff']) * 0.01 + 1, marker='h', label='4090', color='tab:green') -ax1.plot(rtx2080['bs'], np.array(rtx2080['diff']) * 0.01 + 1, marker='s', label='2080', color='tab:purple') +ax1.plot(a100['bs'], + np.array(a100['diff']) * 0.01 + 1, + marker='o', + label='A100', + color='tab:red') +ax1.plot(h100['bs'], + np.array(h100['diff']) * 0.01 + 1, + marker='*', + label='H100', + color='tab:blue') +ax1.plot(v100['bs'], + np.array(v100['diff']) * 0.01 + 1, + marker='p', + label='V100', + color='tab:orange') +ax1.plot(rtx4090['bs'], + np.array(rtx4090['diff']) * 0.01 + 1, + marker='h', + label='4090', + color='tab:green') +ax1.plot(rtx2080['bs'], + np.array(rtx2080['diff']) * 0.01 + 1, + marker='s', + label='2080', + color='tab:purple') ax1.set_ylabel('relative speed (vs cublasGemmEx)') ax1.tick_params(axis='x', rotation=0, labelsize=12) ax1.set_xlabel('batch size') ax1.set_xticks(a100['bs']) -ax1.set_xticklabels(a100['bs'], fontdict={'fontsize':10}) +ax1.set_xticklabels(a100['bs'], fontdict={'fontsize': 10}) ax1.grid(alpha=.25, axis='y') - ax2 = ax1.twinx() ax2.plot(a100['bs'], a100['flops'], alpha=.4, color='tab:red') ax2.plot(h100['bs'], h100['flops'], alpha=.4, color='tab:blue') @@ -62,4 +111,4 @@ ax1.legend() fig.tight_layout() -plt.savefig('fig.jpg') \ No newline at end of file +plt.savefig('fig.jpg') diff --git a/src/turbomind/utils/CMakeLists.txt b/src/turbomind/utils/CMakeLists.txt index 05b1011..06d7749 100644 --- a/src/turbomind/utils/CMakeLists.txt +++ b/src/turbomind/utils/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(parser STATIC parser.cc) -set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE ON) \ No newline at end of file +set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/turbomind/__init__.py b/turbomind/__init__.py index e69de29..ef101fe 100644 --- a/turbomind/__init__.py +++ b/turbomind/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/turbomind/linear.py b/turbomind/linear.py index 1ec6419..0b322a1 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -1,6 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch -from . import utils + +# from . import utils + class Linear(torch.nn.Module): pass - diff --git a/turbomind/utils.py b/turbomind/utils.py index da1adc5..39d9397 100644 --- a/turbomind/utils.py +++ b/turbomind/utils.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import List + import torch @@ -46,4 +47,4 @@ def pack_u4_row(x: torch.Tensor) -> torch.Tensor: a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) for t in reversed(xs): a = (a << 4) | t - return a.squeeze(dim=-1) \ No newline at end of file + return a.squeeze(dim=-1) diff --git a/turbomind/version.py b/turbomind/version.py new file mode 100644 index 0000000..20e5e73 --- /dev/null +++ b/turbomind/version.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from typing import Tuple + +__version__ = '0.0.1' +short_version = __version__ + + +def parse_version_info(version_str: str) -> Tuple: + """Parse version from a string. + + Args: + version_str (str): A string represents a version info. + + Returns: + tuple: A sequence of integer and string represents version. + """ + _version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + _version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + _version_info.append(int(patch_version[0])) + _version_info.append(f'rc{patch_version[1]}') + return tuple(_version_info) + + +version_info = parse_version_info(__version__) From a79f44429648ea8817e7d76dac71dc3e97d8397c Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 2 Sep 2024 21:31:54 +0800 Subject: [PATCH 02/18] remove diff.py and plot.py from src/turbomind --- src/turbomind/kernels/gemm/diff.py | 72 ------------------ src/turbomind/kernels/gemm/plot.py | 114 ----------------------------- 2 files changed, 186 deletions(-) delete mode 100644 src/turbomind/kernels/gemm/diff.py delete mode 100644 src/turbomind/kernels/gemm/plot.py diff --git a/src/turbomind/kernels/gemm/diff.py b/src/turbomind/kernels/gemm/diff.py deleted file mode 100644 index e233c46..0000000 --- a/src/turbomind/kernels/gemm/diff.py +++ /dev/null @@ -1,72 +0,0 @@ -import json - -import fire -import prettytable - - -def get_tag(state, tag): - for x in state['summaries']: - if x['tag'] == tag: - return float(x['data'][0]['value']) - - -def main(path_a: str, path_b: str): - - with open(path_a, 'r') as f: - a = json.load(f) - with open(path_b, 'r') as f: - b = json.load(f) - states_a = a['benchmarks'][0]['states'] - states_b = b['benchmarks'][0]['states'] - by_idx = {} - by_idx_4 = {} - by_bs = {} - table = prettytable.PrettyTable(['idx', 'bs', 'tp', 'flops', 'diff']) - for u in states_a: - for v in states_b: - if u['name'] == v['name']: - break - assert u['name'] == v['name'] - tag = 'nv/cold/bw/item_rate' - flops_a = get_tag(u, tag) - flops_b = get_tag(v, tag) - ratio = flops_a / flops_b - flops = round(flops_a / 1e12, 3) - diff = round((ratio - 1.0) * 100, 3) - idx, bs, tp = u['name'].split(' ')[1:] - table.add_row([idx, bs, tp, flops, diff]) - x = by_idx.get(idx, [0, 0, 0]) - by_idx[idx] = x[0] + 1, x[1] + flops, x[2] + diff - x = by_bs.get(bs, [0, 0, 0]) - by_bs[bs] = x[0] + 1, x[1] + flops, x[2] + diff - idx_4 = int(idx[4:]) % 4 - x = by_idx_4.get(idx_4, [0, 0, 0]) - by_idx_4[idx_4] = x[0] + 1, x[1] + flops, x[2] + diff - - by_idx_tab = prettytable.PrettyTable(['idx', 'flops', 'diff']) - for k, (cnt, flops, diff) in by_idx.items(): - flops = round(flops / cnt, 3) - diff = round(diff / cnt, 3) - by_idx_tab.add_row([k, flops, diff]) - - by_bs_tab = prettytable.PrettyTable(['idx', 'flops', 'diff']) - for k, (cnt, flops, diff) in by_bs.items(): - flops = round(flops / cnt, 3) - diff = round(diff / cnt, 3) - by_bs_tab.add_row([k, flops, diff]) - - by_idx_4_tab = prettytable.PrettyTable(['idx_4', 'flops', 'diff']) - for k, (cnt, flops, diff) in by_idx_4.items(): - flops = round(flops / cnt, 3) - diff = round(diff / cnt, 3) - by_idx_4_tab.add_row([k, flops, diff]) - - print(table) - - print(by_idx_tab) - print(by_idx_4_tab) - print(by_bs_tab) - - -if __name__ == '__main__': - fire.Fire(main) diff --git a/src/turbomind/kernels/gemm/plot.py b/src/turbomind/kernels/gemm/plot.py deleted file mode 100644 index c7c162e..0000000 --- a/src/turbomind/kernels/gemm/plot.py +++ /dev/null @@ -1,114 +0,0 @@ -# import pandas as pd -import 
matplotlib.pyplot as plt -import numpy as np - -a100 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 30.975, 61.083, 104.805, 134.369, 152.262, 173.313, 187.04, 210.705, - 212.336, 225.996, 241.908, 246.485 - ], - 'diff': [ - 203.433, 200.315, 157.897, 120.865, 90.145, 68.087, 40.362, 36.08, - 12.763, 6.857, 1.694, 3.056 - ] -} - -h100 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 40.723, 80.707, 145.495, 188.742, 216.633, 252.697, 282.116, 307.794, - 330.382, 368.38, 394.966, 395.913 - ], - 'diff': [ - 151.567, 149.366, 122.284, 93.14, 68.945, 41.166, 18.074, -1.506, - -15.628, -29.799, -36.413, -37.346 - ] -} - -rtx4090 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 18.685, 37.152, 72.702, 104.168, 125.963, 139.206, 146.199, 154.541, - 157.06, 164.551, 167.729, 169.425 - ], - 'diff': [ - 242.372, 247.154, 235.135, 230.773, 203.158, 131.433, 81.746, 45.646, - 17.447, 10.195, 2.411, 2.135 - ] -} - -v100 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 17.179, 31.255, 46.443, 44.769, 56.847, 59.087, 62.678, 64.258, 65.164, - 66.699, 69.755, 71.13 - ], - 'diff': [ - 260.359, 231.003, 156.902, 142.953, 134.385, 97.611, 54.962, 21.085, - -5.455, -14.298, -15.295, -14.578 - ] -} - -rtx2080 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096], - 'flops': [ - 10.958, 21.416, 26.624, 26.946, 28.604, 30.342, 31.97, 32.201, 32.424, - 32.982, 34.231 - ], - 'diff': [ - 271.225, 265.331, 121.867, 51.776, 21.793, 40.907, 15.176, 23.986, - 10.364, -0.066, -2.167 - ] -} - -fig, ax1 = plt.subplots(1, 1, figsize=(16, 9), dpi=80) -ax1.set_xscale('log') -ax1.set_xlabel('linear') -ax1.plot(a100['bs'], - np.array(a100['diff']) * 0.01 + 1, - marker='o', - label='A100', - color='tab:red') -ax1.plot(h100['bs'], - np.array(h100['diff']) * 0.01 + 1, - marker='*', - label='H100', - color='tab:blue') -ax1.plot(v100['bs'], - np.array(v100['diff']) * 0.01 + 1, - marker='p', - label='V100', - color='tab:orange') -ax1.plot(rtx4090['bs'], - np.array(rtx4090['diff']) * 0.01 + 1, - marker='h', - label='4090', - color='tab:green') -ax1.plot(rtx2080['bs'], - np.array(rtx2080['diff']) * 0.01 + 1, - marker='s', - label='2080', - color='tab:purple') -ax1.set_ylabel('relative speed (vs cublasGemmEx)') -ax1.tick_params(axis='x', rotation=0, labelsize=12) -ax1.set_xlabel('batch size') -ax1.set_xticks(a100['bs']) -ax1.set_xticklabels(a100['bs'], fontdict={'fontsize': 10}) -ax1.grid(alpha=.25, axis='y') - -ax2 = ax1.twinx() -ax2.plot(a100['bs'], a100['flops'], alpha=.4, color='tab:red') -ax2.plot(h100['bs'], h100['flops'], alpha=.4, color='tab:blue') -ax2.plot(v100['bs'], v100['flops'], alpha=.4, color='tab:orange') -ax2.plot(rtx4090['bs'], rtx4090['flops'], alpha=.4, color='tab:green') -ax2.plot(rtx2080['bs'], rtx2080['flops'], alpha=.4, color='tab:purple') -ax2.set_ylabel('TFLOPS') -# ax2.set_xticklabels(x[::60], rotation=90, fontdict={'fontsize':10}) - -# ax2.set_yscale(0.01) - -ax1.legend() - -fig.tight_layout() -plt.savefig('fig.jpg') From 170fa36aae02c2cfc43d99b634dbda204c3017f9 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 3 Sep 2024 17:20:23 +0800 Subject: [PATCH 03/18] update setup.py --- .gitignore | 9 +++++++++ requirements.txt | 3 ++- setup.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2290706..7d4768c 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,12 @@ .cache /build + +# Byte-compiled / optimized / 
DLL files +__pycache__/ +.vscode/ + +# Distribution / packaging +.eggs/ +wheels/ +*.egg-info/ diff --git a/requirements.txt b/requirements.txt index e7b244f..1b0eb4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ --r runtime.txt +-r requirements/build.txt +-r requirements/runtime.txt diff --git a/setup.py b/setup.py index 4061558..1b238db 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup pwd = os.path.dirname(__file__) -version_file = 'lmdeploy/version.py' +version_file = 'turbomind/version.py' def readme(): From 7ee84e6801bb4f33397bcf07659e5f982b4cac11 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 3 Sep 2024 21:03:07 +0800 Subject: [PATCH 04/18] add autoaweq inference as a test example --- example/awq/__init__.py | 0 example/awq/_config.py | 109 +++++++++++++ example/awq/act.py | 12 ++ example/awq/auto.py | 64 ++++++++ example/awq/base.py | 244 +++++++++++++++++++++++++++++ example/awq/gemm.py | 287 +++++++++++++++++++++++++++++++++++ example/awq/llama.py | 81 ++++++++++ example/awq/module.py | 60 ++++++++ example/awq/packing_utils.py | 97 ++++++++++++ example/awq/utils.py | 110 ++++++++++++++ example/generate.py | 44 ++++++ 11 files changed, 1108 insertions(+) create mode 100644 example/awq/__init__.py create mode 100644 example/awq/_config.py create mode 100644 example/awq/act.py create mode 100644 example/awq/auto.py create mode 100644 example/awq/base.py create mode 100644 example/awq/gemm.py create mode 100644 example/awq/llama.py create mode 100644 example/awq/module.py create mode 100644 example/awq/packing_utils.py create mode 100644 example/awq/utils.py create mode 100644 example/generate.py diff --git a/example/awq/__init__.py b/example/awq/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/example/awq/_config.py b/example/awq/_config.py new file mode 100644 index 0000000..a966e5e --- /dev/null +++ b/example/awq/_config.py @@ -0,0 +1,109 @@ +import json +import os +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +from transformers.utils.hub import PushToHubMixin, cached_file + + +@dataclass +class AwqConfig(PushToHubMixin): + quant_method: str = field(default='awq') + zero_point: bool = field(default=True) + q_group_size: int = field(default=128) + w_bit: int = field(default=4) + version: str = field(default='gemm') + config_file_name = 'config.json' + modules_to_not_convert: Optional[List] = None + + @classmethod + def from_dict(cls, quant_config: Dict = {}): + if not quant_config: + quant_config = cls() + else: + quant_config = cls(**quant_config) + quant_config.version = quant_config.version.lower() + + return quant_config + + @classmethod + def from_pretrained(cls, save_dir: str, **kwargs): + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + resume_download = kwargs.pop('resume_download', False) + proxies = kwargs.pop('proxies', None) + local_files_only = kwargs.pop('local_files_only', False) + use_auth_token = kwargs.pop('use_auth_token', None) + revision = kwargs.pop('revision', None) + subfolder = kwargs.pop('subfolder', None) + commit_hash = kwargs.pop('_commit_hash', None) + + if os.path.isdir(save_dir): # Local + resolved_config_file = os.path.join(save_dir, cls.config_file_name) + else: # Remote + resolved_config_file = cached_file( + save_dir, + cls.config_file_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + 
use_auth_token=use_auth_token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, + ) + + quant_config = None + if os.path.exists(resolved_config_file): + with open(resolved_config_file, 'r', encoding='utf-8') as file: + loaded_config = json.loads(file.read()) + + quant_config = loaded_config.get('quantization_config') + + if quant_config is not None: + awq_config = cls.from_transformers_dict(cls, quant_config) + quant_config = cls(**awq_config) + + if quant_config is None: + quant_config = cls() + + return quant_config + + def to_dict(self): + return { + 'zero_point': self.zero_point, + 'q_group_size': self.q_group_size, + 'w_bit': self.w_bit, + 'version': self.version, + 'modules_to_not_convert': self.modules_to_not_convert, + } + + def to_transformers_dict(self): + return { + 'quant_method': self.quant_method, + 'zero_point': self.zero_point, + 'group_size': self.q_group_size, + 'bits': self.w_bit, + 'version': self.version.lower(), + 'modules_to_not_convert': self.modules_to_not_convert, + } + + def from_transformers_dict(self, transformers_dict: Dict): + return { + 'quant_method': + transformers_dict.get('quant_method'), + 'zero_point': + transformers_dict.get('zero_point'), + 'q_group_size': + transformers_dict.get('group_size'), + 'w_bit': + transformers_dict.get('bits'), + 'version': + transformers_dict.get('version'), + 'modules_to_not_convert': + transformers_dict.get('modules_to_not_convert'), + } diff --git a/example/awq/act.py b/example/awq/act.py new file mode 100644 index 0000000..59ca7c4 --- /dev/null +++ b/example/awq/act.py @@ -0,0 +1,12 @@ +import torch.nn as nn + + +class ScaledActivation(nn.Module): + + def __init__(self, module, scales): + super().__init__() + self.act = module + self.scales = nn.Parameter(scales.data) + + def forward(self, x): + return self.act(x) / self.scales.view(1, 1, -1).to(x.device) diff --git a/example/awq/auto.py b/example/awq/auto.py new file mode 100644 index 0000000..a18db0c --- /dev/null +++ b/example/awq/auto.py @@ -0,0 +1,64 @@ +import logging +import os + +from transformers import AutoConfig + +from .base import BaseAWQForCausalLM +from .llama import LlamaAWQForCausalLM + +AWQ_CAUSAL_LM_MODEL_MAP = { + 'llama': LlamaAWQForCausalLM, +} + + +def check_and_get_model_type(model_dir, **model_init_kwargs): + config = AutoConfig.from_pretrained(model_dir, + trust_remote_code=True, + **model_init_kwargs) + if config.model_type not in AWQ_CAUSAL_LM_MODEL_MAP.keys(): + raise TypeError(f"{config.model_type} isn't supported yet.") + model_type = config.model_type + return model_type + + +class AutoAWQForCausalLM: + + def __init__(self): + raise EnvironmentError( + 'You must instantiate AutoAWQForCausalLM with\n' + 'AutoAWQForCausalLM.from_quantized or AutoAWQForCausalLM.' + 'from_pretrained') + + @classmethod + def from_quantized( + self, + quant_path, + max_seq_len=2048, + fuse_layers=True, + batch_size=1, + device_map='balanced', + max_memory=None, + offload_folder=None, + download_kwargs=None, + **config_kwargs, + ) -> BaseAWQForCausalLM: + os.environ['AWQ_BATCH_SIZE'] = str(batch_size) + model_type = check_and_get_model_type(quant_path) + + if config_kwargs.get('max_new_tokens') is not None: + max_seq_len = config_kwargs['max_new_tokens'] + logging.warning( + 'max_new_tokens argument is deprecated... 
gracefully ' + 'setting max_seq_len=max_new_tokens.') + + return AWQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized( + quant_path, + model_type, + max_seq_len, + fuse_layers=fuse_layers, + device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + download_kwargs=download_kwargs, + **config_kwargs, + ) diff --git a/example/awq/base.py b/example/awq/base.py new file mode 100644 index 0000000..9a815c4 --- /dev/null +++ b/example/awq/base.py @@ -0,0 +1,244 @@ +import gc +from typing import Dict, Union + +import torch +import torch.nn as nn +import transformers +from accelerate.big_modeling import (init_empty_weights, + load_checkpoint_and_dispatch) +from tqdm import tqdm +from transformers import AutoConfig, PretrainedConfig, PreTrainedModel +from typing_extensions import Annotated, Doc + +from ._config import AwqConfig +from .act import ScaledActivation +from .gemm import WQLinear_GEMM +from .module import (exclude_layers_to_not_quantize, get_named_linears, + set_op_by_name) + +# from turbomind import Linear +# from turbomind.utils import turbomind_post_init + +# Since we support different `AutoModelForxxx` from transformers +# we need to define a custom mapping dict as below: +TRANSFORMERS_AUTO_MAPPING_DICT = { + 'llama': 'AutoModelForCausalLM', +} + + +class BaseAWQForCausalLM(nn.Module): + + def __init__( + self, + model, + model_type, + is_quantized, + config, + quant_config, + ): + """The base model for all AutoAWQ models. + + Args: + model: The pretrained or quantized model. + model_type: The model type, found in config.json. + is_quantized: Indicates if the current model is quantized + config: The config of the model. + quant_config: The quantization config of the model. + """ + super().__init__() + self.model: PreTrainedModel = model + self.model_type: str = model_type + self.is_quantized: bool = is_quantized + self.search_result = None + self.config: PretrainedConfig = config + self.quant_config: AwqConfig = quant_config + + def to(self, device: Annotated[str, + Doc('The device to move your model to.')]): + """A utility function for moving the model to a device.""" + return self.model.to(device) + + def forward(self, *args, **kwargs): + """A forward function that mimics the torch forward.""" + return self.model(*args, **kwargs) + + def generate(self, *args, **kwargs): + """A generate function that mimics the HF generate function.""" + with torch.inference_mode(): + return self.model.generate(*args, **kwargs) + + # @staticmethod + # def fuse_layers(model): + # pass + + @classmethod + def from_quantized(self, + model_path: str, + model_type: str, + max_seq_len: int, + torch_dtype: torch.dtype = torch.float16, + device_map: Union[str, Dict] = 'balanced', + **config_kwargs: Dict): + """A method for initialization of a quantized model, usually in INT4. + + Args: + model_path (str): The model path + model_type (str): The model type, loaded from config.json. + max_seq_len (int): The maximum sequence cached sequence length of + the model. Larger values may increase loading time and + memory usage. + torch_dtype: The dtype to load the model as. May not work with + other values than float16. + device_map: A device map that will be passed onto the model + loading method from transformers. 
+ **config_kwargs: Additional kwargs that are passed to the config + during initialization + """ + # [STEP 1-2] Load weights path and configs + model_weights_path, config, quant_config = self._load_config( + self, + model_path, + max_seq_len=max_seq_len, + **config_kwargs, + ) + + target_cls_name = TRANSFORMERS_AUTO_MAPPING_DICT[config.model_type] + target_cls = getattr(transformers, target_cls_name) + + # [STEP 3] Load model + with init_empty_weights(): + model = target_cls.from_config( + config=config, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + # Prepare WQLinear layers, replace nn.Linear + self._load_quantized_modules( + self, + model, + quant_config, + quant_config.version, + use_exllama=False, + use_exllama_v2=False, + use_qbits=False, + ) + + model.tie_weights() + + # loads the weights into modules and distributes + # across available devices automatically + load_checkpoint_and_dispatch( + model, + checkpoint=model_weights_path, + device_map=device_map, + no_split_module_classes=[self.layer_type], + dtype=torch_dtype, + ) + + # TODO + # model = turbomind_post_init(model) + + # # Dispatch to devices + # if fuse_layers: + # self.fuse_layers(model) + + model.eval() + + return self( + model, + model_type, + is_quantized=True, + config=config, + quant_config=quant_config, + ) + + def _load_config( + self, + model_path, + max_seq_len=4096, + **config_kwargs, + ): + # [STEP 2] Load config and set sequence length + # TODO: Create BaseAWQConfig class + quant_config = AwqConfig.from_pretrained(model_path) + + # Load model config and set max generation length + if max_seq_len is None and hasattr(self, 'max_seq_len_key'): + config = AutoConfig.from_pretrained(model_path, + trust_remote_code=True, + **config_kwargs) + config.max_seq_len = getattr(config, self.max_seq_len_key, 2048) + # To add the generate support for Multi-modal models as well + if hasattr(config, 'text_config'): + config.text_config.max_seq_len = getattr( + config, self.max_seq_len_key, 2048) + else: + max_seq_len = 2048 if max_seq_len is None else max_seq_len + config = AutoConfig.from_pretrained(model_path, + trust_remote_code=True, + **config_kwargs) + config.max_seq_len = max_seq_len + + return model_path, config, quant_config + + def _load_quantized_modules(self, + model, + quant_config, + version, + use_exllama, + use_exllama_v2, + use_qbits=False): + # Real quantization of weights + assert not (version == 'gemv' and + (use_exllama or use_exllama_v2 or + use_qbits)), 'Exllama kernels only support GEMM version.' 
+ + # Get blocks of model + layers = self.get_model_layers(model) + + for i in tqdm(range(len(layers)), desc='Replacing layers...'): + layer = layers[i] + + # Get every linear layer in a block + named_linears = get_named_linears(layer) + + # Filter out the linear layers we don't want to include + named_linears = exclude_layers_to_not_quantize( + named_linears, quant_config.modules_to_not_convert) + + # Replace activation functions + self._scale_activations(self, layer) + + # Replace nn.Linear with WQLinear + for name, module in named_linears.items(): + assert version == 'gemm' + + q_linear_module = WQLinear_GEMM + # q_linear_module = Linear + q_linear = q_linear_module.from_linear( + module, quant_config.w_bit, quant_config.q_group_size, + True) + q_linear.to(next(layer.parameters()).device) + set_op_by_name(layer, name, q_linear) + + if not use_qbits: + torch.cuda.empty_cache() + gc.collect() + + @staticmethod + def _scale_activations(self, layer): + scale_dict = self.get_act_for_scaling(layer) + + if scale_dict['is_scalable']: + if not isinstance(scale_dict['scale_layer'], ScaledActivation): + param = next(layer.parameters()) + + # get activation scale + scale_like = torch.ones(scale_dict['scale_shape'], + dtype=param.dtype, + device=param.device) + + # scale activation + scaled_act = ScaledActivation(scale_dict['scale_layer'], + scale_like) + set_op_by_name(layer, scale_dict['scale_name'], scaled_act) diff --git a/example/awq/gemm.py b/example/awq/gemm.py new file mode 100644 index 0000000..664b4f5 --- /dev/null +++ b/example/awq/gemm.py @@ -0,0 +1,287 @@ +import warnings + +import torch +import torch.nn as nn +from torch.autograd import Function + +from .packing_utils import dequantize_gemm +from .utils import get_best_device + +try: + import awq_ext # with CUDA kernels (AutoAWQ_kernels) + + AWQ_INSTALLED = True +except Exception as ex: + AWQ_INSTALLED = False + warnings.warn( + f'AutoAWQ could not load GEMM kernels extension. Details: {ex}') + + +# Adapted from https://github.com/compressa-ai/AutoAWQ/tree/dev +class WQLinearMMFunction(Function): + + @staticmethod + # ctx is the first argument to forward + def forward( + ctx, + x, + qweight, + qzeros, + scales, + w_bit=4, + group_size=128, + bias=None, + out_features=0, + ): + # The forward pass can use ctx. + ctx.save_for_backward(x, qweight, qzeros, scales, bias) + ctx.out_features = out_features + + out_shape = x.shape[:-1] + (out_features, ) + x = x.to(torch.float16) + + if AWQ_INSTALLED: + FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 1024 + + if FP16_MATMUL_HEURISTIC_CONDITION: + out = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, + 0, 0, 0, False) + out = torch.matmul(x, out) + else: + out = awq_ext.gemm_forward_cuda(x.reshape(-1, x.shape[-1]), + qweight, scales, qzeros, 8) + else: + out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) + out = torch.matmul(x, out) + + out = out + bias if bias is not None else out + out = out.reshape(out_shape) + + # always want 3D tensor if tensor is 2D + if len(out.shape) == 2: + out = out.unsqueeze(0) + + return out + + @staticmethod + def backward(ctx, grad_output): + input, qweight, qzeros, scales, bias = ctx.saved_tensors + + if not AWQ_INSTALLED: + raise ValueError( + 'auto-awq kernels is needed to be installed to ' + 'use `.backward()`. 
Make sure to install the auto-awq kernels' + ' by following the installation guides in ' + 'https://github.com/casper-hansen/AutoAWQ_kernels') + + # Cast to correct dtype for mixed precision training + weights = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, 1, + 0, 0, + False).to(grad_output.dtype) + + if ctx.needs_input_grad[0]: + # 3D matmul using torch.bmm: + # https://pytorch.org/docs/stable/generated/torch.bmm.html#torch.bmm # noqa + # to propagate gradient across all batch sizes. + batch_size = grad_output.shape[0] + grad_input = grad_output.bmm( + weights.transpose(0, 1).unsqueeze(0).repeat(batch_size, 1, 1)) + + return grad_input, None, None, None, None, None, None, None + + +class WQLinear_GEMM(nn.Module): + + def __init__(self, + w_bit, + group_size, + in_features, + out_features, + bias, + dev, + training=False): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError('Only 4-bit are supported for now.') + + self.in_features = in_features + self.out_features = out_features + self.w_bit = w_bit + self.group_size = group_size if group_size != -1 else in_features + self.training = training + + # quick sanity check (make sure alignment) + assert self.in_features % self.group_size == 0 + assert out_features % (32 // self.w_bit) == 0 + + self.register_buffer( + 'qweight', + torch.zeros( + (in_features, out_features // (32 // self.w_bit)), + dtype=torch.int32, + device=dev, + ), + ) + self.register_buffer( + 'qzeros', + torch.zeros( + (in_features // self.group_size, out_features // + (32 // self.w_bit)), + dtype=torch.int32, + device=dev, + ), + ) + self.register_buffer( + 'scales', + torch.zeros( + (in_features // self.group_size, out_features), + dtype=torch.float16, + device=dev, + ), + ) + if bias: + self.register_buffer( + 'bias', + torch.zeros( + (out_features), + dtype=torch.float16, + device=dev, + ), + ) + else: + self.bias = None + + @classmethod + def from_linear(cls, + linear, + w_bit, + group_size, + init_only=False, + scales=None, + zeros=None): + awq_linear = cls( + w_bit, + group_size, + linear.in_features, + linear.out_features, + linear.bias is not None, + linear.weight.device, + ) + if init_only: # just prepare for loading sd + return awq_linear + + # need scales and zeros info for real quantization + assert scales is not None and zeros is not None + scale_zeros = zeros * scales + + awq_linear.scales = scales.clone().half() + if linear.bias is not None: + awq_linear.bias = linear.bias.clone().half() + + pack_num = 32 // awq_linear.w_bit + + intweight = [] + for idx in range(awq_linear.in_features): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + + scale_zeros[idx // group_size]) / + awq_linear.scales[idx // group_size]).to(torch.int)[:, + None]) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.to(dtype=torch.int32) + + best_device = get_best_device() + + # Avoid: The operator 'aten::__lshift__.Scalar' is not currently + # implemented for the MPS device + if 'mps' in best_device: + intweight = intweight.to('cpu') + + qweight = torch.zeros( + (intweight.shape[0], intweight.shape[1] // 32 * awq_linear.w_bit), + dtype=torch.int32, + device=intweight.device, + ) + + for col in range(intweight.shape[1] // pack_num): + if awq_linear.w_bit == 4: + order_map = [0, 2, 4, 6, 1, 3, 5, 7] + else: + raise NotImplementedError('Only 4-bit are supported for now.') + for i in range(pack_num): + qweight_col = intweight[:, col * pack_num + order_map[i]] + qweight[:, col] |= 
qweight_col << (i * awq_linear.w_bit) + awq_linear.qweight = qweight + + zeros = zeros.to(dtype=torch.int32, device=best_device) + + if 'mps' in best_device: + zeros = zeros.to('cpu') + + qzeros = torch.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * awq_linear.w_bit), + dtype=torch.int32, + device=zeros.device, + ) + + for col in range(zeros.shape[1] // pack_num): + if awq_linear.w_bit == 4: + order_map = [0, 2, 4, 6, 1, 3, 5, 7] + else: + raise NotImplementedError('Only 4-bit are supported for now.') + for i in range(pack_num): + qzero_col = zeros[:, col * pack_num + order_map[i]] + qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit) + awq_linear.qzeros = qzeros + + return awq_linear + + def forward(self, x): + out_shape = x.shape[:-1] + (self.out_features, ) + + input_dtype = x.dtype + if input_dtype != torch.float16: + x = x.half() + + if self.training: + out = WQLinearMMFunction.apply( + x, + self.qweight, + self.qzeros, + self.scales, + self.w_bit, + self.group_size, + self.bias, + self.out_features, + ) + else: + with torch.no_grad(): + out = WQLinearMMFunction.apply( + x, + self.qweight, + self.qzeros, + self.scales, + self.w_bit, + self.group_size, + self.bias, + self.out_features, + ) + + if input_dtype != torch.float16: + out = out.to(dtype=input_dtype) + + return out.reshape(out_shape) + + def extra_repr(self) -> str: + return ( + 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}' + .format( + self.in_features, + self.out_features, + self.bias is not None, + self.w_bit, + self.group_size, + )) diff --git a/example/awq/llama.py b/example/awq/llama.py new file mode 100644 index 0000000..f221e48 --- /dev/null +++ b/example/awq/llama.py @@ -0,0 +1,81 @@ +# from awq.utils.fused_utils import fuse_qkv +# from awq.modules.fused.block import LlamaLikeBlock +# from awq.modules.fused.model import LlamaLikeModel +from transformers.models.llama.modeling_llama import \ + LlamaDecoderLayer as OldLlamaDecoderLayer +from transformers.models.llama.modeling_llama import \ + LlamaForCausalLM as OldLlamaForCausalLM + +from .base import BaseAWQForCausalLM + +# from awq.modules.fused.norm import FasterTransformerRMSNorm + + +class LlamaAWQForCausalLM(BaseAWQForCausalLM): + layer_type = 'LlamaDecoderLayer' + max_seq_len_key = 'max_position_embeddings' + + # @staticmethod + # def fuse_layers(model: OldLlamaForCausalLM): + # fuser = LlamaFuser(model) + # fuser.fuse_transformer() + + @staticmethod + def get_model_layers(model: OldLlamaForCausalLM): + return model.model.layers + + @staticmethod + def get_act_for_scaling(module: OldLlamaDecoderLayer): + return dict(is_scalable=False) + + @staticmethod + def move_embed(model: OldLlamaForCausalLM, device: str): + model.model.embed_tokens = model.model.embed_tokens.to(device) + + @staticmethod + def get_layers_for_scaling(module: OldLlamaDecoderLayer, input_feat, + module_kwargs): + layers = [] + + # attention input + layers.append( + dict( + prev_op=module.input_layernorm, + layers=[ + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ], + inp=input_feat['self_attn.q_proj'], + module2inspect=module.self_attn, + kwargs=module_kwargs, + )) + + # attention out + # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 # noqa + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: # noqa + layers.append( + dict( + prev_op=module.self_attn.v_proj, + layers=[module.self_attn.o_proj], + inp=input_feat['self_attn.o_proj'], + )) + + # linear 1 + layers.append( + 
dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.gate_proj, module.mlp.up_proj], + inp=input_feat['mlp.gate_proj'], + module2inspect=module.mlp, + )) + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.up_proj, + layers=[module.mlp.down_proj], + inp=input_feat['mlp.down_proj'], + )) + + return layers diff --git a/example/awq/module.py b/example/awq/module.py new file mode 100644 index 0000000..aeefb66 --- /dev/null +++ b/example/awq/module.py @@ -0,0 +1,60 @@ +import torch.nn as nn + + +def get_named_linears(module): + return { + name: m + for name, m in module.named_modules() if isinstance(m, nn.Linear) + } + + +def get_op_by_name(module, op_name): + # get the op by its name relative to the module + for name, m in module.named_modules(): + if name == op_name: + return m + raise ValueError(f'Cannot find op {op_name} in module {module}') + + +def set_op_by_name(layer, name, new_module): + levels = name.split('.') + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_op_name(module, op): + # get the name of the op relative to the module + for name, m in module.named_modules(): + if m is op: + return name + raise ValueError(f'Cannot find op {op} in module {module}') + + +def append_str_prefix(x, prefix): + if isinstance(x, str): + return prefix + x + elif isinstance(x, tuple): + return tuple([append_str_prefix(y, prefix) for y in x]) + elif isinstance(x, list): + return [append_str_prefix(y, prefix) for y in x] + else: + return x + + +def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): + if modules_to_not_convert is None: + return linear_layers + + filtered_layers = {} + for name, linear_layer in linear_layers.items(): + if not any(key in name for key in modules_to_not_convert): + filtered_layers[name] = linear_layer + return filtered_layers diff --git a/example/awq/packing_utils.py b/example/awq/packing_utils.py new file mode 100644 index 0000000..a9724fa --- /dev/null +++ b/example/awq/packing_utils.py @@ -0,0 +1,97 @@ +import torch + +AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] +AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] + + +def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): + shifts = torch.arange(0, 32, bits, device=qzeros.device) + + # unpacking columnwise + iweights = torch.bitwise_right_shift( + qweight[:, :, None], + shifts[None, None, :]).to(torch.int8 # smallest dtype available + ) + iweights = iweights.view(iweights.shape[0], -1) + + # unpacking columnwise + if qzeros is not None: + izeros = torch.bitwise_right_shift( + qzeros[:, :, None], + shifts[None, None, :]).to(torch.int8 # smallest dtype available + ) + izeros = izeros.view(izeros.shape[0], -1) + else: + izeros = qzeros + + return iweights, izeros + + +def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): + reverse_order_tensor = torch.arange( + iweights.shape[-1], + dtype=torch.int32, + device=izeros.device, + ) + reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) + reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] + reverse_order_tensor = reverse_order_tensor.view(-1) + + if izeros is not None: + izeros = izeros[:, reverse_order_tensor] + iweights = iweights[:, reverse_order_tensor] + + return iweights, izeros + + +def pack_exllama(iweights: torch.Tensor, izeros: 
torch.Tensor, bits: int): + shifts = torch.arange(0, 32, bits, device=iweights.device) + + # packing rowwise + iweights = iweights.view(iweights.shape[0] // (32 // bits), 32 // bits, -1) + qweight = (torch.bitwise_left_shift( + iweights, shifts[None, :, None]).sum(dim=1).to(torch.int32)) + + # packing columnwise + izeros = izeros.view(-1, izeros.shape[1] // (32 // bits), 32 // bits) + qzeros = (torch.bitwise_left_shift( + izeros, shifts[None, None, :]).sum(dim=-1).to(torch.int32)) + + return qweight, qzeros + + +def unpack_reorder_pack(qweight, qzeros, bits): + # Unpack the qweight and qzeros tensors + iweight, izeros = unpack_awq(qweight, qzeros, bits) + # Reverse the order of the iweight and izeros tensors + iweight, izeros = reverse_awq_order(iweight, izeros, bits) + + # overflow checks + iweight = torch.bitwise_and(iweight, (2**bits) - 1) + izeros = torch.bitwise_and(izeros, (2**bits) - 1) + + # Subtract 1 from the izeros tensor (exllama adds 1 during inference) + # We can remove it if we remove the +1 in the exllama code + izeros = izeros - 1 + # Pack the qweight and qzeros tensors + qweight, qzeros = pack_exllama(iweight, izeros, bits) + + return qweight, qzeros + + +def dequantize_gemm(qweight, qzeros, scales, bits, group_size): + # Unpack the qweight and qzeros tensors + iweight, izeros = unpack_awq(qweight, qzeros, bits) + # Reverse the order of the iweight and izeros tensors + iweight, izeros = reverse_awq_order(iweight, izeros, bits) + + # overflow checks + iweight = torch.bitwise_and(iweight, (2**bits) - 1) + izeros = torch.bitwise_and(izeros, (2**bits) - 1) + + # fp16 weights + scales = scales.repeat_interleave(group_size, dim=0) + izeros = izeros.repeat_interleave(group_size, dim=0) + iweight = (iweight - izeros) * scales + + return iweight diff --git a/example/awq/utils.py b/example/awq/utils.py new file mode 100644 index 0000000..3ba594c --- /dev/null +++ b/example/awq/utils.py @@ -0,0 +1,110 @@ +import gc +import importlib + +import accelerate +import torch + +qbits_available = importlib.util.find_spec( + 'intel_extension_for_transformers') is not None + + +def get_module_by_name_suffix(model, module_name: str): + for name, module in model.named_modules(): + if name.endswith(module_name): + return module + + +def simple_dispatch_model(model, device_map): + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + if '' in device_map: + d = device_map[''] + model = model.to(torch.device(d)) + model.hf_device_map = device_map + return model + + tied_params = accelerate.utils.modeling.find_tied_parameters(model) + if set(device_map.values()) == {'cpu'} or set(device_map.values()) == { + 'cpu', + 'disk', + }: + main_device = 'cpu' + else: + main_device = [ + d for d in device_map.values() if d not in ['cpu', 'disk'] + ][0] + + cpu_offload_group = [(n, d) for n, d in device_map.items() if d == 'cpu'] + prev_hook = None + for idx, (n, d) in enumerate(cpu_offload_group): + m = get_module_by_name_suffix(model, n) + _, prev_hook = accelerate.cpu_offload_with_hook( + m, execution_device=main_device, prev_module_hook=prev_hook) + # set first cpu offload module's prev_module_hook + # to the last cpu offload module's hook + if len(cpu_offload_group) > 1: + get_module_by_name_suffix( + model, + cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook + + for n, d in device_map.items(): + m = get_module_by_name_suffix(model, n) + if d != 'cpu': + d = torch.device(d) + hook = AlignDevicesHook(d, + io_same_device=True, + place_submodules=True) + add_hook_to_module(m, 
hook) + accelerate.utils.modeling.retie_parameters(model, tied_params) + model.hf_device_map = device_map + + return model + + +def set_module_name(model, name, value): + if '.' in name: + parent_name = name.rsplit('.', 1)[0] + child_name = name[len(parent_name) + 1:] + parent = model.get_submodule(parent_name) + else: + parent_name = '' + parent = model + child_name = name + + setattr(parent, child_name, value) + + +def clear_memory(weight=None): + if weight is not None: + del weight + gc.collect() + torch.cuda.empty_cache() + + +def compute_memory_used_pct(device): + memory_used = torch.cuda.max_memory_allocated(device) / (1024**3) + memory_pct = (memory_used / + (torch.cuda.get_device_properties(device).total_memory / + (1024**3)) * 100) + return memory_pct + + +def get_best_device(): + if torch.backends.mps.is_available(): + return 'mps' + elif torch.cuda.is_available(): + return 'cuda:0' + else: + return 'cpu' + + +def get_lowest_memory_device_index(): + device = None + curr_device_memory_pct = 0 + for device_index in range(torch.cuda.device_count()): + device_memory_pct = compute_memory_used_pct(device_index) + if device is None or device_memory_pct < curr_device_memory_pct: + device = device_index + curr_device_memory_pct = device_memory_pct + + return device diff --git a/example/generate.py b/example/generate.py new file mode 100644 index 0000000..cca53ab --- /dev/null +++ b/example/generate.py @@ -0,0 +1,44 @@ +import torch +from awq.auto import AutoAWQForCausalLM +from transformers import AutoTokenizer, TextStreamer + +quant_path = '/mnt/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ' + +# Load model +model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True) + +tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) +streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + +prompt = "You're standing on the surface of the Earth. "\ + 'You walk one mile south, one mile west and one mile north. '\ + 'You end up exactly where you started. Where are you?' + +chat = [ + { + 'role': 'system', + 'content': 'You are a concise assistant that helps answer questions.' 
+ }, + { + 'role': 'user', + 'content': prompt + }, +] + +terminators = [ + tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids('<|eot_id|>') +] + +tokens = tokenizer.apply_chat_template(chat, return_tensors='pt') +tokens = tokens.to(torch.device('cuda')) + +# Generate output +generation_output = model.generate(tokens, + streamer=streamer, + max_new_tokens=64, + eos_token_id=terminators) + +res = tokenizer.decode(generation_output[0].cpu().numpy().tolist()) +print(f'token_ids: {generation_output}') +print(f'output: {res}') From 8cc45809aa1e08f4345403e8361a1a5c8783715f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:24:09 +0800 Subject: [PATCH 05/18] bind turbomind linear --- CMakeLists.txt | 5 +- MANIFEST.in | 5 + example/test_linear.py | 163 ++++++++ generate.sh | 3 +- src/turbomind/CMakeLists.txt | 2 - src/turbomind/api/python/CMakeLists.txt | 4 +- src/turbomind/api/python/bind.cpp | 367 +++++++++++++++++ src/turbomind/api/python/linear.cc | 284 ++++++++++++++ src/turbomind/api/python/linear.h | 50 +++ src/turbomind/kernels/gemm/CMakeLists.txt | 4 +- src/turbomind/utils/cuda_utils.h | 456 ++++++++++++++++++++++ src/turbomind/utils/tensor.h | 86 ++++ turbomind/__init__.py | 4 + turbomind/linear.py | 225 ++++++++++- 14 files changed, 1645 insertions(+), 13 deletions(-) create mode 100644 MANIFEST.in create mode 100644 example/test_linear.py create mode 100644 src/turbomind/api/python/linear.cc create mode 100644 src/turbomind/api/python/linear.h create mode 100644 src/turbomind/utils/cuda_utils.h create mode 100644 src/turbomind/utils/tensor.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f8f984..29299f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,9 +150,8 @@ link_directories( add_subdirectory(src/turbomind) # install python api -if (BUILD_PY_FFI) - install(TARGETS _tm DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) -endif () +install(TARGETS turbomind_kernels DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) + if (MSVC) return() diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..0462a5a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ + +include turbomind/lib/*.so +include turbomind/lib/*.so* +include turbomind/lib/*.dll +include turbomind/lib/*.pyd diff --git a/example/test_linear.py b/example/test_linear.py new file mode 100644 index 0000000..2bd6e46 --- /dev/null +++ b/example/test_linear.py @@ -0,0 +1,163 @@ +import torch +from safetensors import safe_open + +import turbomind as tm + +torch.manual_seed(0) + + +def i32x8_to_i4x8(w): + """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" + assert w.shape[-1] % 8 == 0 + shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) + shape = shape[:-1] + (1, ) + result = torch.zeros(shape, dtype=w.dtype, device=w.device) + mask = torch.tensor([15], dtype=w.dtype, device=w.device) + for i in range(8): + shift = 4 * (7 - i) + result[..., 0] |= (w[..., i] & mask) << shift + result = result.view(w.shape[0], -1) + return result + + +def i4x8_to_i32x8(w): + """split one integer every 4bits into 8 integers (range from 0 to 15)""" + shape = w.shape + (8, ) + result = torch.zeros(shape, dtype=w.dtype, device=w.device) + mask = torch.tensor([15], dtype=w.dtype, device=w.device) + for i in range(8): + shift = 4 * (7 - i) + result[..., i] = (w >> shift) & mask + result = result.view(w.shape[0], -1) + return result + + +# ## test i4x8_to_i32x8 +# value = 1636164468 +# print(hex(value)) +# a = torch.tensor([[value, value], [value, value]], dtype=torch.int32) +# b = i4x8_to_i32x8(a) 
+# print(b)
+# c = i32x8_to_i4x8(b)
+# print(c)
+# cmp = a == c
+# assert torch.sum(cmp) == cmp.numel()
+# exit(0)
+# ## end test
+
+
+def makeup_qweight(in_features: int, out_features: int):
+    assert out_features % 8 == 0
+    qweight = torch.randint(0,
+                            16, (in_features, out_features // 8, 8),
+                            dtype=torch.int32,
+                            device='cuda')
+    print(f'-- makeup qweight: shape {qweight.shape}')
+    print(qweight.view(in_features, -1))
+    qweight = i32x8_to_i4x8(qweight)
+    print(f'-- merge qweight: shape {qweight.shape}')
+    print(qweight)
+    return qweight
+
+
+def makeup_qzeros(in_features: int, out_features: int, group_size: int):
+    assert out_features % 8 == 0
+    assert in_features % group_size == 0 and in_features // group_size >= 1
+
+    qzeros = torch.randint(0,
+                           16,
+                           (in_features // group_size, out_features // 8, 8),
+                           dtype=torch.int32,
+                           device='cuda')
+    print(f'-- makeup qzero: shape {qzeros.shape}')
+    print(qzeros.view(in_features // group_size, -1))
+    qzeros = i32x8_to_i4x8(qzeros)
+    print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}')
+    return qzeros
+
+
+def makeup_scales(in_features: int, out_features: int, group_size: int):
+    assert in_features % group_size == 0 and in_features // group_size >= 1
+    scales = torch.rand((in_features // group_size, out_features),
+                        dtype=torch.float16,
+                        device='cuda')
+    print(f'-- makeup scales: shape {scales.shape}\n{scales}')
+    return scales
+
+
+def dequantize(qweight, qzeros, scales, group_size: int = 128):
+    _qweight = i4x8_to_i32x8(qweight)
+    _qzeros = i4x8_to_i32x8(qzeros)
+    _qzeros = _qzeros.half()
+    weight = _qweight.clone().half()
+    for i in range(qzeros.shape[0]):
+        start = i * group_size
+        end = start + group_size
+        weight[start:end] = (weight[start:end, :] -
+                             _qzeros[i:i + 1, :]) * scales[i:i + 1, :]
+    return weight
+
+
+# in_features = 128
+# out_features = 8
+# group_size = 128
+# qweight = makeup_qweight(in_features, out_features)
+# qzeros = makeup_qzeros(in_features=in_features,
+#                        out_features=out_features,
+#                        group_size=group_size)
+# scales = makeup_scales(in_features,
+#                        out_features=out_features,
+#                        group_size=group_size)
+
+# weight = dequantize(qweight, qzeros, scales, group_size)
+# print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}')
+# ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda')
+# with torch.no_grad():
+#     ref_linear.weight = nn.Parameter(weight.T)
+
+# x = torch.randn(in_features, device=weight.device, dtype=weight.dtype)
+# print(f'input: {x}')
+# print(weight.device, x.device)
+# ref_res = ref_linear(x)
+# print(ref_res)
+
+
+def load_specified_linear_weights():
+    ckpt_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ/model-00001-of-00002.safetensors'  # noqa
+    layer_id = 0
+    # prefix = f'model.layers.{layer_id}.self_attn.q_proj.'
+    prefix = f'model.layers.{layer_id}.mlp.gate_proj.'
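+    # an AWQ checkpoint stores each quantized linear layer as three tensors:
+    # packed 4-bit weights ('qweight'), packed zero points ('qzeros') and
+    # per-group fp16 scales ('scales')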
+ keys = ['qweight', 'qzeros', 'scales'] + tensors = {} + with safe_open(ckpt_path, framework='pt', device='cuda') as f: + for key in keys: + tensors[key] = f.get_tensor(prefix + key) + + return tensors + + +tensors = load_specified_linear_weights() +qweight, qzeros, scales = tensors['qweight'], tensors['qzeros'], tensors[ + 'scales'] + +group_size = 128 +in_features = qweight.shape[0] +out_features = qweight.shape[1] * 8 + +model = tm.Linear(in_features=in_features, + out_features=out_features, + bias=False, + quant_method='awq', + w_bit=4, + group_size=group_size) + +model.qweight = qweight +model.qzeros = qzeros +model.scales = scales + +model.post_init() + +x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) +res = model(x) +# max_diff = max(abs(ref_res - res)) +# ave_diff = sum(abs(ref_res - res)) / ref_res.numel() diff --git a/generate.sh b/generate.sh index 7ed3168..09c51a7 100755 --- a/generate.sh +++ b/generate.sh @@ -1,9 +1,8 @@ #!/bin/sh -cmake -G Ninja .. \ +cmake .. \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-11.8/bin/nvcc \ -DCMAKE_CUDA_FLAGS="-lineinfo" \ -DUSE_NVTX=ON \ -DFETCHCONTENT_UPDATES_DISCONNECTED=ON \ diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index ed840d6..009d0f8 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -1,5 +1,3 @@ - - add_subdirectory(utils) add_subdirectory(kernels/gemm) add_subdirectory(api/python) diff --git a/src/turbomind/api/python/CMakeLists.txt b/src/turbomind/api/python/CMakeLists.txt index 170fb36..bf9acb7 100644 --- a/src/turbomind/api/python/CMakeLists.txt +++ b/src/turbomind/api/python/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
cmake_minimum_required(VERSION 3.8) -project(_turbomind_ext) +project(turbomind_kernels) find_package(pybind11 CONFIG) if(NOT pybind11_FOUND) @@ -12,7 +12,7 @@ if(NOT pybind11_FOUND) find_package(pybind11 CONFIG) endif() -pybind11_add_module(${PROJECT_NAME} bind.cpp) +pybind11_add_module(${PROJECT_NAME} bind.cpp linear.cc) target_link_libraries(${PROJECT_NAME} PRIVATE gemm2) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index e69de29..bd81fd9 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -0,0 +1,367 @@ + +#include "src/turbomind/api/python/dlpack.h" +#include "src/turbomind/api/python/linear.h" +#include "src/turbomind/utils/tensor.h" +#include "src/turbomind/utils/cuda_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +// namespace tm = turbomind; +using namespace pybind11::literals; + +static const char kDlTensorCapsuleName[] = "dltensor"; + +DLDevice getDLDevice(turbomind::Tensor& tensor) +{ + int device_id = 0; + if (tensor.where == turbomind::MEMORY_GPU) { + cudaPointerAttributes ptr_attr; + cudaPointerGetAttributes(&ptr_attr, tensor.data); + device_id = ptr_attr.device; + } + + DLDevice device{kDLCPU, device_id}; + + switch (tensor.where) { + case turbomind::MEMORY_CPU: + device.device_type = DLDeviceType::kDLCPU; + break; + case turbomind::MEMORY_CPU_PINNED: + device.device_type = DLDeviceType::kDLCUDAHost; + break; + case turbomind::MEMORY_GPU: + device.device_type = DLDeviceType::kDLCUDA; + break; + default: + break; + } + + return device; +} + +DLManagedTensor* TurbomindTensorToDLManagedTensor(turbomind::Tensor& tensor) +{ + DLDevice device = getDLDevice(tensor); + + DLDataType data_type{0, 0, 1}; + switch (tensor.type) { + case turbomind::TYPE_BOOL: + data_type.code = DLDataTypeCode::kDLBool; + data_type.bits = 8; + break; + case turbomind::TYPE_UINT8: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 8; + break; + case turbomind::TYPE_UINT16: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 16; + break; + case turbomind::TYPE_UINT32: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 32; + break; + case turbomind::TYPE_UINT64: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 64; + break; + case turbomind::TYPE_INT8: + case turbomind::TYPE_BYTES: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 8; + break; + case turbomind::TYPE_INT16: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 16; + break; + case turbomind::TYPE_INT32: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 32; + break; + case turbomind::TYPE_INT64: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 64; + break; + case turbomind::TYPE_FP16: + data_type.code = DLDataTypeCode::kDLFloat; + data_type.bits = 16; + break; + case turbomind::TYPE_FP32: + data_type.code = DLDataTypeCode::kDLFloat; + data_type.bits = 32; + break; + case turbomind::TYPE_FP64: + data_type.code = DLDataTypeCode::kDLFloat; + data_type.bits = 64; + break; + case turbomind::TYPE_BF16: + data_type.code = DLDataTypeCode::kDLBfloat; + data_type.bits = 16; + break; + default: + break; + } + DLTensor dl_tensor{const_cast(tensor.data), + device, + (int32_t)(tensor.shape.size()), + data_type, + reinterpret_cast(const_cast(tensor.shape.data())), + (int64_t*)(nullptr), + 0}; + return new DLManagedTensor{dl_tensor, nullptr, 
[](DLManagedTensor* dlmt) { delete dlmt; }}; +} + +turbomind::MemoryType getMemoryType(DLDevice device) +{ + switch (device.device_type) { + case DLDeviceType::kDLCUDAHost: + return turbomind::MemoryType::MEMORY_CPU_PINNED; + case DLDeviceType::kDLCUDA: + return turbomind::MemoryType::MEMORY_GPU; + case DLDeviceType::kDLCPU: + default: + return turbomind::MemoryType::MEMORY_CPU; + } +} + +turbomind::DataType getDataType(DLDataType data_type) +{ + switch (data_type.code) { + case DLDataTypeCode::kDLUInt: + switch (data_type.bits) { + case 8: + return turbomind::TYPE_UINT8; + case 16: + return turbomind::TYPE_UINT16; + case 32: + return turbomind::TYPE_UINT32; + case 64: + return turbomind::TYPE_UINT64; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLInt: + switch (data_type.bits) { + case 8: + return turbomind::TYPE_INT8; + case 16: + return turbomind::TYPE_INT16; + case 32: + return turbomind::TYPE_INT32; + case 64: + return turbomind::TYPE_INT64; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLFloat: + switch (data_type.bits) { + case 16: + return turbomind::TYPE_FP16; + case 32: + return turbomind::TYPE_FP32; + case 64: + return turbomind::TYPE_FP64; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLBfloat: + switch (data_type.bits) { + case 16: + return turbomind::TYPE_BF16; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLBool: + return turbomind::TYPE_BOOL; + default: + return turbomind::TYPE_INVALID; + } +} + +std::shared_ptr DLManagedTensorToTurbomindTensor(DLManagedTensor* tensor) +{ + auto& dl_tensor = tensor->dl_tensor; + auto where = getMemoryType(dl_tensor.device); + auto dtype = getDataType(dl_tensor.dtype); + assert(dl_tensor.ndim > 0); + std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); + auto data = dl_tensor.data; + + return std::make_shared(where, dtype, shape, data); +} + +std::shared_ptr TorchTensorToTurbomindTensor(py::object obj) +{ + py::capsule cap = obj.attr("__dlpack__")(); + DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + return DLManagedTensorToTurbomindTensor(dlmt); +} + +PYBIND11_MODULE(turbomind_kernels, m) { + py::enum_(m, "WeightType") + .value("kFP32", turbomind::WeightType::kFP32) + .value("kFP16", turbomind::WeightType::kFP16) + .value("kFP8", turbomind::WeightType::kFP8) + .value("kBF16", turbomind::WeightType::kBF16) + .value("kINT8", turbomind::WeightType::kINT8) + .value("kINT4", turbomind::WeightType::kINT4); + + // data type + py::enum_(m, "DataType") + .value("TYPE_INVALID", turbomind::DataType::TYPE_INVALID) + .value("TYPE_BOOL", turbomind::DataType::TYPE_BOOL) + .value("TYPE_UINT8", turbomind::DataType::TYPE_UINT8) + .value("TYPE_UINT16", turbomind::DataType::TYPE_UINT16) + .value("TYPE_UINT32", turbomind::DataType::TYPE_UINT32) + .value("TYPE_UINT64", turbomind::DataType::TYPE_UINT64) + .value("TYPE_INT8", turbomind::DataType::TYPE_INT8) + .value("TYPE_INT16", turbomind::DataType::TYPE_INT16) + .value("TYPE_INT32", turbomind::DataType::TYPE_INT32) + .value("TYPE_INT64", turbomind::DataType::TYPE_INT64) + .value("TYPE_FP16", turbomind::DataType::TYPE_FP16) + .value("TYPE_FP32", turbomind::DataType::TYPE_FP32) + .value("TYPE_FP64", turbomind::DataType::TYPE_FP64) + .value("TYPE_BYTES", turbomind::DataType::TYPE_BYTES) + .value("TYPE_BF16", turbomind::DataType::TYPE_BF16); + + // memory type + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", 
turbomind::MemoryType::MEMORY_CPU) + .value("MEMORY_CPU_PINNED", turbomind::MemoryType::MEMORY_CPU_PINNED) + .value("MEMORY_GPU", turbomind::MemoryType::MEMORY_GPU); + + // tensor + py::class_>(m, "Tensor") + .def_readonly("where", &turbomind::Tensor::where) + .def_readonly("type", &turbomind::Tensor::type) + .def_readonly("shape", &turbomind::Tensor::shape) + .def_readonly("data", &turbomind::Tensor::data) + .def(py::init([](const turbomind::MemoryType where, + const turbomind::DataType type, + const std::vector& shape, + const long data) { + auto data_ptr = reinterpret_cast(data); + return new turbomind::Tensor(where, type, shape, data_ptr); + })) + .def( + "view", + [](turbomind::Tensor* self, turbomind::DataType new_type) { + return new turbomind::Tensor(self->where, new_type, self->shape, self->data); + }, + "new_type"_a) + .def( + "view", + [](turbomind::Tensor* self, std::vector new_shape) { + return new turbomind::Tensor(self->where, self->type, new_shape, self->data); + }, + "new_shape"_a) + .def( + "copy_from", + [](turbomind::Tensor* self, py::object obj) { + py::capsule cap = obj.attr("__dlpack__")(); + DLManagedTensor* dlmt = + static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + auto src = DLManagedTensorToTurbomindTensor(dlmt); + switch (self->type) { + case turbomind::TYPE_FP16: + case turbomind::TYPE_FP32: + case turbomind::TYPE_INT32: + case turbomind::TYPE_BF16: { + auto num_element = + std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); + auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; + turbomind::TM_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); + cudaMemcpy( + const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + break; + } + default: + turbomind::TM_CHECK(0); + } + }, + "tensor"_a) + .def( + "__dlpack__", + [](turbomind::Tensor* self, long stream) { + DLManagedTensor* dlmt = TurbomindTensorToDLManagedTensor(*self); + return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { + DLManagedTensor* dlmt = + static_cast(PyCapsule_GetPointer(obj, kDlTensorCapsuleName)); + if (dlmt) { + dlmt->deleter(dlmt); + } + else { + // The tensor has been deleted. Clear any error from + // PyCapsule_GetPointer. 
+ PyErr_Clear(); + } + }); + }, + "stream"_a = 0) + .def("__dlpack_device__", [](turbomind::Tensor* self) { + auto device = getDLDevice(*self); + return std::tuple(int(device.device_type), device.device_id); + }); + m.def( + "from_dlpack", + [](py::object obj) { + py::capsule cap = obj.attr("__dlpack__")(); + DLManagedTensor* dlmt = + static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + auto ret = DLManagedTensorToTurbomindTensor(dlmt); + return ret; + }, + "dl_managed_tensor"_a); + // m.def("convert_qweight", [](py::object qweight, int input_dims, int output_dims, bool simt) { + // py::capsule cap = qweight.attr("__dlpack__")(); + // DLManagedTensor* dlmt = + // static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + // auto _qweight = DLManagedTensorToTurbomindTensor(dlmt); + // _qweight = convert_qweight(_qweight, input_dims, output_dims, simt); + // return *_qweight; + // }); + // m.def("convert_scales_zeros", + // [](py::object scales, py::object qzeros, py::object scales_zeros, int input_dims, int output_dims, int group_size, bool simt) { + // auto cap_scales = scales.attr("__dlpack__")(); + // auto cap_zeros = qzeros.attr("__dlpack__")(); + // auto cap_scales_zeros = scales_zeros.attr("__dlpack__")(); + // auto dlmt_scales = static_cast(PyCapsule_GetPointer(cap_scales.ptr(), kDlTensorCapsuleName)); + // auto dlmt_zeros = static_cast(PyCapsule_GetPointer(cap_zeros.ptr(), kDlTensorCapsuleName)); + // auto dlmt_scales_zeros = static_cast(PyCapsule_GetPointer(cap_scales_zeros.ptr(), kDlTensorCapsuleName)); + // auto _scales = DLManagedTensorToTurbomindTensor(dlmt_scales); + // auto _zeros = DLManagedTensorToTurbomindTensor(dlmt_zeros); + // auto _scales_zeros = DLManagedTensorToTurbomindTensor(dlmt_scales_zeros); + // _scales_zeros = convert_scales_zeros(_scales, _zeros, _scales_zeros, input_dims, output_dims, group_size, simt); + // return *_scales_zeros; + // }); + + // Instantiate turbomind::Linear + py::class_>(m, "Linear") + .def(py::init([](size_t in_features, size_t out_features, int w_bit, int group_size) { + return new turbomind::Linear(in_features, out_features, w_bit, group_size); + })) + .def("post_init", [](turbomind::Linear* linear, py::object qweight, py::object scales, py::object qzeros, + bool simt){ + auto _qweight = TorchTensorToTurbomindTensor(qweight); + auto _scales = TorchTensorToTurbomindTensor(scales); + auto _qzeros = TorchTensorToTurbomindTensor(qzeros); + linear->post_init(_qweight, _scales, _qzeros, simt); + }) + .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { + auto _in = TorchTensorToTurbomindTensor(in); + auto _out = TorchTensorToTurbomindTensor(out); + return linear->forward(_in, _out); + }); +} diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc new file mode 100644 index 0000000..8c68cdd --- /dev/null +++ b/src/turbomind/api/python/linear.cc @@ -0,0 +1,284 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
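+//
+// Linear is implemented with a pimpl: post_init() repacks the packed 4-bit
+// weights into the layout expected by the gemm2 kernels (transposing to the
+// required order, widening to u16, then calling Convert) and fuses the
+// per-group scales and zero points via fuse_scales_and_zeros; forward() then
+// runs a single FP16 x U4 GEMM through gemm::Gemm::Run.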
+ +#include "src/turbomind/kernels/gemm/cast.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/api/python/linear.h" +#include "src/turbomind/utils/cuda_utils.h" +#include +#include +#include + + +namespace turbomind { + +class GemmSinglton { +public: + static GemmSinglton& getInstance() { + static GemmSinglton singleton; + return singleton; + } +private: + GemmSinglton() { + + } + // cublasMMWrapper* cublas_wrapper_; + // gemm::Gemm gemm_; +}; + + +struct Linear::Impl { + + Impl(size_t input_dims, size_t output_dims, int w_bit, int group_size) : + input_dims_(input_dims), output_dims_(output_dims), w_bit_(w_bit), group_size_(group_size) { + workspace_ = {}; + + workspace_.barriers_size = gemm::Gemm::kBarriersSize; + workspace_.partials_size = gemm::Gemm::kPartialsSize; + cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); + cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_); + cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + } + + ~Impl() + { + cudaFreeAsync(workspace_.barriers, stream_); + cudaFreeAsync(workspace_.partials, stream_); + workspace_ = {}; + check_cuda_error(cudaFree(scales_zeros_)); + } + + void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, + bool simt) { + const auto workspace_size = input_dims_ * output_dims_ * sizeof(uint16_t); + void *workspace {}; + check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); + + convert_qweight(workspace, qweight, input_dims_, output_dims_, simt); + convert_scales_zeros(workspace, scales, qzeros, input_dims_, output_dims_, group_size_, simt); + + check_cuda_error(cudaFree(workspace)); + } + + void forward(std::shared_ptr in, std::shared_ptr out) { + TM_CHECK(in->type == TYPE_FP16 && out->type == TYPE_FP16); + TM_CHECK(in->shape.size() == 2 && in->shape[1] == input_dims_); + TM_CHECK(out->shape.size() == 2 && out->shape[0] == in->shape[0] && out->shape[1] == output_dims_); + + using namespace gemm; + + const Operation operation{dispatch_policy_, + Epilogue::kNone, + {QuantType::kNone}, + {QuantType::kDefault, group_size_}, + 0, + nullptr}; + + const MatrixLayout a_desc{ + gemm::DataType::F16, // get_data_type_v, + kRowMajor, + (int)in->shape[0], // row + (int)input_dims_, // col + (int)input_dims_ // input_data.pitch, // input_data.pitch = input_dims_ if input_data.pitch==0 + }; + + const MatrixLayout c_desc{ + gemm::DataType::F16, // get_data_type_v, + kRowMajor, + (int)in->shape[0], // row + (int)output_dims_, // col + (int)output_dims_ + }; + + auto ec = gemm_.Run(operation, + 1.f, + in->data, + a_desc, + nullptr, + {}, + weight_->data, + k_desc_, + scales_zeros_, + q_desc_, + 0.0f, + out->data, + c_desc, + const_cast(out->data), + c_desc, + workspace_, + stream_); + + if (ec) { + // TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); + std::abort(); + } + } + void convert_qweight(void* workspace, std::shared_ptr weight, size_t input_dims, size_t output_dims, bool use_simt) { + // const auto workspace_size = input_dims * output_dims * sizeof(uint16_t); + // void *workspace {}; + // check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); + // std::cout << "where: " << weight->where << ", type: " << weight->type << ", shape: "; + // for (size_t i = 0; i < weight->shape.size(); ++i) { + // std::cout << weight->shape[i] << ", "; + // } + // std::cout << std::endl; + // std::vector _temp(weight->shape[0] * weight->shape[1]); + // 
cudaMemcpy(_temp.data(), weight->data, _temp.size() * sizeof(int), cudaMemcpyDeviceToHost); + // int row = 4095; + // for (size_t i = 0; i < _temp.size() && i < 100; i++) { + // std::cout << _temp[row * weight->shape[1] + i] << ", "; + // } + // cudaDeviceSynchronize(); + + using namespace gemm; + auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); + + + // std::cout << "oder_b: " << int(order_b) << ", input_dims: " << input_dims << ", output_dims: " << output_dims << std::endl; + if (order_b == kColMajor) { + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight->data, input_dims, output_dims); + cudaMemcpy(const_cast(weight->data), workspace, input_dims * output_dims / 2, cudaMemcpyDefault); + } + + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight->data, input_dims * output_dims); + sync_check_cuda_error(); + + if constexpr (0) { + std::vector tmp(input_dims * output_dims); + cudaMemcpy(tmp.data(), workspace, sizeof(uint16_t) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << *it << " "; + } + } + + MatrixLayout w_desc{ + gemm::DataType::F16, + order_b, + (int)input_dims, // k + (int)output_dims, // n + order_b == kRowMajor ? (int)output_dims : (int)input_dims, + }; + + k_desc_ = w_desc; + k_desc_.type = gemm::DataType::U4; + k_desc_.pack = pack_b; + + cudaMemset(const_cast(weight->data), 0, input_dims * output_dims / 2); + + TM_CHECK(Convert(workspace, w_desc, const_cast(weight->data), k_desc_, 0) == 0); + sync_check_cuda_error(); + + cudaDeviceSynchronize(); + + if constexpr (0) { + std::vector tmp(input_dims * output_dims / 8); + cudaMemcpy(tmp.data(), weight->data, sizeof(uint32_t) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + } + + weight_ = weight; + } + + void convert_scales_zeros(void* workspace, + std::shared_ptr scales, + std::shared_ptr qzeros, + size_t input_dims, + size_t output_dims, + int group_size, + bool use_simt) { + const auto scale_count = input_dims / group_size * output_dims; + + using namespace gemm; + auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); + + fuse_scales_and_zeros((half*)workspace, (const half*)scales->data, (half*)qzeros->data, scale_count); + sync_check_cuda_error(); + + cudaDeviceSynchronize(); + + check_cuda_error(cudaMalloc((half**)&scales_zeros_, scale_count * 2)); + + MatrixLayout s_desc{ + gemm::DataType::U32, + order_v, + (int)input_dims / group_size, // k + (int)output_dims, // n + (int)output_dims, + }; + + q_desc_ = s_desc; + q_desc_.pack = pack_v; + + TM_CHECK(Convert(workspace, s_desc, scales_zeros_, q_desc_, 0) == 0); + sync_check_cuda_error(); + + // if constexpr (0) { + // std::vector tmp(scale_count * 2); + // cudaMemcpy(tmp.data(), scales_zeros_, sizeof(half) * tmp.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + // // for (const auto& x: tmp) { + // // std::cout << (float)x << " "; + // // } + // int i = 0; + // for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + // 
std::cout << std::hex << *it << " "; + // } + // i = 0; + // std::cout << "\n"; + // for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + // std::cout << std::hex << *it << " "; + // } + // } + } + +private: + // cublasMMWrapper* cublas_wrapper_; + gemm::Gemm gemm_; + gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; + gemm::Workspace workspace_; + cudaStream_t stream_{}; + + size_t input_dims_; + size_t output_dims_; + int w_bit_; + int group_size_; + + std::shared_ptr weight_; + half* scales_zeros_; + + gemm::MatrixLayout k_desc_; + gemm::MatrixLayout q_desc_; +}; + +Linear::Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size) { + impl_ = std::make_shared(input_dims, output_dims, w_bit, group_size); +} + +void Linear::post_init(std::shared_ptr qweight, + std::shared_ptr scales, + std::shared_ptr qzeros, + bool simt) { + impl_->post_init(qweight, scales, qzeros, simt); +} + +void Linear::forward(std::shared_ptr in, std::shared_ptr out) +{ + impl_->forward(in, out); +} +} // namespace turbomind diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h new file mode 100644 index 0000000..ee56b3a --- /dev/null +++ b/src/turbomind/api/python/linear.h @@ -0,0 +1,50 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/utils/tensor.h" + +namespace turbomind { + + +enum class WeightType : int +{ + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +std::shared_ptr convert_qweight(std::shared_ptr qweight, + size_t input_dims, + size_t output_dims, + bool use_simt); +std::shared_ptr convert_scales_zeros(std::shared_ptr scales, + std::shared_ptr qzeros, + std::shared_ptr scales_zeros, + size_t input_dims, + size_t output_dims, + int group_size, + bool use_simt); + + +class Linear { +public: + Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); + void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, + bool simt); + void forward(std::shared_ptr in, std::shared_ptr out); + ~Linear() {} + +private: + struct Impl; + std::shared_ptr impl_; +}; +}; diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt index 5484719..6fc634d 100644 --- a/src/turbomind/kernels/gemm/CMakeLists.txt +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -34,7 +34,7 @@ target_compile_options(gemm2 PRIVATE set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -# if (BUILD_TEST) +if (BUILD_TEST) add_executable(gemm_test test/gemm_test.cu test/test_utils.cu @@ -61,4 +61,4 @@ set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) test/reference.cu) target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) endif () -# endif () +endif () diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h new file mode 100644 index 0000000..157fd8e --- /dev/null +++ b/src/turbomind/utils/cuda_utils.h @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// #include "src/turbomind/macro.h" +// #include "src/turbomind/utils/cuda_bf16_wrapper.h" +// #include "src/turbomind/utils/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SPARSITY_ENABLED +#include +#endif + +namespace turbomind { + +#define MAX_CONFIG_NUM 20 +#define COL32_ 32 +// workspace for cublas gemm : 32MB +#define CUBLAS_WORKSPACE_SIZE 33554432 + +typedef struct __align__(4) +{ + half x, y, z, w; +} +half4; + +/* **************************** type definition ***************************** */ + +enum CublasDataType +{ + FLOAT_DATATYPE = 0, + HALF_DATATYPE = 1, + BFLOAT16_DATATYPE = 2, + INT8_DATATYPE = 3, + FP8_DATATYPE = 4 +}; + +enum FtCudaDataType +{ + FP32 = 0, + FP16 = 1, + BF16 = 2, + INT8 = 3, + FP8 = 4 +}; + +enum class OperationType +{ + FP32, + FP16, + BF16, + INT8, + FP8 +}; + +/* **************************** debug tools ********************************* */ +static const char* _cudaGetErrorEnum(cudaError_t error) +{ + return cudaGetErrorString(error); +} + +static const char* _cudaGetErrorEnum(cublasStatus_t error) +{ + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return ""; +} + +template +void check(T result, char const* const func, const char* const file, int const line) +{ + if (result) { + throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + + file + ":" + std::to_string(line) + " \n"); + } +} + +#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) +#define check_cuda_error_2(val, file, line) check((val), #val, file, line) + +inline void syncAndCheck(const char* const file, int const line) +{ + // When FT_DEBUG_LEVEL=DEBUG, must check error + static char* level_name = std::getenv("TM_DEBUG_LEVEL"); + if (level_name != nullptr) { + static std::string level = std::string(level_name); + if (level == "DEBUG") { + cudaDeviceSynchronize(); + cudaError_t result = cudaGetLastError(); + if (result) { + throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + + " " + file + ":" + std::to_string(line) + " \n"); + } + // TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); + } + } + +#ifndef NDEBUG + cudaDeviceSynchronize(); + cudaError_t result = cudaGetLastError(); + if (result) { + throw 
std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + + file + ":" + std::to_string(line) + " \n"); + } +#endif +} + +#define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__) + +#define checkCUDNN(expression) \ + { \ + cudnnStatus_t status = (expression); \ + if (status != CUDNN_STATUS_SUCCESS) { \ + std::cerr << "Error on file " << __FILE__ << " line " << __LINE__ << ": " << cudnnGetErrorString(status) \ + << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } + +[[noreturn]] inline void throwRuntimeError(const char* const file, int const line, std::string const& info = "") +{ + throw std::runtime_error(std::string("[TM][ERROR] ") + info + " Assertion fail: " + file + ":" + + std::to_string(line) + " \n"); +} + +inline void myAssert(bool result, const char* const file, int const line, std::string const& info = "") +{ + if (!result) { + throwRuntimeError(file, line, info); + } +} + +#define TM_CHECK(val) myAssert(val, __FILE__, __LINE__) +#define TM_CHECK_WITH_INFO(val, info) \ + do { \ + bool is_valid_val = (val); \ + if (!is_valid_val) { \ + turbomind::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \ + } \ + } while (0) + +#define TM_THROW(info) throwRuntimeError(__FILE__, __LINE__, info) + +#ifdef SPARSITY_ENABLED +#define CHECK_CUSPARSE(func) \ + { \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + throw std::runtime_error(std::string("[TM][ERROR] CUSPARSE API failed at line ") \ + + std::to_string(__LINE__) + " in file " + __FILE__ + ": " \ + + cusparseGetErrorString(status) + " " + std::to_string(status)); \ + } \ + } +#endif + +/*************Time Handling**************/ +class CudaTimer { +private: + cudaEvent_t event_start_; + cudaEvent_t event_stop_; + cudaStream_t stream_; + +public: + explicit CudaTimer(cudaStream_t stream = 0) + { + stream_ = stream; + } + void start() + { + check_cuda_error(cudaEventCreate(&event_start_)); + check_cuda_error(cudaEventCreate(&event_stop_)); + check_cuda_error(cudaEventRecord(event_start_, stream_)); + } + float stop() + { + float time; + check_cuda_error(cudaEventRecord(event_stop_, stream_)); + check_cuda_error(cudaEventSynchronize(event_stop_)); + check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_)); + check_cuda_error(cudaEventDestroy(event_start_)); + check_cuda_error(cudaEventDestroy(event_stop_)); + return time; + } + ~CudaTimer() {} +}; + +/* ***************************** common utils ****************************** */ + +inline void print_mem_usage(std::string time = "after allocation") +{ + size_t free_bytes, total_bytes; + check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes)); + float free = static_cast(free_bytes) / 1024.0 / 1024.0 / 1024.0; + float total = static_cast(total_bytes) / 1024.0 / 1024.0 / 1024.0; + float used = total - free; + printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used); +} + +inline int getSMVersion() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int sm_major = 0; + int sm_minor = 0; + check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); + check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); + return sm_major * 10 + sm_minor; +} + +inline int getMaxSharedMemoryPerBlock() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int max_shared_memory_size = 0; + check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, 
cudaDevAttrMaxSharedMemoryPerBlock, device)); + return max_shared_memory_size; +} + +inline std::string getDeviceName() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + cudaDeviceProp props; + check_cuda_error(cudaGetDeviceProperties(&props, device)); + return std::string(props.name); +} + +inline int div_up(int a, int n) +{ + return (a + n - 1) / n; +} + +cudaError_t getSetDevice(int i_device, int* o_device = NULL); + +inline int getDevice() +{ + int current_dev_id = 0; + check_cuda_error(cudaGetDevice(¤t_dev_id)); + return current_dev_id; +} + +inline int getDeviceCount() +{ + int count = 0; + check_cuda_error(cudaGetDeviceCount(&count)); + return count; +} + +template +CublasDataType getCublasDataType() +{ + if (std::is_same::value) { + return HALF_DATATYPE; + } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + return BFLOAT16_DATATYPE; + } +#endif + else if (std::is_same::value) { + return FLOAT_DATATYPE; + } + else { + TM_CHECK(false); + return FLOAT_DATATYPE; + } +} + +template +cudaDataType_t getCudaDataType() +{ + if (std::is_same::value) { + return CUDA_R_16F; + } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + return CUDA_R_16BF; + } +#endif + else if (std::is_same::value) { + return CUDA_R_32F; + } + else { + TM_CHECK(false); + return CUDA_R_32F; + } +} + +template +struct getTypeFromCudaDataType { + using Type = float; +}; + +template<> +struct getTypeFromCudaDataType { + using Type = half; +}; + +#ifdef ENABLE_BF16 +template<> +struct getTypeFromCudaDataType { + using Type = __nv_bfloat16; +}; +#endif + +// clang-format off +template struct packed_type; +template <> struct packed_type { using type = float; }; // we don't need to pack float by default +template <> struct packed_type { using type = half2; }; + +#ifdef ENABLE_BF16 +template<> +struct packed_type<__nv_bfloat16> { + using type = __nv_bfloat162; +}; +#endif + +template struct num_elems; +template <> struct num_elems { static constexpr int value = 1; }; +template <> struct num_elems { static constexpr int value = 2; }; +template <> struct num_elems { static constexpr int value = 4; }; +template <> struct num_elems { static constexpr int value = 1; }; +template <> struct num_elems { static constexpr int value = 2; }; +#ifdef ENABLE_BF16 +template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; +template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; +#endif + +template struct packed_as; +template struct packed_as { using type = T; }; +template<> struct packed_as { using type = half2; }; +template<> struct packed_as { using type = float2; }; +template<> struct packed_as { using type = int16_t; }; +template<> struct packed_as { using type = int2; }; +template<> struct packed_as { using type = half; }; +#ifdef ENABLE_BF16 +template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; +template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; +#endif + +inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } +// clang-format on + +template +void compareTwoTensor( + const T1* pred, const T2* ref, const int size, const int print_size = 0, const std::string filename = "") +{ + T1* h_pred = new T1[size]; + T2* h_ref = new T2[size]; + check_cuda_error(cudaMemcpy(h_pred, pred, size * sizeof(T1), cudaMemcpyDeviceToHost)); + check_cuda_error(cudaMemcpy(h_ref, ref, size * 
sizeof(T2), cudaMemcpyDeviceToHost)); + + FILE* fd = nullptr; + if (filename != "") { + fd = fopen(filename.c_str(), "w"); + fprintf(fd, "| %10s | %10s | %10s | %10s | \n", "pred", "ref", "abs_diff", "rel_diff(%)"); + } + + if (print_size > 0) { + printf(" id | pred | ref |abs diff | rel diff (%) |"); + } + float mean_abs_diff = 0.0f; + float mean_rel_diff = 0.0f; + int count = 0; + for (int i = 0; i < size; i++) { + if (i < print_size) { + printf("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |", + i, + (float)h_pred[i], + (float)h_ref[i], + abs((float)h_pred[i] - (float)h_ref[i]), + abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); + } + if ((float)h_pred[i] == 0) { + continue; + } + count += 1; + mean_abs_diff += abs((float)h_pred[i] - (float)h_ref[i]); + mean_rel_diff += abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f; + + if (fd != nullptr) { + fprintf(fd, + "| %10.5f | %10.5f | %10.5f | %11.5f |\n", + (float)h_pred[i], + (float)h_ref[i], + abs((float)h_pred[i] - (float)h_ref[i]), + abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); + } + } + mean_abs_diff = mean_abs_diff / (float)count; + mean_rel_diff = mean_rel_diff / (float)count; + printf("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); + + if (fd != nullptr) { + fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); + fclose(fd); + } + delete[] h_pred; + delete[] h_ref; +} + +bool is_16xx_series(const char* name); + +/* ************************** end of common utils ************************** */ +} // namespace turbomind diff --git a/src/turbomind/utils/tensor.h b/src/turbomind/utils/tensor.h new file mode 100644 index 0000000..e765eeb --- /dev/null +++ b/src/turbomind/utils/tensor.h @@ -0,0 +1,86 @@ +#pragma once + + +#include +#include +#include +#include +#include +// #include +// #include +#include +#include + +namespace turbomind{ + +typedef enum datatype_enum +{ + TYPE_INVALID, + TYPE_BOOL, + TYPE_UINT8, + TYPE_UINT16, + TYPE_UINT32, + TYPE_UINT64, + TYPE_INT8, + TYPE_INT16, + TYPE_INT32, + TYPE_INT64, + TYPE_FP16, + TYPE_FP32, + TYPE_FP64, + TYPE_BYTES, + TYPE_BF16 +} DataType; + +typedef enum memorytype_enum +{ + MEMORY_CPU, + MEMORY_CPU_PINNED, + MEMORY_GPU +} MemoryType; + + +struct Tensor { + MemoryType where; + DataType type; + std::vector shape; + const void* data; + + Tensor(): where(MEMORY_CPU), type(TYPE_INVALID), shape({}), data(nullptr) {} + Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): + where(_where), type(_type), shape(_shape), data(_data) + { + } + + size_t size() const { + if (data == nullptr || shape.size() == 0) { + return 0; + } + return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies()); + } + + size_t sizeBytes() const { + return size() * typeSize(); + } + + size_t typeSize() const { + static const std::unordered_map type_map{{TYPE_BOOL, sizeof(bool)}, + {TYPE_BYTES, sizeof(char)}, + {TYPE_UINT8, sizeof(uint8_t)}, + {TYPE_UINT16, sizeof(uint16_t)}, + {TYPE_UINT32, sizeof(uint32_t)}, + {TYPE_UINT64, sizeof(uint64_t)}, + {TYPE_INT8, sizeof(int8_t)}, + {TYPE_INT16, sizeof(int16_t)}, + {TYPE_INT32, sizeof(int32_t)}, + {TYPE_INT64, sizeof(int64_t)}, +#ifdef ENABLE_BF16 + {TYPE_BF16, sizeof(__nv_bfloat16)}, +#endif + {TYPE_FP16, sizeof(half)}, + {TYPE_FP32, sizeof(float)}, + {TYPE_FP64, sizeof(double)}}; + return type_map.at(type); + } +}; +} // namespace turbomind 
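A quick way to sanity-check the new bindings is to round-trip a CUDA tensor
through the DLPack hooks exposed above. The snippet below is a minimal sketch,
not part of the patch: it assumes the turbomind_kernels extension has been
built and is importable (e.g. after adding turbomind/lib to sys.path, as
turbomind/linear.py does).

    import torch
    import turbomind_kernels as tk

    x = torch.arange(16, dtype=torch.float16, device='cuda')
    t = tk.from_dlpack(x)              # zero-copy view of x as a turbomind Tensor
    print(t.where, t.type, t.shape)    # expected: MEMORY_GPU, TYPE_FP16, [16]
    y = torch.from_dlpack(t)           # back to torch via __dlpack__/__dlpack_device__
    assert y.data_ptr() == x.data_ptr()  # same underlying CUDA buffer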
diff --git a/turbomind/__init__.py b/turbomind/__init__.py index ef101fe..5de11b0 100644 --- a/turbomind/__init__.py +++ b/turbomind/__init__.py @@ -1 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. + +from .linear import Linear + +__all__ = ['Linear'] diff --git a/turbomind/linear.py b/turbomind/linear.py index 0b322a1..3abdc59 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -1,8 +1,229 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging +import os.path as osp +import sys +from typing import List + import torch -# from . import utils +import turbomind + +turbomind_dir = osp.split(turbomind.__file__)[0] +sys.path.append(osp.join(turbomind_dir, 'lib')) + +try: + import turbomind_kernels + TURBOMIND_KERNELS_INSTALLED = True +except Exception as e: + logging.error(f'turbomind_kernels is not installed: {e}') + TURBOMIND_KERNELS_INSTALLED = False + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.uint8 + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +def transpose(x): + return x.t() if x is not None else x + + +def pad_out_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(-1) + assert pad >= 0 + return torch.nn.functional.pad(x, (0, pad), 'constant', 0) + + +def pad_in_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(0) + assert x.dim() == 2 + assert pad >= 0 + return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0) + + +def to_cuda(x: torch.Tensor, *args): + return x.cuda() + + +def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: + assert x.dtype == torch.int32 + xs = [] + for _ in range(8): + xs.append((x & 15).to(dtype)) + x = x >> 4 + return xs + + +def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: + xs = get_u4_slices(x, torch.uint8) + order = [0, 4, 1, 5, 2, 6, 3, 7] + ys = [xs[i] for i in order] + return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) + + +def process_awq_gemm(x: torch.Tensor, kind: str): + x = x.cuda() + if x.dtype == torch.int32: + x = unpack_awq_gemm(x) + if kind in ['qweight', 'qzeros', 'scales']: + x = x.t() + return x + + +def process_gptq(x: torch.Tensor, kind: str): + x = x.cuda() + if x.dtype == torch.int32: + xs = get_u4_slices(x, torch.uint8) + if kind == 'qweight': # (k/8,n) + x = torch.stack(xs, dim=1).view(-1, x.size(-1)) + else: # 'qzeros' (k/g,n/8) + x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 + if kind in ['qweight', 'qzeros', 'scales']: + x = x.t() + return x class Linear(torch.nn.Module): - pass + + def __init__(self, + in_features, + out_features, + bias: bool = False, + quant_method: str = '', + w_bit: int = 4, + group_size: int = 128, + device: str = 'cuda'): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError( + f'Only 4-bit is supported for now, but got {w_bit} bit') + if group_size != 128: + raise NotImplementedError( + f'Only group_size 128 is supported for now, ' + f'but got group_size {group_size}') + if bias: + raise NotImplementedError('bias has not been supported yet') + self.w_bit = w_bit + self.group_size = group_size + self.in_features = in_features + self.out_features = out_features + self.bias = bias + self.quant_method = quant_method + + # quick sanity check (make sure alignment) + assert self.in_features % self.group_size == 0 + assert self.out_features % (32 // self.w_bit) == 0 + + self.register_buffer( + 'qweight', + torch.zeros( + 
(in_features, out_features // (32 // self.w_bit)), + dtype=torch.int32, + device=device, + ), + ) + self.register_buffer( + 'qzeros', + torch.zeros( + (in_features // self.group_size, out_features // + (32 // self.w_bit)), + dtype=torch.int32, + device=device, + ), + ) + self.register_buffer( + 'scales', + torch.zeros( + (in_features // self.group_size, out_features), + dtype=torch.float16, + device=device, + ), + ) + + # if bias: + # self.register_buffer( + # 'bias', + # torch.zeros( + # (out_features), + # dtype=torch.float16, + # device=device, + # ), + # ) + # else: + # self.bias = None + + self.linear = turbomind_kernels.Linear(self.in_features, + self.out_features, self.w_bit, + self.group_size) + + def post_init(self): + assert self.qweight.device.type == 'cuda' + if self.quant_method == 'awq': + self.qweight = unpack_awq_gemm(self.qweight).t() + self.qzeros = unpack_awq_gemm(self.qzeros).t() + self.scales = self.scales.t() + elif self.quant_method == 'gptq': + xs = get_u4_slices(self.qweight, torch.uint8) + self.qweight = torch.stack(xs, dim=1).view(-1, + self.qweight.size(-1)) + xs = get_u4_slices(self.qzeros, torch.uint8) + self.qzeros = torch.stack(xs, dim=-1).view(self.qzeros.size(0), + -1) + 1 + self.qweight = self.qweight.t() + self.qzeros = self.qzeros.t() + self.scales = self.scales.t() + else: + return + + self.qweight = transpose(self.qweight) + self.qzeros = transpose(self.qzeros) + self.scales = transpose(self.scales) + + self.qweight = pack_u4_row(self.qweight) + self.qzeros = self.qzeros.to(torch.half) + + device_id = self.qweight.device.index + properties = torch.cuda.get_device_properties(device_id) + + def is_16xx_series(name): + import re + pattern = r'GTX 16\d\d' + return bool(re.search(pattern, name)) + + simt = is_16xx_series(properties.name) + self.qweight = self.qweight.contiguous() + self.scales = self.scales.contiguous() + self.qzeros = self.qzeros.contiguous() + self.linear.post_init(self.qweight, self.scales, self.qzeros, simt) + + @torch.no_grad() + def forward(self, x): + assert TURBOMIND_KERNELS_INSTALLED, ( + 'turbomind kernels are not installed. 
' + 'Please perform `pip install turbomind` to install turbomind ' + 'kernels.') + input_dtype = x.dtype + if input_dtype != torch.float16: + x = x.half() + x = x.view(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (self.out_features, ) + out = torch.empty( + (x.shape[0], self.out_features), + dtype=torch.float16, + device=x.device, + ) + self.linear.forward(x, out) + out = torch.from_dlpack(out) + print(out) + return out.view(out_shape) + + def __call__(self, x: torch.Tensor): + return self.forward(x) + + def to_half(x: torch.Tensor): + return x.to(torch.half) From e6eaafe42140de3fe3cf730f5dff50805e1e341e Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:25:47 +0800 Subject: [PATCH 06/18] remove example/awq --- example/awq/__init__.py | 0 example/awq/_config.py | 109 ------------- example/awq/act.py | 12 -- example/awq/auto.py | 64 -------- example/awq/base.py | 244 ----------------------------- example/awq/gemm.py | 287 ----------------------------------- example/awq/llama.py | 81 ---------- example/awq/module.py | 60 -------- example/awq/packing_utils.py | 97 ------------ example/awq/utils.py | 110 -------------- 10 files changed, 1064 deletions(-) delete mode 100644 example/awq/__init__.py delete mode 100644 example/awq/_config.py delete mode 100644 example/awq/act.py delete mode 100644 example/awq/auto.py delete mode 100644 example/awq/base.py delete mode 100644 example/awq/gemm.py delete mode 100644 example/awq/llama.py delete mode 100644 example/awq/module.py delete mode 100644 example/awq/packing_utils.py delete mode 100644 example/awq/utils.py diff --git a/example/awq/__init__.py b/example/awq/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/example/awq/_config.py b/example/awq/_config.py deleted file mode 100644 index a966e5e..0000000 --- a/example/awq/_config.py +++ /dev/null @@ -1,109 +0,0 @@ -import json -import os -from dataclasses import dataclass, field -from typing import Dict, List, Optional - -from transformers.utils.hub import PushToHubMixin, cached_file - - -@dataclass -class AwqConfig(PushToHubMixin): - quant_method: str = field(default='awq') - zero_point: bool = field(default=True) - q_group_size: int = field(default=128) - w_bit: int = field(default=4) - version: str = field(default='gemm') - config_file_name = 'config.json' - modules_to_not_convert: Optional[List] = None - - @classmethod - def from_dict(cls, quant_config: Dict = {}): - if not quant_config: - quant_config = cls() - else: - quant_config = cls(**quant_config) - quant_config.version = quant_config.version.lower() - - return quant_config - - @classmethod - def from_pretrained(cls, save_dir: str, **kwargs): - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - local_files_only = kwargs.pop('local_files_only', False) - use_auth_token = kwargs.pop('use_auth_token', None) - revision = kwargs.pop('revision', None) - subfolder = kwargs.pop('subfolder', None) - commit_hash = kwargs.pop('_commit_hash', None) - - if os.path.isdir(save_dir): # Local - resolved_config_file = os.path.join(save_dir, cls.config_file_name) - else: # Remote - resolved_config_file = cached_file( - save_dir, - cls.config_file_name, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - use_auth_token=use_auth_token, - revision=revision, - local_files_only=local_files_only, - subfolder=subfolder, - 
_raise_exceptions_for_missing_entries=False, - _raise_exceptions_for_connection_errors=False, - _commit_hash=commit_hash, - ) - - quant_config = None - if os.path.exists(resolved_config_file): - with open(resolved_config_file, 'r', encoding='utf-8') as file: - loaded_config = json.loads(file.read()) - - quant_config = loaded_config.get('quantization_config') - - if quant_config is not None: - awq_config = cls.from_transformers_dict(cls, quant_config) - quant_config = cls(**awq_config) - - if quant_config is None: - quant_config = cls() - - return quant_config - - def to_dict(self): - return { - 'zero_point': self.zero_point, - 'q_group_size': self.q_group_size, - 'w_bit': self.w_bit, - 'version': self.version, - 'modules_to_not_convert': self.modules_to_not_convert, - } - - def to_transformers_dict(self): - return { - 'quant_method': self.quant_method, - 'zero_point': self.zero_point, - 'group_size': self.q_group_size, - 'bits': self.w_bit, - 'version': self.version.lower(), - 'modules_to_not_convert': self.modules_to_not_convert, - } - - def from_transformers_dict(self, transformers_dict: Dict): - return { - 'quant_method': - transformers_dict.get('quant_method'), - 'zero_point': - transformers_dict.get('zero_point'), - 'q_group_size': - transformers_dict.get('group_size'), - 'w_bit': - transformers_dict.get('bits'), - 'version': - transformers_dict.get('version'), - 'modules_to_not_convert': - transformers_dict.get('modules_to_not_convert'), - } diff --git a/example/awq/act.py b/example/awq/act.py deleted file mode 100644 index 59ca7c4..0000000 --- a/example/awq/act.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch.nn as nn - - -class ScaledActivation(nn.Module): - - def __init__(self, module, scales): - super().__init__() - self.act = module - self.scales = nn.Parameter(scales.data) - - def forward(self, x): - return self.act(x) / self.scales.view(1, 1, -1).to(x.device) diff --git a/example/awq/auto.py b/example/awq/auto.py deleted file mode 100644 index a18db0c..0000000 --- a/example/awq/auto.py +++ /dev/null @@ -1,64 +0,0 @@ -import logging -import os - -from transformers import AutoConfig - -from .base import BaseAWQForCausalLM -from .llama import LlamaAWQForCausalLM - -AWQ_CAUSAL_LM_MODEL_MAP = { - 'llama': LlamaAWQForCausalLM, -} - - -def check_and_get_model_type(model_dir, **model_init_kwargs): - config = AutoConfig.from_pretrained(model_dir, - trust_remote_code=True, - **model_init_kwargs) - if config.model_type not in AWQ_CAUSAL_LM_MODEL_MAP.keys(): - raise TypeError(f"{config.model_type} isn't supported yet.") - model_type = config.model_type - return model_type - - -class AutoAWQForCausalLM: - - def __init__(self): - raise EnvironmentError( - 'You must instantiate AutoAWQForCausalLM with\n' - 'AutoAWQForCausalLM.from_quantized or AutoAWQForCausalLM.' - 'from_pretrained') - - @classmethod - def from_quantized( - self, - quant_path, - max_seq_len=2048, - fuse_layers=True, - batch_size=1, - device_map='balanced', - max_memory=None, - offload_folder=None, - download_kwargs=None, - **config_kwargs, - ) -> BaseAWQForCausalLM: - os.environ['AWQ_BATCH_SIZE'] = str(batch_size) - model_type = check_and_get_model_type(quant_path) - - if config_kwargs.get('max_new_tokens') is not None: - max_seq_len = config_kwargs['max_new_tokens'] - logging.warning( - 'max_new_tokens argument is deprecated... 
gracefully ' - 'setting max_seq_len=max_new_tokens.') - - return AWQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized( - quant_path, - model_type, - max_seq_len, - fuse_layers=fuse_layers, - device_map=device_map, - max_memory=max_memory, - offload_folder=offload_folder, - download_kwargs=download_kwargs, - **config_kwargs, - ) diff --git a/example/awq/base.py b/example/awq/base.py deleted file mode 100644 index 9a815c4..0000000 --- a/example/awq/base.py +++ /dev/null @@ -1,244 +0,0 @@ -import gc -from typing import Dict, Union - -import torch -import torch.nn as nn -import transformers -from accelerate.big_modeling import (init_empty_weights, - load_checkpoint_and_dispatch) -from tqdm import tqdm -from transformers import AutoConfig, PretrainedConfig, PreTrainedModel -from typing_extensions import Annotated, Doc - -from ._config import AwqConfig -from .act import ScaledActivation -from .gemm import WQLinear_GEMM -from .module import (exclude_layers_to_not_quantize, get_named_linears, - set_op_by_name) - -# from turbomind import Linear -# from turbomind.utils import turbomind_post_init - -# Since we support different `AutoModelForxxx` from transformers -# we need to define a custom mapping dict as below: -TRANSFORMERS_AUTO_MAPPING_DICT = { - 'llama': 'AutoModelForCausalLM', -} - - -class BaseAWQForCausalLM(nn.Module): - - def __init__( - self, - model, - model_type, - is_quantized, - config, - quant_config, - ): - """The base model for all AutoAWQ models. - - Args: - model: The pretrained or quantized model. - model_type: The model type, found in config.json. - is_quantized: Indicates if the current model is quantized - config: The config of the model. - quant_config: The quantization config of the model. - """ - super().__init__() - self.model: PreTrainedModel = model - self.model_type: str = model_type - self.is_quantized: bool = is_quantized - self.search_result = None - self.config: PretrainedConfig = config - self.quant_config: AwqConfig = quant_config - - def to(self, device: Annotated[str, - Doc('The device to move your model to.')]): - """A utility function for moving the model to a device.""" - return self.model.to(device) - - def forward(self, *args, **kwargs): - """A forward function that mimics the torch forward.""" - return self.model(*args, **kwargs) - - def generate(self, *args, **kwargs): - """A generate function that mimics the HF generate function.""" - with torch.inference_mode(): - return self.model.generate(*args, **kwargs) - - # @staticmethod - # def fuse_layers(model): - # pass - - @classmethod - def from_quantized(self, - model_path: str, - model_type: str, - max_seq_len: int, - torch_dtype: torch.dtype = torch.float16, - device_map: Union[str, Dict] = 'balanced', - **config_kwargs: Dict): - """A method for initialization of a quantized model, usually in INT4. - - Args: - model_path (str): The model path - model_type (str): The model type, loaded from config.json. - max_seq_len (int): The maximum sequence cached sequence length of - the model. Larger values may increase loading time and - memory usage. - torch_dtype: The dtype to load the model as. May not work with - other values than float16. - device_map: A device map that will be passed onto the model - loading method from transformers. 
- **config_kwargs: Additional kwargs that are passed to the config - during initialization - """ - # [STEP 1-2] Load weights path and configs - model_weights_path, config, quant_config = self._load_config( - self, - model_path, - max_seq_len=max_seq_len, - **config_kwargs, - ) - - target_cls_name = TRANSFORMERS_AUTO_MAPPING_DICT[config.model_type] - target_cls = getattr(transformers, target_cls_name) - - # [STEP 3] Load model - with init_empty_weights(): - model = target_cls.from_config( - config=config, - torch_dtype=torch_dtype, - trust_remote_code=True, - ) - # Prepare WQLinear layers, replace nn.Linear - self._load_quantized_modules( - self, - model, - quant_config, - quant_config.version, - use_exllama=False, - use_exllama_v2=False, - use_qbits=False, - ) - - model.tie_weights() - - # loads the weights into modules and distributes - # across available devices automatically - load_checkpoint_and_dispatch( - model, - checkpoint=model_weights_path, - device_map=device_map, - no_split_module_classes=[self.layer_type], - dtype=torch_dtype, - ) - - # TODO - # model = turbomind_post_init(model) - - # # Dispatch to devices - # if fuse_layers: - # self.fuse_layers(model) - - model.eval() - - return self( - model, - model_type, - is_quantized=True, - config=config, - quant_config=quant_config, - ) - - def _load_config( - self, - model_path, - max_seq_len=4096, - **config_kwargs, - ): - # [STEP 2] Load config and set sequence length - # TODO: Create BaseAWQConfig class - quant_config = AwqConfig.from_pretrained(model_path) - - # Load model config and set max generation length - if max_seq_len is None and hasattr(self, 'max_seq_len_key'): - config = AutoConfig.from_pretrained(model_path, - trust_remote_code=True, - **config_kwargs) - config.max_seq_len = getattr(config, self.max_seq_len_key, 2048) - # To add the generate support for Multi-modal models as well - if hasattr(config, 'text_config'): - config.text_config.max_seq_len = getattr( - config, self.max_seq_len_key, 2048) - else: - max_seq_len = 2048 if max_seq_len is None else max_seq_len - config = AutoConfig.from_pretrained(model_path, - trust_remote_code=True, - **config_kwargs) - config.max_seq_len = max_seq_len - - return model_path, config, quant_config - - def _load_quantized_modules(self, - model, - quant_config, - version, - use_exllama, - use_exllama_v2, - use_qbits=False): - # Real quantization of weights - assert not (version == 'gemv' and - (use_exllama or use_exllama_v2 or - use_qbits)), 'Exllama kernels only support GEMM version.' 
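The loop that follows walks each decoder block from get_model_layers, collects its nn.Linear children with get_named_linears, drops anything listed in quant_config.modules_to_not_convert, and swaps every remaining linear for a WQLinear_GEMM built by from_linear(..., init_only=True) before the checkpoint is dispatched. A minimal, self-contained sketch of that find-and-swap pattern follows; DummyQuantLinear and swap_linears are illustrative stand-ins, not part of the AWQ or turbomind APIs.

import torch.nn as nn


class DummyQuantLinear(nn.Module):
    """Illustrative stand-in for a quantized linear (e.g. WQLinear_GEMM)."""

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.in_features = linear.in_features
        self.out_features = linear.out_features

    def forward(self, x):
        # a real implementation would run the packed int4 GEMM here
        return x.new_zeros(*x.shape[:-1], self.out_features)


def swap_linears(block: nn.Module, skip=()):
    """Replace every nn.Linear in `block` unless its name matches `skip`."""
    for name, module in list(block.named_modules()):
        if isinstance(module, nn.Linear) and not any(k in name for k in skip):
            parent_name, _, child_name = name.rpartition('.')
            parent = block.get_submodule(parent_name) if parent_name else block
            setattr(parent, child_name, DummyQuantLinear(module))


block = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
swap_linears(block)
print(block)  # both Linear layers are now DummyQuantLinear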
- - # Get blocks of model - layers = self.get_model_layers(model) - - for i in tqdm(range(len(layers)), desc='Replacing layers...'): - layer = layers[i] - - # Get every linear layer in a block - named_linears = get_named_linears(layer) - - # Filter out the linear layers we don't want to include - named_linears = exclude_layers_to_not_quantize( - named_linears, quant_config.modules_to_not_convert) - - # Replace activation functions - self._scale_activations(self, layer) - - # Replace nn.Linear with WQLinear - for name, module in named_linears.items(): - assert version == 'gemm' - - q_linear_module = WQLinear_GEMM - # q_linear_module = Linear - q_linear = q_linear_module.from_linear( - module, quant_config.w_bit, quant_config.q_group_size, - True) - q_linear.to(next(layer.parameters()).device) - set_op_by_name(layer, name, q_linear) - - if not use_qbits: - torch.cuda.empty_cache() - gc.collect() - - @staticmethod - def _scale_activations(self, layer): - scale_dict = self.get_act_for_scaling(layer) - - if scale_dict['is_scalable']: - if not isinstance(scale_dict['scale_layer'], ScaledActivation): - param = next(layer.parameters()) - - # get activation scale - scale_like = torch.ones(scale_dict['scale_shape'], - dtype=param.dtype, - device=param.device) - - # scale activation - scaled_act = ScaledActivation(scale_dict['scale_layer'], - scale_like) - set_op_by_name(layer, scale_dict['scale_name'], scaled_act) diff --git a/example/awq/gemm.py b/example/awq/gemm.py deleted file mode 100644 index 664b4f5..0000000 --- a/example/awq/gemm.py +++ /dev/null @@ -1,287 +0,0 @@ -import warnings - -import torch -import torch.nn as nn -from torch.autograd import Function - -from .packing_utils import dequantize_gemm -from .utils import get_best_device - -try: - import awq_ext # with CUDA kernels (AutoAWQ_kernels) - - AWQ_INSTALLED = True -except Exception as ex: - AWQ_INSTALLED = False - warnings.warn( - f'AutoAWQ could not load GEMM kernels extension. Details: {ex}') - - -# Adapted from https://github.com/compressa-ai/AutoAWQ/tree/dev -class WQLinearMMFunction(Function): - - @staticmethod - # ctx is the first argument to forward - def forward( - ctx, - x, - qweight, - qzeros, - scales, - w_bit=4, - group_size=128, - bias=None, - out_features=0, - ): - # The forward pass can use ctx. - ctx.save_for_backward(x, qweight, qzeros, scales, bias) - ctx.out_features = out_features - - out_shape = x.shape[:-1] + (out_features, ) - x = x.to(torch.float16) - - if AWQ_INSTALLED: - FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 1024 - - if FP16_MATMUL_HEURISTIC_CONDITION: - out = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, - 0, 0, 0, False) - out = torch.matmul(x, out) - else: - out = awq_ext.gemm_forward_cuda(x.reshape(-1, x.shape[-1]), - qweight, scales, qzeros, 8) - else: - out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) - out = torch.matmul(x, out) - - out = out + bias if bias is not None else out - out = out.reshape(out_shape) - - # always want 3D tensor if tensor is 2D - if len(out.shape) == 2: - out = out.unsqueeze(0) - - return out - - @staticmethod - def backward(ctx, grad_output): - input, qweight, qzeros, scales, bias = ctx.saved_tensors - - if not AWQ_INSTALLED: - raise ValueError( - 'auto-awq kernels is needed to be installed to ' - 'use `.backward()`. 
Make sure to install the auto-awq kernels' - ' by following the installation guides in ' - 'https://github.com/casper-hansen/AutoAWQ_kernels') - - # Cast to correct dtype for mixed precision training - weights = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, 1, - 0, 0, - False).to(grad_output.dtype) - - if ctx.needs_input_grad[0]: - # 3D matmul using torch.bmm: - # https://pytorch.org/docs/stable/generated/torch.bmm.html#torch.bmm # noqa - # to propagate gradient across all batch sizes. - batch_size = grad_output.shape[0] - grad_input = grad_output.bmm( - weights.transpose(0, 1).unsqueeze(0).repeat(batch_size, 1, 1)) - - return grad_input, None, None, None, None, None, None, None - - -class WQLinear_GEMM(nn.Module): - - def __init__(self, - w_bit, - group_size, - in_features, - out_features, - bias, - dev, - training=False): - super().__init__() - - if w_bit not in [4]: - raise NotImplementedError('Only 4-bit are supported for now.') - - self.in_features = in_features - self.out_features = out_features - self.w_bit = w_bit - self.group_size = group_size if group_size != -1 else in_features - self.training = training - - # quick sanity check (make sure alignment) - assert self.in_features % self.group_size == 0 - assert out_features % (32 // self.w_bit) == 0 - - self.register_buffer( - 'qweight', - torch.zeros( - (in_features, out_features // (32 // self.w_bit)), - dtype=torch.int32, - device=dev, - ), - ) - self.register_buffer( - 'qzeros', - torch.zeros( - (in_features // self.group_size, out_features // - (32 // self.w_bit)), - dtype=torch.int32, - device=dev, - ), - ) - self.register_buffer( - 'scales', - torch.zeros( - (in_features // self.group_size, out_features), - dtype=torch.float16, - device=dev, - ), - ) - if bias: - self.register_buffer( - 'bias', - torch.zeros( - (out_features), - dtype=torch.float16, - device=dev, - ), - ) - else: - self.bias = None - - @classmethod - def from_linear(cls, - linear, - w_bit, - group_size, - init_only=False, - scales=None, - zeros=None): - awq_linear = cls( - w_bit, - group_size, - linear.in_features, - linear.out_features, - linear.bias is not None, - linear.weight.device, - ) - if init_only: # just prepare for loading sd - return awq_linear - - # need scales and zeros info for real quantization - assert scales is not None and zeros is not None - scale_zeros = zeros * scales - - awq_linear.scales = scales.clone().half() - if linear.bias is not None: - awq_linear.bias = linear.bias.clone().half() - - pack_num = 32 // awq_linear.w_bit - - intweight = [] - for idx in range(awq_linear.in_features): - intweight.append( - torch.round( - (linear.weight.data[:, idx] + - scale_zeros[idx // group_size]) / - awq_linear.scales[idx // group_size]).to(torch.int)[:, - None]) - intweight = torch.cat(intweight, dim=1) - intweight = intweight.t().contiguous() - intweight = intweight.to(dtype=torch.int32) - - best_device = get_best_device() - - # Avoid: The operator 'aten::__lshift__.Scalar' is not currently - # implemented for the MPS device - if 'mps' in best_device: - intweight = intweight.to('cpu') - - qweight = torch.zeros( - (intweight.shape[0], intweight.shape[1] // 32 * awq_linear.w_bit), - dtype=torch.int32, - device=intweight.device, - ) - - for col in range(intweight.shape[1] // pack_num): - if awq_linear.w_bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - qweight_col = intweight[:, col * pack_num + order_map[i]] - qweight[:, col] |= 
qweight_col << (i * awq_linear.w_bit) - awq_linear.qweight = qweight - - zeros = zeros.to(dtype=torch.int32, device=best_device) - - if 'mps' in best_device: - zeros = zeros.to('cpu') - - qzeros = torch.zeros( - (zeros.shape[0], zeros.shape[1] // 32 * awq_linear.w_bit), - dtype=torch.int32, - device=zeros.device, - ) - - for col in range(zeros.shape[1] // pack_num): - if awq_linear.w_bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - qzero_col = zeros[:, col * pack_num + order_map[i]] - qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit) - awq_linear.qzeros = qzeros - - return awq_linear - - def forward(self, x): - out_shape = x.shape[:-1] + (self.out_features, ) - - input_dtype = x.dtype - if input_dtype != torch.float16: - x = x.half() - - if self.training: - out = WQLinearMMFunction.apply( - x, - self.qweight, - self.qzeros, - self.scales, - self.w_bit, - self.group_size, - self.bias, - self.out_features, - ) - else: - with torch.no_grad(): - out = WQLinearMMFunction.apply( - x, - self.qweight, - self.qzeros, - self.scales, - self.w_bit, - self.group_size, - self.bias, - self.out_features, - ) - - if input_dtype != torch.float16: - out = out.to(dtype=input_dtype) - - return out.reshape(out_shape) - - def extra_repr(self) -> str: - return ( - 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}' - .format( - self.in_features, - self.out_features, - self.bias is not None, - self.w_bit, - self.group_size, - )) diff --git a/example/awq/llama.py b/example/awq/llama.py deleted file mode 100644 index f221e48..0000000 --- a/example/awq/llama.py +++ /dev/null @@ -1,81 +0,0 @@ -# from awq.utils.fused_utils import fuse_qkv -# from awq.modules.fused.block import LlamaLikeBlock -# from awq.modules.fused.model import LlamaLikeModel -from transformers.models.llama.modeling_llama import \ - LlamaDecoderLayer as OldLlamaDecoderLayer -from transformers.models.llama.modeling_llama import \ - LlamaForCausalLM as OldLlamaForCausalLM - -from .base import BaseAWQForCausalLM - -# from awq.modules.fused.norm import FasterTransformerRMSNorm - - -class LlamaAWQForCausalLM(BaseAWQForCausalLM): - layer_type = 'LlamaDecoderLayer' - max_seq_len_key = 'max_position_embeddings' - - # @staticmethod - # def fuse_layers(model: OldLlamaForCausalLM): - # fuser = LlamaFuser(model) - # fuser.fuse_transformer() - - @staticmethod - def get_model_layers(model: OldLlamaForCausalLM): - return model.model.layers - - @staticmethod - def get_act_for_scaling(module: OldLlamaDecoderLayer): - return dict(is_scalable=False) - - @staticmethod - def move_embed(model: OldLlamaForCausalLM, device: str): - model.model.embed_tokens = model.model.embed_tokens.to(device) - - @staticmethod - def get_layers_for_scaling(module: OldLlamaDecoderLayer, input_feat, - module_kwargs): - layers = [] - - # attention input - layers.append( - dict( - prev_op=module.input_layernorm, - layers=[ - module.self_attn.q_proj, - module.self_attn.k_proj, - module.self_attn.v_proj, - ], - inp=input_feat['self_attn.q_proj'], - module2inspect=module.self_attn, - kwargs=module_kwargs, - )) - - # attention out - # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 # noqa - if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: # noqa - layers.append( - dict( - prev_op=module.self_attn.v_proj, - layers=[module.self_attn.o_proj], - inp=input_feat['self_attn.o_proj'], - )) - - # linear 1 - 
layers.append( - dict( - prev_op=module.post_attention_layernorm, - layers=[module.mlp.gate_proj, module.mlp.up_proj], - inp=input_feat['mlp.gate_proj'], - module2inspect=module.mlp, - )) - - # linear 2 - layers.append( - dict( - prev_op=module.mlp.up_proj, - layers=[module.mlp.down_proj], - inp=input_feat['mlp.down_proj'], - )) - - return layers diff --git a/example/awq/module.py b/example/awq/module.py deleted file mode 100644 index aeefb66..0000000 --- a/example/awq/module.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch.nn as nn - - -def get_named_linears(module): - return { - name: m - for name, m in module.named_modules() if isinstance(m, nn.Linear) - } - - -def get_op_by_name(module, op_name): - # get the op by its name relative to the module - for name, m in module.named_modules(): - if name == op_name: - return m - raise ValueError(f'Cannot find op {op_name} in module {module}') - - -def set_op_by_name(layer, name, new_module): - levels = name.split('.') - if len(levels) > 1: - mod_ = layer - for l_idx in range(len(levels) - 1): - if levels[l_idx].isdigit(): - mod_ = mod_[int(levels[l_idx])] - else: - mod_ = getattr(mod_, levels[l_idx]) - setattr(mod_, levels[-1], new_module) - else: - setattr(layer, name, new_module) - - -def get_op_name(module, op): - # get the name of the op relative to the module - for name, m in module.named_modules(): - if m is op: - return name - raise ValueError(f'Cannot find op {op} in module {module}') - - -def append_str_prefix(x, prefix): - if isinstance(x, str): - return prefix + x - elif isinstance(x, tuple): - return tuple([append_str_prefix(y, prefix) for y in x]) - elif isinstance(x, list): - return [append_str_prefix(y, prefix) for y in x] - else: - return x - - -def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): - if modules_to_not_convert is None: - return linear_layers - - filtered_layers = {} - for name, linear_layer in linear_layers.items(): - if not any(key in name for key in modules_to_not_convert): - filtered_layers[name] = linear_layer - return filtered_layers diff --git a/example/awq/packing_utils.py b/example/awq/packing_utils.py deleted file mode 100644 index a9724fa..0000000 --- a/example/awq/packing_utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch - -AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] -AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] - - -def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): - shifts = torch.arange(0, 32, bits, device=qzeros.device) - - # unpacking columnwise - iweights = torch.bitwise_right_shift( - qweight[:, :, None], - shifts[None, None, :]).to(torch.int8 # smallest dtype available - ) - iweights = iweights.view(iweights.shape[0], -1) - - # unpacking columnwise - if qzeros is not None: - izeros = torch.bitwise_right_shift( - qzeros[:, :, None], - shifts[None, None, :]).to(torch.int8 # smallest dtype available - ) - izeros = izeros.view(izeros.shape[0], -1) - else: - izeros = qzeros - - return iweights, izeros - - -def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): - reverse_order_tensor = torch.arange( - iweights.shape[-1], - dtype=torch.int32, - device=izeros.device, - ) - reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) - reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] - reverse_order_tensor = reverse_order_tensor.view(-1) - - if izeros is not None: - izeros = izeros[:, reverse_order_tensor] - iweights = iweights[:, reverse_order_tensor] - - return iweights, izeros - - -def pack_exllama(iweights: 
torch.Tensor, izeros: torch.Tensor, bits: int): - shifts = torch.arange(0, 32, bits, device=iweights.device) - - # packing rowwise - iweights = iweights.view(iweights.shape[0] // (32 // bits), 32 // bits, -1) - qweight = (torch.bitwise_left_shift( - iweights, shifts[None, :, None]).sum(dim=1).to(torch.int32)) - - # packing columnwise - izeros = izeros.view(-1, izeros.shape[1] // (32 // bits), 32 // bits) - qzeros = (torch.bitwise_left_shift( - izeros, shifts[None, None, :]).sum(dim=-1).to(torch.int32)) - - return qweight, qzeros - - -def unpack_reorder_pack(qweight, qzeros, bits): - # Unpack the qweight and qzeros tensors - iweight, izeros = unpack_awq(qweight, qzeros, bits) - # Reverse the order of the iweight and izeros tensors - iweight, izeros = reverse_awq_order(iweight, izeros, bits) - - # overflow checks - iweight = torch.bitwise_and(iweight, (2**bits) - 1) - izeros = torch.bitwise_and(izeros, (2**bits) - 1) - - # Subtract 1 from the izeros tensor (exllama adds 1 during inference) - # We can remove it if we remove the +1 in the exllama code - izeros = izeros - 1 - # Pack the qweight and qzeros tensors - qweight, qzeros = pack_exllama(iweight, izeros, bits) - - return qweight, qzeros - - -def dequantize_gemm(qweight, qzeros, scales, bits, group_size): - # Unpack the qweight and qzeros tensors - iweight, izeros = unpack_awq(qweight, qzeros, bits) - # Reverse the order of the iweight and izeros tensors - iweight, izeros = reverse_awq_order(iweight, izeros, bits) - - # overflow checks - iweight = torch.bitwise_and(iweight, (2**bits) - 1) - izeros = torch.bitwise_and(izeros, (2**bits) - 1) - - # fp16 weights - scales = scales.repeat_interleave(group_size, dim=0) - izeros = izeros.repeat_interleave(group_size, dim=0) - iweight = (iweight - izeros) * scales - - return iweight diff --git a/example/awq/utils.py b/example/awq/utils.py deleted file mode 100644 index 3ba594c..0000000 --- a/example/awq/utils.py +++ /dev/null @@ -1,110 +0,0 @@ -import gc -import importlib - -import accelerate -import torch - -qbits_available = importlib.util.find_spec( - 'intel_extension_for_transformers') is not None - - -def get_module_by_name_suffix(model, module_name: str): - for name, module in model.named_modules(): - if name.endswith(module_name): - return module - - -def simple_dispatch_model(model, device_map): - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - if '' in device_map: - d = device_map[''] - model = model.to(torch.device(d)) - model.hf_device_map = device_map - return model - - tied_params = accelerate.utils.modeling.find_tied_parameters(model) - if set(device_map.values()) == {'cpu'} or set(device_map.values()) == { - 'cpu', - 'disk', - }: - main_device = 'cpu' - else: - main_device = [ - d for d in device_map.values() if d not in ['cpu', 'disk'] - ][0] - - cpu_offload_group = [(n, d) for n, d in device_map.items() if d == 'cpu'] - prev_hook = None - for idx, (n, d) in enumerate(cpu_offload_group): - m = get_module_by_name_suffix(model, n) - _, prev_hook = accelerate.cpu_offload_with_hook( - m, execution_device=main_device, prev_module_hook=prev_hook) - # set first cpu offload module's prev_module_hook - # to the last cpu offload module's hook - if len(cpu_offload_group) > 1: - get_module_by_name_suffix( - model, - cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook - - for n, d in device_map.items(): - m = get_module_by_name_suffix(model, n) - if d != 'cpu': - d = torch.device(d) - hook = AlignDevicesHook(d, - io_same_device=True, - place_submodules=True) - 
add_hook_to_module(m, hook) - accelerate.utils.modeling.retie_parameters(model, tied_params) - model.hf_device_map = device_map - - return model - - -def set_module_name(model, name, value): - if '.' in name: - parent_name = name.rsplit('.', 1)[0] - child_name = name[len(parent_name) + 1:] - parent = model.get_submodule(parent_name) - else: - parent_name = '' - parent = model - child_name = name - - setattr(parent, child_name, value) - - -def clear_memory(weight=None): - if weight is not None: - del weight - gc.collect() - torch.cuda.empty_cache() - - -def compute_memory_used_pct(device): - memory_used = torch.cuda.max_memory_allocated(device) / (1024**3) - memory_pct = (memory_used / - (torch.cuda.get_device_properties(device).total_memory / - (1024**3)) * 100) - return memory_pct - - -def get_best_device(): - if torch.backends.mps.is_available(): - return 'mps' - elif torch.cuda.is_available(): - return 'cuda:0' - else: - return 'cpu' - - -def get_lowest_memory_device_index(): - device = None - curr_device_memory_pct = 0 - for device_index in range(torch.cuda.device_count()): - device_memory_pct = compute_memory_used_pct(device_index) - if device is None or device_memory_pct < curr_device_memory_pct: - device = device_index - curr_device_memory_pct = device_memory_pct - - return device From a8c64c4095d5b0f9d49a6227f349da6a64207239 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:31:09 +0800 Subject: [PATCH 07/18] remove useless code --- example/generate.py | 44 ------------------------------ src/turbomind/api/python/bind.cpp | 22 --------------- src/turbomind/api/python/linear.cc | 20 +------------- 3 files changed, 1 insertion(+), 85 deletions(-) delete mode 100644 example/generate.py diff --git a/example/generate.py b/example/generate.py deleted file mode 100644 index cca53ab..0000000 --- a/example/generate.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from awq.auto import AutoAWQForCausalLM -from transformers import AutoTokenizer, TextStreamer - -quant_path = '/mnt/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ' - -# Load model -model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True) - -tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) -streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - -prompt = "You're standing on the surface of the Earth. "\ - 'You walk one mile south, one mile west and one mile north. '\ - 'You end up exactly where you started. Where are you?' - -chat = [ - { - 'role': 'system', - 'content': 'You are a concise assistant that helps answer questions.' 
- }, - { - 'role': 'user', - 'content': prompt - }, -] - -terminators = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids('<|eot_id|>') -] - -tokens = tokenizer.apply_chat_template(chat, return_tensors='pt') -tokens = tokens.to(torch.device('cuda')) - -# Generate output -generation_output = model.generate(tokens, - streamer=streamer, - max_new_tokens=64, - eos_token_id=terminators) - -res = tokenizer.decode(generation_output[0].cpu().numpy().tolist()) -print(f'token_ids: {generation_output}') -print(f'output: {res}') diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index bd81fd9..409fc5d 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -324,28 +324,6 @@ PYBIND11_MODULE(turbomind_kernels, m) { return ret; }, "dl_managed_tensor"_a); - // m.def("convert_qweight", [](py::object qweight, int input_dims, int output_dims, bool simt) { - // py::capsule cap = qweight.attr("__dlpack__")(); - // DLManagedTensor* dlmt = - // static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); - // auto _qweight = DLManagedTensorToTurbomindTensor(dlmt); - // _qweight = convert_qweight(_qweight, input_dims, output_dims, simt); - // return *_qweight; - // }); - // m.def("convert_scales_zeros", - // [](py::object scales, py::object qzeros, py::object scales_zeros, int input_dims, int output_dims, int group_size, bool simt) { - // auto cap_scales = scales.attr("__dlpack__")(); - // auto cap_zeros = qzeros.attr("__dlpack__")(); - // auto cap_scales_zeros = scales_zeros.attr("__dlpack__")(); - // auto dlmt_scales = static_cast(PyCapsule_GetPointer(cap_scales.ptr(), kDlTensorCapsuleName)); - // auto dlmt_zeros = static_cast(PyCapsule_GetPointer(cap_zeros.ptr(), kDlTensorCapsuleName)); - // auto dlmt_scales_zeros = static_cast(PyCapsule_GetPointer(cap_scales_zeros.ptr(), kDlTensorCapsuleName)); - // auto _scales = DLManagedTensorToTurbomindTensor(dlmt_scales); - // auto _zeros = DLManagedTensorToTurbomindTensor(dlmt_zeros); - // auto _scales_zeros = DLManagedTensorToTurbomindTensor(dlmt_scales_zeros); - // _scales_zeros = convert_scales_zeros(_scales, _zeros, _scales_zeros, input_dims, output_dims, group_size, simt); - // return *_scales_zeros; - // }); // Instantiate turbomind::Linear py::class_>(m, "Linear") diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index 8c68cdd..cfb1202 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -109,32 +109,14 @@ struct Linear::Impl { stream_); if (ec) { - // TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); + printf("%s: %d", __PRETTY_FUNCTION__, ec); std::abort(); } } void convert_qweight(void* workspace, std::shared_ptr weight, size_t input_dims, size_t output_dims, bool use_simt) { - // const auto workspace_size = input_dims * output_dims * sizeof(uint16_t); - // void *workspace {}; - // check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); - // std::cout << "where: " << weight->where << ", type: " << weight->type << ", shape: "; - // for (size_t i = 0; i < weight->shape.size(); ++i) { - // std::cout << weight->shape[i] << ", "; - // } - // std::cout << std::endl; - // std::vector _temp(weight->shape[0] * weight->shape[1]); - // cudaMemcpy(_temp.data(), weight->data, _temp.size() * sizeof(int), cudaMemcpyDeviceToHost); - // int row = 4095; - // for (size_t i = 0; i < _temp.size() && i < 100; i++) { - // std::cout << _temp[row * weight->shape[1] + i] << ", "; - // } - // cudaDeviceSynchronize(); - 
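The convert_qweight step below re-packs the AWQ int4 weight into the layout the GEMM kernel expects, transposing it first when the kernel wants column-major B. Since two 4-bit values share one byte, a K x N u4 matrix occupies K * N / 2 bytes, which is why the copy back from the workspace moves input_dims * output_dims / 2 bytes. A rough Python sketch of that nibble-level bookkeeping; the low-nibble-first order is an assumption made for illustration, not necessarily the order transpose_u4 uses internally.

import torch


def unpack_u4(packed: torch.Tensor, rows: int, cols: int) -> torch.Tensor:
    # each byte holds two 4-bit values: low nibble first (assumed layout)
    flat = packed.flatten()
    lo = flat & 0xF
    hi = (flat >> 4) & 0xF
    return torch.stack([lo, hi], dim=-1).reshape(rows, cols)


def pack_u4(vals: torch.Tensor) -> torch.Tensor:
    # pack consecutive pairs of values into single bytes
    flat = vals.flatten().to(torch.uint8)
    lo, hi = flat[0::2], flat[1::2]
    return (lo | (hi << 4)).contiguous()


rows, cols = 4, 8
vals = torch.randint(0, 16, (rows, cols), dtype=torch.uint8)
packed = pack_u4(vals)                      # rows * cols / 2 bytes
assert packed.numel() == rows * cols // 2
# a "u4 transpose" is unpack -> transpose -> repack
packed_t = pack_u4(unpack_u4(packed, rows, cols).t().contiguous())
assert torch.equal(unpack_u4(packed_t, cols, rows), vals.t())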
using namespace gemm; auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); - - // std::cout << "oder_b: " << int(order_b) << ", input_dims: " << input_dims << ", output_dims: " << output_dims << std::endl; if (order_b == kColMajor) { transpose_u4((uint4_t*)workspace, (const uint4_t*)weight->data, input_dims, output_dims); cudaMemcpy(const_cast(weight->data), workspace, input_dims * output_dims / 2, cudaMemcpyDefault); From 9b6e35bd3a564f80c34d4ca13d4a4cfb91955ccf Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:42:58 +0800 Subject: [PATCH 08/18] update --- turbomind/linear.py | 50 ++------------------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/turbomind/linear.py b/turbomind/linear.py index 3abdc59..c91380a 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -2,12 +2,13 @@ import logging import os.path as osp import sys -from typing import List import torch import turbomind +from .utils import get_u4_slices, pack_u4_row, unpack_awq_gemm + turbomind_dir = osp.split(turbomind.__file__)[0] sys.path.append(osp.join(turbomind_dir, 'lib')) @@ -19,15 +20,6 @@ TURBOMIND_KERNELS_INSTALLED = False -def pack_u4_row(x: torch.Tensor) -> torch.Tensor: - assert x.dtype == torch.uint8 - xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) - a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) - for t in reversed(xs): - a = (a << 4) | t - return a.squeeze(dim=-1) - - def transpose(x): return x.t() if x is not None else x @@ -49,44 +41,6 @@ def to_cuda(x: torch.Tensor, *args): return x.cuda() -def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: - assert x.dtype == torch.int32 - xs = [] - for _ in range(8): - xs.append((x & 15).to(dtype)) - x = x >> 4 - return xs - - -def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: - xs = get_u4_slices(x, torch.uint8) - order = [0, 4, 1, 5, 2, 6, 3, 7] - ys = [xs[i] for i in order] - return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) - - -def process_awq_gemm(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - x = unpack_awq_gemm(x) - if kind in ['qweight', 'qzeros', 'scales']: - x = x.t() - return x - - -def process_gptq(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - xs = get_u4_slices(x, torch.uint8) - if kind == 'qweight': # (k/8,n) - x = torch.stack(xs, dim=1).view(-1, x.size(-1)) - else: # 'qzeros' (k/g,n/8) - x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 - if kind in ['qweight', 'qzeros', 'scales']: - x = x.t() - return x - - class Linear(torch.nn.Module): def __init__(self, From 2fc6cf751db69b77e5fe13a542c3d55c1808754f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 14 Oct 2024 16:49:24 +0800 Subject: [PATCH 09/18] fix cudmalloc error size --- example/test_linear.py | 36 +++++++-------- generate.sh | 1 + src/turbomind/api/python/linear.cc | 74 ++++++++++++++++++++++-------- turbomind/linear.py | 17 ++++++- 4 files changed, 91 insertions(+), 37 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 2bd6e46..87572bd 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -1,4 +1,5 @@ import torch +import torch.nn as nn from safetensors import safe_open import turbomind as tm @@ -76,7 +77,7 @@ def makup_qzeros(in_features: int, out_features: int, group_size: int): return qzeros -def makup_scales(in_features: int, out_featurse: int, group_size: int): +def makup_scales(in_features: int, out_features: int, group_size: 
int): assert in_features % group_size == 0 and in_features // group_size >= 1 scales = torch.rand((in_features // group_size, out_features), dtype=torch.float16, @@ -106,27 +107,15 @@ def dequantize(qweight, qzeros, scales, group_size: int = 128): # out_features=out_features, # group_size=group_size) # scales = makup_scales(in_features, -# out_featurse=out_features, +# out_features=out_features, # group_size=group_size) -# weight = dequantize(qweight, qzeros, scales, group_size) -# print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') -# ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda') -# with torch.no_grad(): -# ref_linear.weight = nn.Parameter(weight.T) - -# x = torch.randn(in_features, device=weight.device, dtype=weight.dtype) -# print(f'input: {x}') -# print(weight.device, x.device) -# ref_res = ref_linear(x) -# print(ref_res) - def load_specified_linear_weights(): ckpt_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ/model-00001-of-00002.safetensors' # noqa layer_id = 0 # prefix = f'model.layers.{layer_id}.self_attn.q_proj.' - prefix = f'model.layers.{layer_id}.mlp.gate_proj.' + prefix = f'model.layers.{layer_id}.self_attn.o_proj.' keys = ['qweight', 'qzeros', 'scales'] tensors = {} with safe_open(ckpt_path, framework='pt', device='cuda') as f: @@ -144,6 +133,16 @@ def load_specified_linear_weights(): in_features = qweight.shape[0] out_features = qweight.shape[1] * 8 +x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) + +weight = dequantize(qweight, qzeros, scales, group_size) +print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') +ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda') +with torch.no_grad(): + ref_linear.weight = nn.Parameter(weight.T) + ref_res = ref_linear(x) + print(f'nn.linear.res: {ref_res}') + model = tm.Linear(in_features=in_features, out_features=out_features, bias=False, @@ -157,7 +156,8 @@ def load_specified_linear_weights(): model.post_init() -x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) res = model(x) -# max_diff = max(abs(ref_res - res)) -# ave_diff = sum(abs(ref_res - res)) / ref_res.numel() +print(f'tm.linear.res: {res}') +max_diff = torch.max(abs(ref_res - res)) +ave_diff = torch.sum(abs(ref_res - res)) / ref_res.numel() +print(f'max_diff {max_diff}, ave_diff {ave_diff}') diff --git a/generate.sh b/generate.sh index 09c51a7..4083da6 100755 --- a/generate.sh +++ b/generate.sh @@ -5,6 +5,7 @@ cmake .. 
\ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ -DCMAKE_CUDA_FLAGS="-lineinfo" \ -DUSE_NVTX=ON \ + -DBUILD_TEST=ON \ -DFETCHCONTENT_UPDATES_DISCONNECTED=ON \ -DLMDEPLOY_ASAN_ENABLE=OFF \ -DLMDEPLOY_UBSAN_ENABLE=OFF \ diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index cfb1202..aec546b 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -8,6 +8,7 @@ #include #include #include +#include namespace turbomind { @@ -50,6 +51,7 @@ struct Linear::Impl { void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, bool simt) { + cudaDeviceSynchronize(); const auto workspace_size = input_dims_ * output_dims_ * sizeof(uint16_t); void *workspace {}; check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); @@ -184,6 +186,41 @@ struct Linear::Impl { size_t output_dims, int group_size, bool use_simt) { + if constexpr (0) { + std::cout << "scales: " << std::endl; + std::vector<__half> tmp(input_dims / group_size * output_dims); + cudaMemcpy(tmp.data(), scales->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + i = 0; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + } + + if constexpr (0) { + std::cout << "zeros: " << std::endl; + std::vector<__half> tmp(input_dims / group_size * output_dims / 8); + cudaMemcpy(tmp.data(), qzeros->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + i = 0; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + } + + const auto scale_count = input_dims / group_size * output_dims; using namespace gemm; @@ -194,7 +231,7 @@ struct Linear::Impl { cudaDeviceSynchronize(); - check_cuda_error(cudaMalloc((half**)&scales_zeros_, scale_count * 2)); + check_cuda_error(cudaMalloc(&scales_zeros_, sizeof(uint16_t) * scale_count * 2)); MatrixLayout s_desc{ gemm::DataType::U32, @@ -210,23 +247,24 @@ struct Linear::Impl { TM_CHECK(Convert(workspace, s_desc, scales_zeros_, q_desc_, 0) == 0); sync_check_cuda_error(); - // if constexpr (0) { - // std::vector tmp(scale_count * 2); - // cudaMemcpy(tmp.data(), scales_zeros_, sizeof(half) * tmp.size(), cudaMemcpyDefault); - // cudaDeviceSynchronize(); - // // for (const auto& x: tmp) { - // // std::cout << (float)x << " "; - // // } - // int i = 0; - // for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { - // std::cout << std::hex << *it << " "; - // } - // i = 0; - // std::cout << "\n"; - // for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { - // std::cout << std::hex << *it << " "; - // } - // } + if constexpr (0) { + std::vector<__half> tmp(scale_count * 2); + cudaMemcpy(tmp.data(), scales_zeros_, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + // for (const auto& x: tmp) { + // std::cout << (float)x << " "; + // } + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + i = 0; + for (auto it = tmp.rbegin(); i < 1000 && 
it != tmp.rend(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + } } private: diff --git a/turbomind/linear.py b/turbomind/linear.py index c91380a..4a19913 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -153,6 +153,22 @@ def is_16xx_series(name): self.qweight = self.qweight.contiguous() self.scales = self.scales.contiguous() self.qzeros = self.qzeros.contiguous() + + # _list = self.scales.flatten().cpu().tolist() + # def print_formatted_float(a): + # formatted_list = " ".join(f"{num:.8f}" for num in a) + # print(formatted_list) + # print_formatted_float(_list[0:1000]) + # print_formatted_float(_list[-1000:][::-1]) + + # def print_formatted_int(a): + # formatted_list = ' '.join(f'{num:.0f}' for num in a) + # print(formatted_list) + + # _list = self.qzeros.flatten().cpu().tolist() + # print_formatted_int(_list[0:1000]) + # print_formatted_int(_list[-1000:][::-1]) + self.linear.post_init(self.qweight, self.scales, self.qzeros, simt) @torch.no_grad() @@ -173,7 +189,6 @@ def forward(self, x): ) self.linear.forward(x, out) out = torch.from_dlpack(out) - print(out) return out.view(out_shape) def __call__(self, x: torch.Tensor): From a25827b7ecec500b5ac9ac30a5490ea2c1f7af9c Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 14 Oct 2024 17:36:34 +0800 Subject: [PATCH 10/18] update --- src/turbomind/api/python/linear.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index aec546b..d2bc25c 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -205,7 +205,7 @@ struct Linear::Impl { if constexpr (0) { std::cout << "zeros: " << std::endl; - std::vector<__half> tmp(input_dims / group_size * output_dims / 8); + std::vector<__half> tmp(input_dims / group_size * output_dims); cudaMemcpy(tmp.data(), qzeros->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); int i = 0; @@ -249,7 +249,7 @@ struct Linear::Impl { if constexpr (0) { std::vector<__half> tmp(scale_count * 2); - cudaMemcpy(tmp.data(), scales_zeros_, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaMemcpy(tmp.data(), workspace, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); // for (const auto& x: tmp) { // std::cout << (float)x << " "; From 584c80d79e49fc69ded71a1e08ecb41e9b2a60cd Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 14 Oct 2024 18:08:39 +0800 Subject: [PATCH 11/18] remove useless code --- example/test_linear.py | 96 ++---------------------------------------- 1 file changed, 3 insertions(+), 93 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 87572bd..1293929 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -3,92 +3,14 @@ from safetensors import safe_open import turbomind as tm +from turbomind.utils import unpack_awq_gemm torch.manual_seed(0) -def i32x8_to_i4x8(w): - """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" - assert w.shape[-1] % 8 == 0 - shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) - shape = shape[:-1] + (1, ) - result = torch.zeros(shape, dtype=w.dtype, device=w.device) - mask = torch.tensor([15], dtype=w.dtype, device=w.device) - for i in range(8): - shift = 4 * (7 - i) - result[..., 0] |= (w[..., i] & mask) << shift - result = result.view(w.shape[0], -1) - return result - - -def i4x8_to_i32x8(w): - """split one integer every 4bits into 8 integers (range from 0 to 15)""" - shape = w.shape + (8, ) - 
result = torch.zeros(shape, dtype=w.dtype, device=w.device) - mask = torch.tensor([15], dtype=w.dtype, device=w.device) - for i in range(8): - shift = 4 * (7 - i) - result[..., i] = (w >> shift) & mask - result = result.view(w.shape[0], -1) - return result - - -# ## test i4x8_to_i32x8 -# value = 1636164468 -# print(hex(value)) -# a = torch.tensor([[value, value], [value, value]], dtype=torch.int32) -# b = i4x8_to_i32x8(a) -# print(b) -# c = i32x8_to_i4x8(b) -# print(c) -# cmp = a == c -# assert torch.sum(cmp) == cmp.numel() -# exit(0) -# ## end test - - -def makeup_qweight(in_features: int, out_features: int): - assert out_features % 8 == 0 - qweight = torch.randint(0, - 16, (in_features, out_features // 8, 8), - dtype=torch.int32, - device='cuda') - print(f'-- makeup qweight: shape {qweight.shape}') - print(qweight.view(in_features, -1)) - qweight = i32x8_to_i4x8(qweight) - print(f'-- merge qweight: shape {qweight.shape}') - print(qweight) - return qweight - - -def makup_qzeros(in_features: int, out_features: int, group_size: int): - assert out_features % 8 == 0 - assert in_features % group_size == 0 and in_features // group_size >= 1 - - qzeros = torch.randint(0, - 16, - (in_features // group_size, out_features // 8, 8), - dtype=torch.int32, - device='cuda') - print(f'-- makeup qzero: shape {qzeros.shape}') - print(qzeros.view(in_features // group_size, -1)) - qzeros = i32x8_to_i4x8(qzeros) - print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}') - return qzeros - - -def makup_scales(in_features: int, out_features: int, group_size: int): - assert in_features % group_size == 0 and in_features // group_size >= 1 - scales = torch.rand((in_features // group_size, out_features), - dtype=torch.float16, - device='cuda') - print(f'-- makeup scales: shape {scales.shape}\n{scales}') - return scales - - def dequantize(qweight, qzeros, scales, group_size: int = 128): - _qweight = i4x8_to_i32x8(qweight) - _qzeros = i4x8_to_i32x8(qzeros) + _qweight = unpack_awq_gemm(qweight) + _qzeros = unpack_awq_gemm(qzeros) _qzeros = _qzeros.half() weight = _qweight.clone().half() for i in range(qzeros.shape[0]): @@ -99,18 +21,6 @@ def dequantize(qweight, qzeros, scales, group_size: int = 128): return weight -# in_features = 128 -# out_features = 8 -# group_size = 128 -# qweight = makeup_qweight(in_features, out_features) -# qzeros = makup_qzeros(in_features=in_features, -# out_features=out_features, -# group_size=group_size) -# scales = makup_scales(in_features, -# out_features=out_features, -# group_size=group_size) - - def load_specified_linear_weights(): ckpt_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ/model-00001-of-00002.safetensors' # noqa layer_id = 0 From 3136c42c73acfaa91e7579dccce34942c99ff19e Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 11:49:39 +0800 Subject: [PATCH 12/18] change the data type of zeros, scales, in and out from std::shared_ptr to reference --- src/turbomind/api/python/bind.cpp | 4 +-- src/turbomind/api/python/linear.cc | 43 ++++++++++++------------------ src/turbomind/api/python/linear.h | 16 ++--------- 3 files changed, 21 insertions(+), 42 deletions(-) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 409fc5d..276cd12 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -335,11 +335,11 @@ PYBIND11_MODULE(turbomind_kernels, m) { auto _qweight = TorchTensorToTurbomindTensor(qweight); auto _scales = TorchTensorToTurbomindTensor(scales); auto _qzeros = 
TorchTensorToTurbomindTensor(qzeros); - linear->post_init(_qweight, _scales, _qzeros, simt); + linear->post_init(_qweight, *_scales, *_qzeros, simt); }) .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { auto _in = TorchTensorToTurbomindTensor(in); auto _out = TorchTensorToTurbomindTensor(out); - return linear->forward(_in, _out); + return linear->forward(*_in, *_out); }); } diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index d2bc25c..6641110 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -49,9 +49,7 @@ struct Linear::Impl { check_cuda_error(cudaFree(scales_zeros_)); } - void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, - bool simt) { - cudaDeviceSynchronize(); + void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt) { const auto workspace_size = input_dims_ * output_dims_ * sizeof(uint16_t); void *workspace {}; check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); @@ -62,10 +60,10 @@ struct Linear::Impl { check_cuda_error(cudaFree(workspace)); } - void forward(std::shared_ptr in, std::shared_ptr out) { - TM_CHECK(in->type == TYPE_FP16 && out->type == TYPE_FP16); - TM_CHECK(in->shape.size() == 2 && in->shape[1] == input_dims_); - TM_CHECK(out->shape.size() == 2 && out->shape[0] == in->shape[0] && out->shape[1] == output_dims_); + void forward(const Tensor& in, Tensor& out) { + TM_CHECK(in.type == TYPE_FP16 && out.type == TYPE_FP16); + TM_CHECK(in.shape.size() == 2 && in.shape[1] == input_dims_); + TM_CHECK(out.shape.size() == 2 && out.shape[0] == in.shape[0] && out.shape[1] == output_dims_); using namespace gemm; @@ -79,7 +77,7 @@ struct Linear::Impl { const MatrixLayout a_desc{ gemm::DataType::F16, // get_data_type_v, kRowMajor, - (int)in->shape[0], // row + (int)in.shape[0], // row (int)input_dims_, // col (int)input_dims_ // input_data.pitch, // input_data.pitch = input_dims_ if input_data.pitch==0 }; @@ -87,14 +85,14 @@ struct Linear::Impl { const MatrixLayout c_desc{ gemm::DataType::F16, // get_data_type_v, kRowMajor, - (int)in->shape[0], // row + (int)in.shape[0], // row (int)output_dims_, // col (int)output_dims_ }; auto ec = gemm_.Run(operation, 1.f, - in->data, + in.data, a_desc, nullptr, {}, @@ -103,9 +101,9 @@ struct Linear::Impl { scales_zeros_, q_desc_, 0.0f, - out->data, + out.data, c_desc, - const_cast(out->data), + const_cast(out.data), c_desc, workspace_, stream_); @@ -180,8 +178,8 @@ struct Linear::Impl { } void convert_scales_zeros(void* workspace, - std::shared_ptr scales, - std::shared_ptr qzeros, + const Tensor& scales, + const Tensor& qzeros, size_t input_dims, size_t output_dims, int group_size, @@ -189,7 +187,7 @@ struct Linear::Impl { if constexpr (0) { std::cout << "scales: " << std::endl; std::vector<__half> tmp(input_dims / group_size * output_dims); - cudaMemcpy(tmp.data(), scales->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaMemcpy(tmp.data(), scales.data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); int i = 0; for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { @@ -206,7 +204,7 @@ struct Linear::Impl { if constexpr (0) { std::cout << "zeros: " << std::endl; std::vector<__half> tmp(input_dims / group_size * output_dims); - cudaMemcpy(tmp.data(), qzeros->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaMemcpy(tmp.data(), qzeros.data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); 
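For reference, the granularity behind these buffers: every output column carries one (scale, zero) pair per group_size input channels, so there are input_dims / group_size * output_dims pairs in total, the scale_count defined just below and handed to fuse_scales_and_zeros. A small sketch of that arithmetic and of the per-group dequantization it supports, with made-up shapes:

import torch

in_features, out_features, group_size = 256, 8, 128
q = torch.randint(0, 16, (in_features, out_features), dtype=torch.uint8)
z = torch.randint(0, 16, (in_features // group_size, out_features), dtype=torch.uint8)
s = torch.rand(in_features // group_size, out_features, dtype=torch.float16)

scale_count = in_features // group_size * out_features  # pairs to fuse
# each group of `group_size` input rows shares one (scale, zero) pair
w = (q.float() - z.repeat_interleave(group_size, dim=0).float()) \
    * s.repeat_interleave(group_size, dim=0).float()
print(scale_count, w.shape)  # 16, torch.Size([256, 8])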
cudaDeviceSynchronize(); int i = 0; for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { @@ -226,7 +224,7 @@ struct Linear::Impl { using namespace gemm; auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); - fuse_scales_and_zeros((half*)workspace, (const half*)scales->data, (half*)qzeros->data, scale_count); + fuse_scales_and_zeros((half*)workspace, (const half*)scales.data, (half*)qzeros.data, scale_count); sync_check_cuda_error(); cudaDeviceSynchronize(); @@ -251,9 +249,6 @@ struct Linear::Impl { std::vector<__half> tmp(scale_count * 2); cudaMemcpy(tmp.data(), workspace, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); - // for (const auto& x: tmp) { - // std::cout << (float)x << " "; - // } int i = 0; for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { std::cout << __half2float(*it) << " "; @@ -268,7 +263,6 @@ struct Linear::Impl { } private: - // cublasMMWrapper* cublas_wrapper_; gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; gemm::Workspace workspace_; @@ -290,14 +284,11 @@ Linear::Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size) impl_ = std::make_shared(input_dims, output_dims, w_bit, group_size); } -void Linear::post_init(std::shared_ptr qweight, - std::shared_ptr scales, - std::shared_ptr qzeros, - bool simt) { +void Linear::post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt) { impl_->post_init(qweight, scales, qzeros, simt); } -void Linear::forward(std::shared_ptr in, std::shared_ptr out) +void Linear::forward(const Tensor& in, Tensor& out) { impl_->forward(in, out); } diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h index ee56b3a..11e81eb 100644 --- a/src/turbomind/api/python/linear.h +++ b/src/turbomind/api/python/linear.h @@ -22,25 +22,13 @@ enum class WeightType : int kINT4 }; -std::shared_ptr convert_qweight(std::shared_ptr qweight, - size_t input_dims, - size_t output_dims, - bool use_simt); -std::shared_ptr convert_scales_zeros(std::shared_ptr scales, - std::shared_ptr qzeros, - std::shared_ptr scales_zeros, - size_t input_dims, - size_t output_dims, - int group_size, - bool use_simt); - class Linear { public: Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); - void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, + void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt); - void forward(std::shared_ptr in, std::shared_ptr out); + void forward(const Tensor& in, Tensor& out); ~Linear() {} private: From 2de746ec3918a65bf82f5a6de0fded49d4daf6f6 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 12:00:56 +0800 Subject: [PATCH 13/18] rename to _turbomind_ext --- CMakeLists.txt | 2 +- src/turbomind/api/python/CMakeLists.txt | 2 +- src/turbomind/api/python/bind.cpp | 3 +-- turbomind/linear.py | 10 +++++----- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29299f2..9ef743b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,7 +150,7 @@ link_directories( add_subdirectory(src/turbomind) # install python api -install(TARGETS turbomind_kernels DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) +install(TARGETS _turbomind_ext DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) if (MSVC) diff --git a/src/turbomind/api/python/CMakeLists.txt b/src/turbomind/api/python/CMakeLists.txt index 
bf9acb7..4bda327 100644 --- a/src/turbomind/api/python/CMakeLists.txt +++ b/src/turbomind/api/python/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. cmake_minimum_required(VERSION 3.8) -project(turbomind_kernels) +project(_turbomind_ext) find_package(pybind11 CONFIG) if(NOT pybind11_FOUND) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 276cd12..8140b46 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -14,7 +14,6 @@ #include namespace py = pybind11; -// namespace tm = turbomind; using namespace pybind11::literals; static const char kDlTensorCapsuleName[] = "dltensor"; @@ -209,7 +208,7 @@ std::shared_ptr TorchTensorToTurbomindTensor(py::object obj) return DLManagedTensorToTurbomindTensor(dlmt); } -PYBIND11_MODULE(turbomind_kernels, m) { +PYBIND11_MODULE(_turbomind_ext, m) { py::enum_(m, "WeightType") .value("kFP32", turbomind::WeightType::kFP32) .value("kFP16", turbomind::WeightType::kFP16) diff --git a/turbomind/linear.py b/turbomind/linear.py index 4a19913..f09f944 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -13,10 +13,10 @@ sys.path.append(osp.join(turbomind_dir, 'lib')) try: - import turbomind_kernels + import _turbomind_ext TURBOMIND_KERNELS_INSTALLED = True except Exception as e: - logging.error(f'turbomind_kernels is not installed: {e}') + logging.error(f'_turbomind_ext is not installed: {e}') TURBOMIND_KERNELS_INSTALLED = False @@ -111,9 +111,9 @@ def __init__(self, # else: # self.bias = None - self.linear = turbomind_kernels.Linear(self.in_features, - self.out_features, self.w_bit, - self.group_size) + self.linear = _turbomind_ext.Linear(self.in_features, + self.out_features, self.w_bit, + self.group_size) def post_init(self): assert self.qweight.device.type == 'cuda' From 66ca496971012fc5d6cef08b3f317e7cc556fa23 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 12:03:51 +0800 Subject: [PATCH 14/18] remove useless code --- turbomind/linear.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/turbomind/linear.py b/turbomind/linear.py index f09f944..e888248 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -20,10 +20,6 @@ TURBOMIND_KERNELS_INSTALLED = False -def transpose(x): - return x.t() if x is not None else x - - def pad_out_dims(x: torch.Tensor, dims: int): pad = dims - x.size(-1) assert pad >= 0 @@ -118,9 +114,9 @@ def __init__(self, def post_init(self): assert self.qweight.device.type == 'cuda' if self.quant_method == 'awq': - self.qweight = unpack_awq_gemm(self.qweight).t() - self.qzeros = unpack_awq_gemm(self.qzeros).t() - self.scales = self.scales.t() + self.qweight = unpack_awq_gemm(self.qweight) + self.qzeros = unpack_awq_gemm(self.qzeros) + self.scales = self.scales elif self.quant_method == 'gptq': xs = get_u4_slices(self.qweight, torch.uint8) self.qweight = torch.stack(xs, dim=1).view(-1, @@ -128,16 +124,12 @@ def post_init(self): xs = get_u4_slices(self.qzeros, torch.uint8) self.qzeros = torch.stack(xs, dim=-1).view(self.qzeros.size(0), -1) + 1 - self.qweight = self.qweight.t() - self.qzeros = self.qzeros.t() - self.scales = self.scales.t() + self.qweight = self.qweight + self.qzeros = self.qzeros + self.scales = self.scales else: return - self.qweight = transpose(self.qweight) - self.qzeros = transpose(self.qzeros) - self.scales = transpose(self.scales) - self.qweight = pack_u4_row(self.qweight) self.qzeros = self.qzeros.to(torch.half) @@ -153,22 +145,6 @@ def 
is_16xx_series(name): self.qweight = self.qweight.contiguous() self.scales = self.scales.contiguous() self.qzeros = self.qzeros.contiguous() - - # _list = self.scales.flatten().cpu().tolist() - # def print_formatted_float(a): - # formatted_list = " ".join(f"{num:.8f}" for num in a) - # print(formatted_list) - # print_formatted_float(_list[0:1000]) - # print_formatted_float(_list[-1000:][::-1]) - - # def print_formatted_int(a): - # formatted_list = ' '.join(f'{num:.0f}' for num in a) - # print(formatted_list) - - # _list = self.qzeros.flatten().cpu().tolist() - # print_formatted_int(_list[0:1000]) - # print_formatted_int(_list[-1000:][::-1]) - self.linear.post_init(self.qweight, self.scales, self.qzeros, simt) @torch.no_grad() From 824c9e89883b4eea7254ba36d3a37263394df911 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 14:32:16 +0800 Subject: [PATCH 15/18] update linear test --- example/test_linear.py | 79 ++++++++++++++++++++++++++++++++++++------ turbomind/linear.py | 30 ++++++++-------- 2 files changed, 85 insertions(+), 24 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 1293929..025f60c 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -8,6 +8,53 @@ torch.manual_seed(0) +def i32x8_to_i4x8(w): + """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" + assert w.shape[-1] % 8 == 0 + shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) + shape = shape[:-1] + (1, ) + result = torch.zeros(shape, dtype=w.dtype, device=w.device) + mask = torch.tensor([15], dtype=w.dtype, device=w.device) + for i in range(8): + shift = 4 * (7 - i) + result[..., 0] |= (w[..., i] & mask) << shift + result = result.view(w.shape[0], -1) + return result + + +def makeup_weights(in_features: int, out_features: int, group_size: int = 128): + # make up qweight + assert out_features % 8 == 0 + qweight = torch.randint(0, + 16, (in_features, out_features // 8, 8), + dtype=torch.int32, + device='cuda') + print(f'-- makeup qweight: shape {qweight.shape}') + print(qweight.view(in_features, -1)) + qweight = i32x8_to_i4x8(qweight) + print(f'-- merge qweight: shape {qweight.shape}') + print(qweight) + + # make up qzeros + assert in_features % group_size == 0 and in_features // group_size >= 1 + qzeros = torch.randint(0, + 16, + (in_features // group_size, out_features // 8, 8), + dtype=torch.int32, + device='cuda') + print(f'-- makeup qzero: shape {qzeros.shape}') + print(qzeros.view(in_features // group_size, -1)) + qzeros = i32x8_to_i4x8(qzeros) + print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}') + + # make up scales + scales = torch.rand((in_features // group_size, out_features), + dtype=torch.float16, + device='cuda') + print(f'-- makeup scales: shape {scales.shape}\n{scales}') + return qweight, qzeros, scales + + def dequantize(qweight, qzeros, scales, group_size: int = 128): _qweight = unpack_awq_gemm(qweight) _qzeros = unpack_awq_gemm(qzeros) @@ -32,18 +79,22 @@ def load_specified_linear_weights(): for key in keys: tensors[key] = f.get_tensor(prefix + key) - return tensors + return tensors['qweight'], tensors['qzeros'], tensors['scales'] -tensors = load_specified_linear_weights() -qweight, qzeros, scales = tensors['qweight'], tensors['qzeros'], tensors[ - 'scales'] +# qweight, qzeros, scales = load_specified_linear_weights() +# in_features = qweight.shape[0] +# out_features = qweight.shape[1] * 8 group_size = 128 -in_features = qweight.shape[0] -out_features = qweight.shape[1] * 8 +batch_size = 16384 +in_features = 16384 +out_features = 
16384 +qweight, qzeros, scales = makeup_weights(in_features, out_features, group_size) -x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) +x = torch.randn((batch_size, in_features), + device=qweight.device, + dtype=torch.float16) weight = dequantize(qweight, qzeros, scales, group_size) print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') @@ -68,6 +119,14 @@ def load_specified_linear_weights(): res = model(x) print(f'tm.linear.res: {res}') -max_diff = torch.max(abs(ref_res - res)) -ave_diff = torch.sum(abs(ref_res - res)) / ref_res.numel() -print(f'max_diff {max_diff}, ave_diff {ave_diff}') +abs_diff = torch.abs(res - ref_res).float() +rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res)) +rtol = 0.01 +atol = 0.0001 +outliers = abs_diff > atol + rtol * torch.abs(ref_res) +abs_diff = torch.sum(abs_diff) / abs_diff.numel() +rel_diff = torch.sum(rel_diff) / rel_diff.numel() +outliers = torch.sum(outliers) / outliers.shape[0] +print(f'abs_diff {abs_diff:4f}, ' + f'rel_diff {rel_diff:4f}, ' + f'outliers {outliers:4f}') diff --git a/turbomind/linear.py b/turbomind/linear.py index e888248..1e9d221 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -14,10 +14,10 @@ try: import _turbomind_ext - TURBOMIND_KERNELS_INSTALLED = True + TURBOMIND_EXTENSION_INSTALLED = True except Exception as e: logging.error(f'_turbomind_ext is not installed: {e}') - TURBOMIND_KERNELS_INSTALLED = False + TURBOMIND_EXTENSION_INSTALLED = False def pad_out_dims(x: torch.Tensor, dims: int): @@ -95,17 +95,17 @@ def __init__(self, ), ) - # if bias: - # self.register_buffer( - # 'bias', - # torch.zeros( - # (out_features), - # dtype=torch.float16, - # device=device, - # ), - # ) - # else: - # self.bias = None + if bias: + self.register_buffer( + 'bias', + torch.zeros( + (out_features), + dtype=torch.float16, + device=device, + ), + ) + else: + self.bias = None self.linear = _turbomind_ext.Linear(self.in_features, self.out_features, self.w_bit, @@ -149,7 +149,7 @@ def is_16xx_series(name): @torch.no_grad() def forward(self, x): - assert TURBOMIND_KERNELS_INSTALLED, ( + assert TURBOMIND_EXTENSION_INSTALLED, ( 'turbomind kernels are not installed. 
' 'Please perform `pip install turbomind` to install turbomind ' 'kernels.') @@ -165,6 +165,8 @@ def forward(self, x): ) self.linear.forward(x, out) out = torch.from_dlpack(out) + if self.bias is not None: + out.add_(self.bias) return out.view(out_shape) def __call__(self, x: torch.Tensor): From a4ecf6a1c41fc27b332b33dbfc353910d5d53dda Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 14:40:38 +0800 Subject: [PATCH 16/18] update --- example/test_linear.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 025f60c..c2ad4db 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -58,14 +58,15 @@ def makeup_weights(in_features: int, out_features: int, group_size: int = 128): def dequantize(qweight, qzeros, scales, group_size: int = 128): _qweight = unpack_awq_gemm(qweight) _qzeros = unpack_awq_gemm(qzeros) - _qzeros = _qzeros.half() - weight = _qweight.clone().half() + _qzeros = _qzeros.float() + _qweight = _qweight.float() + _scales = scales.float() for i in range(qzeros.shape[0]): start = i * group_size end = start + group_size - weight[start:end] = (weight[start:end, :] - - _qzeros[i:i + 1, :]) * scales[i:i + 1, :] - return weight + _qweight[start:end] = (_qweight[start:end, :] - + _qzeros[i:i + 1, :]) * _scales[i:i + 1, :] + return _qweight.half() def load_specified_linear_weights(): From 7948aee7ef1276ccae1002f74065fa8eab207b99 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 14:48:09 +0800 Subject: [PATCH 17/18] add cudaStream_t param in Linear::Forward --- src/turbomind/api/python/linear.cc | 19 +++++++++---------- src/turbomind/api/python/linear.h | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index 6641110..d05c7d4 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -36,15 +36,15 @@ struct Linear::Impl { workspace_.barriers_size = gemm::Gemm::kBarriersSize; workspace_.partials_size = gemm::Gemm::kPartialsSize; - cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); - cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_); - cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, 0); + cudaMallocAsync(&workspace_.partials, workspace_.partials_size, 0); + cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, 0); } ~Impl() { - cudaFreeAsync(workspace_.barriers, stream_); - cudaFreeAsync(workspace_.partials, stream_); + cudaFreeAsync(workspace_.barriers, 0); + cudaFreeAsync(workspace_.partials, 0); workspace_ = {}; check_cuda_error(cudaFree(scales_zeros_)); } @@ -60,7 +60,7 @@ struct Linear::Impl { check_cuda_error(cudaFree(workspace)); } - void forward(const Tensor& in, Tensor& out) { + void forward(const Tensor& in, Tensor& out, cudaStream_t stream) { TM_CHECK(in.type == TYPE_FP16 && out.type == TYPE_FP16); TM_CHECK(in.shape.size() == 2 && in.shape[1] == input_dims_); TM_CHECK(out.shape.size() == 2 && out.shape[0] == in.shape[0] && out.shape[1] == output_dims_); @@ -106,7 +106,7 @@ struct Linear::Impl { const_cast(out.data), c_desc, workspace_, - stream_); + stream); if (ec) { printf("%s: %d", __PRETTY_FUNCTION__, ec); @@ -266,7 +266,6 @@ struct Linear::Impl { gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; gemm::Workspace workspace_; - cudaStream_t stream_{}; 
size_t input_dims_; size_t output_dims_; @@ -288,8 +287,8 @@ void Linear::post_init(std::shared_ptr qweight, const Tensor& scales, co impl_->post_init(qweight, scales, qzeros, simt); } -void Linear::forward(const Tensor& in, Tensor& out) +void Linear::forward(const Tensor& in, Tensor& out, cudaStream_t stream) { - impl_->forward(in, out); + impl_->forward(in, out, stream); } } // namespace turbomind diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h index 11e81eb..8b611f2 100644 --- a/src/turbomind/api/python/linear.h +++ b/src/turbomind/api/python/linear.h @@ -28,7 +28,7 @@ class Linear { Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt); - void forward(const Tensor& in, Tensor& out); + void forward(const Tensor& in, Tensor& out, cudaStream_t stream = nullptr); ~Linear() {} private: From 5904bf86f43467655d854db083e3344d39db3efa Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 15:46:17 +0800 Subject: [PATCH 18/18] update --- example/test_linear.py | 6 +++++- src/turbomind/api/python/bind.cpp | 9 +++++---- turbomind/linear.py | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index c2ad4db..8ccbbbc 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -118,7 +118,11 @@ def load_specified_linear_weights(): model.post_init() -res = model(x) +stream = torch.cuda.Stream() +with torch.cuda.stream(stream): + res = model(x) +stream.synchronize() + print(f'tm.linear.res: {res}') abs_diff = torch.abs(res - ref_res).float() rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res)) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 8140b46..5eadc1e 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -329,16 +329,17 @@ PYBIND11_MODULE(_turbomind_ext, m) { .def(py::init([](size_t in_features, size_t out_features, int w_bit, int group_size) { return new turbomind::Linear(in_features, out_features, w_bit, group_size); })) - .def("post_init", [](turbomind::Linear* linear, py::object qweight, py::object scales, py::object qzeros, + .def("post_init", [](turbomind::Linear* self, py::object qweight, py::object scales, py::object qzeros, bool simt){ auto _qweight = TorchTensorToTurbomindTensor(qweight); auto _scales = TorchTensorToTurbomindTensor(scales); auto _qzeros = TorchTensorToTurbomindTensor(qzeros); - linear->post_init(_qweight, *_scales, *_qzeros, simt); + self->post_init(_qweight, *_scales, *_qzeros, simt); }) - .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { + .def("forward", [](turbomind::Linear* self, py::object in, py::object out, int64_t stream_id = 0) { auto _in = TorchTensorToTurbomindTensor(in); auto _out = TorchTensorToTurbomindTensor(out); - return linear->forward(*_in, *_out); + auto stream = reinterpret_cast(stream_id); + return self->forward(*_in, *_out, stream); }); } diff --git a/turbomind/linear.py b/turbomind/linear.py index 1e9d221..a7fea9b 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -163,7 +163,8 @@ def forward(self, x): dtype=torch.float16, device=x.device, ) - self.linear.forward(x, out) + stream = torch.cuda.current_stream() + self.linear.forward(x, out, stream.cuda_stream) out = torch.from_dlpack(out) if self.bias is not None: out.add_(self.bias)
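
For reference, the packing convention used by i32x8_to_i4x8 in example/test_linear.py (patch 15) stores element 0 of every group of eight 4-bit values in the most significant nibble of the packed int32. The round-trip sketch below is illustrative only and assumes it runs alongside that helper; the i4x8_to_i32x8 name is made up here and is not part of the patches:

    import torch

    def i4x8_to_i32x8(packed: torch.Tensor) -> torch.Tensor:
        """Inverse of i32x8_to_i4x8: split each int32 back into 8 nibbles.

        Element i is read from bit offset 4 * (7 - i); masking with 15 after
        the (arithmetic) right shift discards any sign extension.
        """
        out = torch.empty(tuple(packed.shape) + (8, ),
                          dtype=packed.dtype,
                          device=packed.device)
        for i in range(8):
            out[..., i] = (packed >> (4 * (7 - i))) & 15
        return out.view(packed.shape[0], -1)

    # round trip on random 4-bit values, mirroring makeup_weights
    w = torch.randint(0, 16, (4, 2, 8), dtype=torch.int32)
    packed = i32x8_to_i4x8(w)                  # -> shape (4, 2)
    assert torch.equal(i4x8_to_i32x8(packed), w.view(4, -1))

This only checks the MSB-first layout of the test helper; the repo's unpack_awq_gemm may use a different nibble order, so the sketch is not a substitute for it.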
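
The group-wise loop in dequantize reconstructs weight rows as (q - z_g) * s_g, where g indexes groups of group_size rows, and patch 16 moves that arithmetic to float32 before the final cast back to fp16, presumably to keep the reference free of intermediate fp16 rounding. The same math can be written with broadcasting; this is only a sketch of the arithmetic, with _qweight and _qzeros standing for the unpacked tensors of shape (in_features, out_features) and (in_features // group_size, out_features):

    import torch

    def dequantize_broadcast(_qweight, _qzeros, scales, group_size: int = 128):
        # expand per-group zeros/scales to per-row tensors, then apply
        # (w - z) * s in float32 and cast back to fp16, matching the loop
        z = _qzeros.float().repeat_interleave(group_size, dim=0)
        s = scales.float().repeat_interleave(group_size, dim=0)
        return ((_qweight.float() - z) * s).half()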
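
The accuracy check at the end of example/test_linear.py flags an element as an outlier when abs_diff > atol + rtol * |ref_res|, which is the tolerance rule torch.isclose uses. A compact sketch of an equivalent summary follows; unlike the patch it guards the relative-difference denominator against zero and reports outliers as a fraction of all elements, whereas the patch divides the count by outliers.shape[0] (the batch size), so its value can exceed 1:

    import torch

    def summarize_diff(res, ref, rtol: float = 0.01, atol: float = 1e-4):
        abs_diff = (res.float() - ref.float()).abs()
        denom = torch.maximum(res.float().abs(), ref.float().abs())
        rel_diff = abs_diff / denom.clamp_min(1e-12)
        # same predicate as `abs_diff > atol + rtol * torch.abs(ref_res)`
        outliers = ~torch.isclose(res, ref, rtol=rtol, atol=atol)
        return (abs_diff.mean().item(),
                rel_diff.mean().item(),
                outliers.float().mean().item())

    # e.g. print('abs %.6f  rel %.6f  outliers %.6f' % summarize_diff(res, ref_res))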
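
Patches 17 and 18 thread a cudaStream_t through Linear::forward: torch.cuda.Stream.cuda_stream exposes the raw stream handle as a Python integer, bind.cpp receives it as int64_t and reinterpret_casts it back, and 0 selects the default stream. The sketch below drives the extension object directly with an explicit handle, mirroring what turbomind.Linear.forward now does internally; model is assumed to be the wrapper built and post-initialized in example/test_linear.py (in_features = out_features = 16384), and the batch size is made up:

    import torch

    x = torch.randn((16, 16384), dtype=torch.float16, device='cuda')
    out = torch.empty((16, 16384), dtype=torch.float16, device='cuda')

    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())   # order after the allocations above
    model.linear.forward(x, out, s.cuda_stream)  # int64 handle; 0 = default stream
    s.synchronize()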