From 49c64aa5baf637273ab29cbbd5a3ba78fa9974f0 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 2 Sep 2024 13:25:25 +0800 Subject: [PATCH 01/18] check in pre-commit hooks --- .gitignore | 2 +- .pre-commit-config.yaml | 58 +++++++++++ README.md | 2 +- generate.sh | 2 +- requirements.txt | 1 + requirements/build.txt | 2 + requirements/runtime.txt | 1 + requirements/test.txt | 10 ++ setup.py | 157 +++++++++++++++++++++++++++++ src/turbomind/CMakeLists.txt | 2 +- src/turbomind/kernels/gemm/diff.py | 10 +- src/turbomind/kernels/gemm/plot.py | 89 ++++++++++++---- src/turbomind/utils/CMakeLists.txt | 2 +- turbomind/__init__.py | 1 + turbomind/linear.py | 6 +- turbomind/utils.py | 3 +- turbomind/version.py | 28 +++++ 17 files changed, 344 insertions(+), 32 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 requirements.txt create mode 100644 requirements/build.txt create mode 100644 requirements/runtime.txt create mode 100644 requirements/test.txt create mode 100644 setup.py create mode 100644 turbomind/version.py diff --git a/.gitignore b/.gitignore index 1aac461..2290706 100644 --- a/.gitignore +++ b/.gitignore @@ -33,4 +33,4 @@ .cache -/build \ No newline at end of file +/build diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..61d3b10 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,58 @@ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + name: yapf + description: 'Formatter for Python code' + entry: yapf + language: python + args: ['-i', '--style={based_on_style: pep8, column_limit: 79}'] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"] + + - repo: https://github.com/myint/docformatter + rev: v1.4 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-copyright + args: ["turbomind"] diff --git a/README.md b/README.md index 4a0f313..5f671f2 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# turbomind \ No newline at end of file +# turbomind diff --git a/generate.sh b/generate.sh index 46939e9..7ed3168 100755 --- a/generate.sh +++ b/generate.sh @@ -9,4 +9,4 @@ cmake -G Ninja .. 
\ -DFETCHCONTENT_UPDATES_DISCONNECTED=ON \ -DLMDEPLOY_ASAN_ENABLE=OFF \ -DLMDEPLOY_UBSAN_ENABLE=OFF \ - -DCMAKE_CUDA_ARCHITECTURES="80-real" \ No newline at end of file + -DCMAKE_CUDA_ARCHITECTURES="80-real" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e7b244f --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +-r runtime.txt diff --git a/requirements/build.txt b/requirements/build.txt new file mode 100644 index 0000000..28c4a7a --- /dev/null +++ b/requirements/build.txt @@ -0,0 +1,2 @@ +pybind11<=2.13.1 +setuptools diff --git a/requirements/runtime.txt b/requirements/runtime.txt new file mode 100644 index 0000000..12c6d5d --- /dev/null +++ b/requirements/runtime.txt @@ -0,0 +1 @@ +torch diff --git a/requirements/test.txt b/requirements/test.txt new file mode 100644 index 0000000..d06440a --- /dev/null +++ b/requirements/test.txt @@ -0,0 +1,10 @@ +allure-pytest +coverage +pynvml +pytest +pytest-assume +pytest-order +pytest-rerunfailures +pytest-sugar +pytest-xdist +pyyaml diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4061558 --- /dev/null +++ b/setup.py @@ -0,0 +1,157 @@ +import os +import re +import sys + +from setuptools import find_packages, setup + +pwd = os.path.dirname(__file__) +version_file = 'lmdeploy/version.py' + + +def readme(): + with open(os.path.join(pwd, 'README.md'), encoding='utf-8') as f: + content = f.read() + return content + + +def get_version(): + with open(os.path.join(pwd, version_file), 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def check_ext_modules(): + if os.path.exists(os.path.join(pwd, 'turbomind', 'lib')): + return True + return False + + +def get_cuda_pkgs(): + arg_name = '--cuda=' + arg_value = None + for arg in sys.argv[1:]: + if arg.startswith(arg_name): + arg_value = arg[len(arg_name):] + sys.argv.remove(arg) + break + + cuda_pkgs = [] + if arg_value == '11': + cuda_pkgs = [ + 'nvidia-nccl-cu11', 'nvidia-cuda-runtime-cu11', + 'nvidia-cublas-cu11', 'nvidia-curand-cu11' + ] + elif arg_value == '12': + cuda_pkgs = [ + 'nvidia-nccl-cu12', 'nvidia-cuda-runtime-cu12', + 'nvidia-cublas-cu12', 'nvidia-curand-cu12' + ] + return cuda_pkgs + + +cuda_pkgs = get_cuda_pkgs() + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a file but strips specific + versioning information. 
+ + Args: + fname (str): path to the file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if os.path.exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + packages += cuda_pkgs + return packages + + +if __name__ == '__main__': + setup(name='turbomind', + version=get_version(), + description='CUDA kernels used for LLM Quantization', + long_description=readme(), + long_description_content_type='text/markdown', + author='OpenMMLab', + author_email='openmmlab@gmail.com', + packages=find_packages(exclude=()), + include_package_data=True, + setup_requires=parse_requirements('requirements/build.txt'), + tests_require=parse_requirements('requirements/test.txt'), + install_requires=parse_requirements('requirements/runtime.txt'), + extras_require={ + 'all': parse_requirements('requirements.txt'), + }, + has_ext_modules=check_ext_modules, + classifiers=[ + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + ]) diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index 289b362..ed840d6 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -2,4 +2,4 @@ add_subdirectory(utils) add_subdirectory(kernels/gemm) -add_subdirectory(api/python) \ No newline at end of file +add_subdirectory(api/python) diff --git a/src/turbomind/kernels/gemm/diff.py b/src/turbomind/kernels/gemm/diff.py index 01824cf..e233c46 100644 --- a/src/turbomind/kernels/gemm/diff.py +++ b/src/turbomind/kernels/gemm/diff.py @@ -1,7 +1,9 @@ 
-import fire import json + +import fire import prettytable + def get_tag(state, tag): for x in state['summaries']: if x['tag'] == tag: @@ -9,7 +11,7 @@ def get_tag(state, tag): def main(path_a: str, path_b: str): - + with open(path_a, 'r') as f: a = json.load(f) with open(path_b, 'r') as f: @@ -38,7 +40,7 @@ def main(path_a: str, path_b: str): x = by_bs.get(bs, [0, 0, 0]) by_bs[bs] = x[0] + 1, x[1] + flops, x[2] + diff idx_4 = int(idx[4:]) % 4 - x = by_idx_4.get(idx_4, [0,0,0]) + x = by_idx_4.get(idx_4, [0, 0, 0]) by_idx_4[idx_4] = x[0] + 1, x[1] + flops, x[2] + diff by_idx_tab = prettytable.PrettyTable(['idx', 'flops', 'diff']) @@ -67,4 +69,4 @@ def main(path_a: str, path_b: str): if __name__ == '__main__': - fire.Fire(main) \ No newline at end of file + fire.Fire(main) diff --git a/src/turbomind/kernels/gemm/plot.py b/src/turbomind/kernels/gemm/plot.py index 9c08414..c7c162e 100644 --- a/src/turbomind/kernels/gemm/plot.py +++ b/src/turbomind/kernels/gemm/plot.py @@ -4,50 +4,99 @@ a100 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [30.975, 61.083, 104.805, 134.369, 152.262, 173.313, 187.04, 210.705, 212.336, 225.996, 241.908, 246.485], - 'diff': [203.433, 200.315, 157.897, 120.865, 90.145, 68.087, 40.362, 36.08, 12.763, 6.857, 1.694, 3.056] + 'flops': [ + 30.975, 61.083, 104.805, 134.369, 152.262, 173.313, 187.04, 210.705, + 212.336, 225.996, 241.908, 246.485 + ], + 'diff': [ + 203.433, 200.315, 157.897, 120.865, 90.145, 68.087, 40.362, 36.08, + 12.763, 6.857, 1.694, 3.056 + ] } h100 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [40.723, 80.707, 145.495, 188.742, 216.633, 252.697, 282.116, 307.794, 330.382, 368.38, 394.966, 395.913], - 'diff': [151.567, 149.366, 122.284, 93.14, 68.945, 41.166, 18.074, -1.506, -15.628, -29.799, -36.413, -37.346] + 'flops': [ + 40.723, 80.707, 145.495, 188.742, 216.633, 252.697, 282.116, 307.794, + 330.382, 368.38, 394.966, 395.913 + ], + 'diff': [ + 151.567, 149.366, 122.284, 93.14, 68.945, 41.166, 18.074, -1.506, + -15.628, -29.799, -36.413, -37.346 + ] } -rtx4090 ={ +rtx4090 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [18.685, 37.152, 72.702, 104.168, 125.963, 139.206, 146.199, 154.541, 157.06, 164.551, 167.729, 169.425], - 'diff': [242.372, 247.154, 235.135, 230.773, 203.158, 131.433, 81.746, 45.646, 17.447, 10.195, 2.411, 2.135] + 'flops': [ + 18.685, 37.152, 72.702, 104.168, 125.963, 139.206, 146.199, 154.541, + 157.06, 164.551, 167.729, 169.425 + ], + 'diff': [ + 242.372, 247.154, 235.135, 230.773, 203.158, 131.433, 81.746, 45.646, + 17.447, 10.195, 2.411, 2.135 + ] } v100 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [17.179, 31.255, 46.443, 44.769, 56.847, 59.087, 62.678, 64.258, 65.164, 66.699, 69.755, 71.13], - 'diff': [260.359, 231.003, 156.902, 142.953, 134.385, 97.611, 54.962, 21.085, -5.455, -14.298, -15.295, -14.578] + 'flops': [ + 17.179, 31.255, 46.443, 44.769, 56.847, 59.087, 62.678, 64.258, 65.164, + 66.699, 69.755, 71.13 + ], + 'diff': [ + 260.359, 231.003, 156.902, 142.953, 134.385, 97.611, 54.962, 21.085, + -5.455, -14.298, -15.295, -14.578 + ] } rtx2080 = { 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096], - 'flops': [10.958, 21.416, 26.624, 26.946, 28.604, 30.342, 31.97, 32.201, 32.424, 32.982, 34.231], - 'diff': [271.225, 265.331, 121.867, 51.776, 21.793, 40.907, 15.176, 23.986, 10.364, -0.066, -2.167] + 'flops': [ + 10.958, 21.416, 26.624, 26.946, 28.604, 30.342, 31.97, 32.201, 32.424, + 32.982, 
34.231 + ], + 'diff': [ + 271.225, 265.331, 121.867, 51.776, 21.793, 40.907, 15.176, 23.986, + 10.364, -0.066, -2.167 + ] } -fig, ax1 = plt.subplots(1, 1, figsize=(16,9), dpi=80) +fig, ax1 = plt.subplots(1, 1, figsize=(16, 9), dpi=80) ax1.set_xscale('log') ax1.set_xlabel('linear') -ax1.plot(a100['bs'], np.array(a100['diff']) * 0.01 + 1, marker='o', label='A100', color='tab:red') -ax1.plot(h100['bs'], np.array(h100['diff']) * 0.01 + 1, marker='*', label='H100', color='tab:blue') -ax1.plot(v100['bs'], np.array(v100['diff']) * 0.01 + 1, marker='p', label='V100', color='tab:orange') -ax1.plot(rtx4090['bs'], np.array(rtx4090['diff']) * 0.01 + 1, marker='h', label='4090', color='tab:green') -ax1.plot(rtx2080['bs'], np.array(rtx2080['diff']) * 0.01 + 1, marker='s', label='2080', color='tab:purple') +ax1.plot(a100['bs'], + np.array(a100['diff']) * 0.01 + 1, + marker='o', + label='A100', + color='tab:red') +ax1.plot(h100['bs'], + np.array(h100['diff']) * 0.01 + 1, + marker='*', + label='H100', + color='tab:blue') +ax1.plot(v100['bs'], + np.array(v100['diff']) * 0.01 + 1, + marker='p', + label='V100', + color='tab:orange') +ax1.plot(rtx4090['bs'], + np.array(rtx4090['diff']) * 0.01 + 1, + marker='h', + label='4090', + color='tab:green') +ax1.plot(rtx2080['bs'], + np.array(rtx2080['diff']) * 0.01 + 1, + marker='s', + label='2080', + color='tab:purple') ax1.set_ylabel('relative speed (vs cublasGemmEx)') ax1.tick_params(axis='x', rotation=0, labelsize=12) ax1.set_xlabel('batch size') ax1.set_xticks(a100['bs']) -ax1.set_xticklabels(a100['bs'], fontdict={'fontsize':10}) +ax1.set_xticklabels(a100['bs'], fontdict={'fontsize': 10}) ax1.grid(alpha=.25, axis='y') - ax2 = ax1.twinx() ax2.plot(a100['bs'], a100['flops'], alpha=.4, color='tab:red') ax2.plot(h100['bs'], h100['flops'], alpha=.4, color='tab:blue') @@ -62,4 +111,4 @@ ax1.legend() fig.tight_layout() -plt.savefig('fig.jpg') \ No newline at end of file +plt.savefig('fig.jpg') diff --git a/src/turbomind/utils/CMakeLists.txt b/src/turbomind/utils/CMakeLists.txt index 05b1011..06d7749 100644 --- a/src/turbomind/utils/CMakeLists.txt +++ b/src/turbomind/utils/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(parser STATIC parser.cc) -set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE ON) \ No newline at end of file +set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/turbomind/__init__.py b/turbomind/__init__.py index e69de29..ef101fe 100644 --- a/turbomind/__init__.py +++ b/turbomind/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/turbomind/linear.py b/turbomind/linear.py index 1ec6419..0b322a1 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -1,6 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch -from . import utils + +# from . import utils + class Linear(torch.nn.Module): pass - diff --git a/turbomind/utils.py b/turbomind/utils.py index da1adc5..39d9397 100644 --- a/turbomind/utils.py +++ b/turbomind/utils.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import List + import torch @@ -46,4 +47,4 @@ def pack_u4_row(x: torch.Tensor) -> torch.Tensor: a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) for t in reversed(xs): a = (a << 4) | t - return a.squeeze(dim=-1) \ No newline at end of file + return a.squeeze(dim=-1) diff --git a/turbomind/version.py b/turbomind/version.py new file mode 100644 index 0000000..20e5e73 --- /dev/null +++ b/turbomind/version.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from typing import Tuple + +__version__ = '0.0.1' +short_version = __version__ + + +def parse_version_info(version_str: str) -> Tuple: + """Parse version from a string. + + Args: + version_str (str): A string represents a version info. + + Returns: + tuple: A sequence of integer and string represents version. + """ + _version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + _version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + _version_info.append(int(patch_version[0])) + _version_info.append(f'rc{patch_version[1]}') + return tuple(_version_info) + + +version_info = parse_version_info(__version__) From a79f44429648ea8817e7d76dac71dc3e97d8397c Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 2 Sep 2024 21:31:54 +0800 Subject: [PATCH 02/18] remove diff.py and plot.py from src/turbomind --- src/turbomind/kernels/gemm/diff.py | 72 ------------------ src/turbomind/kernels/gemm/plot.py | 114 ----------------------------- 2 files changed, 186 deletions(-) delete mode 100644 src/turbomind/kernels/gemm/diff.py delete mode 100644 src/turbomind/kernels/gemm/plot.py diff --git a/src/turbomind/kernels/gemm/diff.py b/src/turbomind/kernels/gemm/diff.py deleted file mode 100644 index e233c46..0000000 --- a/src/turbomind/kernels/gemm/diff.py +++ /dev/null @@ -1,72 +0,0 @@ -import json - -import fire -import prettytable - - -def get_tag(state, tag): - for x in state['summaries']: - if x['tag'] == tag: - return float(x['data'][0]['value']) - - -def main(path_a: str, path_b: str): - - with open(path_a, 'r') as f: - a = json.load(f) - with open(path_b, 'r') as f: - b = json.load(f) - states_a = a['benchmarks'][0]['states'] - states_b = b['benchmarks'][0]['states'] - by_idx = {} - by_idx_4 = {} - by_bs = {} - table = prettytable.PrettyTable(['idx', 'bs', 'tp', 'flops', 'diff']) - for u in states_a: - for v in states_b: - if u['name'] == v['name']: - break - assert u['name'] == v['name'] - tag = 'nv/cold/bw/item_rate' - flops_a = get_tag(u, tag) - flops_b = get_tag(v, tag) - ratio = flops_a / flops_b - flops = round(flops_a / 1e12, 3) - diff = round((ratio - 1.0) * 100, 3) - idx, bs, tp = u['name'].split(' ')[1:] - table.add_row([idx, bs, tp, flops, diff]) - x = by_idx.get(idx, [0, 0, 0]) - by_idx[idx] = x[0] + 1, x[1] + flops, x[2] + diff - x = by_bs.get(bs, [0, 0, 0]) - by_bs[bs] = x[0] + 1, x[1] + flops, x[2] + diff - idx_4 = int(idx[4:]) % 4 - x = by_idx_4.get(idx_4, [0, 0, 0]) - by_idx_4[idx_4] = x[0] + 1, x[1] + flops, x[2] + diff - - by_idx_tab = prettytable.PrettyTable(['idx', 'flops', 'diff']) - for k, (cnt, flops, diff) in by_idx.items(): - flops = round(flops / cnt, 3) - diff = round(diff / cnt, 3) - by_idx_tab.add_row([k, flops, diff]) - - by_bs_tab = prettytable.PrettyTable(['idx', 'flops', 'diff']) - for k, (cnt, flops, diff) in by_bs.items(): - flops = round(flops / cnt, 3) - diff = round(diff / cnt, 3) - by_bs_tab.add_row([k, flops, diff]) - - by_idx_4_tab = prettytable.PrettyTable(['idx_4', 'flops', 'diff']) - for k, (cnt, flops, diff) in by_idx_4.items(): - flops = round(flops / cnt, 3) - diff = round(diff / cnt, 3) - by_idx_4_tab.add_row([k, flops, diff]) - - print(table) - - print(by_idx_tab) - print(by_idx_4_tab) - print(by_bs_tab) - - -if __name__ == '__main__': - fire.Fire(main) diff --git a/src/turbomind/kernels/gemm/plot.py b/src/turbomind/kernels/gemm/plot.py deleted file mode 100644 index c7c162e..0000000 --- a/src/turbomind/kernels/gemm/plot.py +++ /dev/null @@ -1,114 +0,0 @@ -# import pandas as pd -import 
matplotlib.pyplot as plt -import numpy as np - -a100 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 30.975, 61.083, 104.805, 134.369, 152.262, 173.313, 187.04, 210.705, - 212.336, 225.996, 241.908, 246.485 - ], - 'diff': [ - 203.433, 200.315, 157.897, 120.865, 90.145, 68.087, 40.362, 36.08, - 12.763, 6.857, 1.694, 3.056 - ] -} - -h100 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 40.723, 80.707, 145.495, 188.742, 216.633, 252.697, 282.116, 307.794, - 330.382, 368.38, 394.966, 395.913 - ], - 'diff': [ - 151.567, 149.366, 122.284, 93.14, 68.945, 41.166, 18.074, -1.506, - -15.628, -29.799, -36.413, -37.346 - ] -} - -rtx4090 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 18.685, 37.152, 72.702, 104.168, 125.963, 139.206, 146.199, 154.541, - 157.06, 164.551, 167.729, 169.425 - ], - 'diff': [ - 242.372, 247.154, 235.135, 230.773, 203.158, 131.433, 81.746, 45.646, - 17.447, 10.195, 2.411, 2.135 - ] -} - -v100 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096, 8192], - 'flops': [ - 17.179, 31.255, 46.443, 44.769, 56.847, 59.087, 62.678, 64.258, 65.164, - 66.699, 69.755, 71.13 - ], - 'diff': [ - 260.359, 231.003, 156.902, 142.953, 134.385, 97.611, 54.962, 21.085, - -5.455, -14.298, -15.295, -14.578 - ] -} - -rtx2080 = { - 'bs': [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 4096], - 'flops': [ - 10.958, 21.416, 26.624, 26.946, 28.604, 30.342, 31.97, 32.201, 32.424, - 32.982, 34.231 - ], - 'diff': [ - 271.225, 265.331, 121.867, 51.776, 21.793, 40.907, 15.176, 23.986, - 10.364, -0.066, -2.167 - ] -} - -fig, ax1 = plt.subplots(1, 1, figsize=(16, 9), dpi=80) -ax1.set_xscale('log') -ax1.set_xlabel('linear') -ax1.plot(a100['bs'], - np.array(a100['diff']) * 0.01 + 1, - marker='o', - label='A100', - color='tab:red') -ax1.plot(h100['bs'], - np.array(h100['diff']) * 0.01 + 1, - marker='*', - label='H100', - color='tab:blue') -ax1.plot(v100['bs'], - np.array(v100['diff']) * 0.01 + 1, - marker='p', - label='V100', - color='tab:orange') -ax1.plot(rtx4090['bs'], - np.array(rtx4090['diff']) * 0.01 + 1, - marker='h', - label='4090', - color='tab:green') -ax1.plot(rtx2080['bs'], - np.array(rtx2080['diff']) * 0.01 + 1, - marker='s', - label='2080', - color='tab:purple') -ax1.set_ylabel('relative speed (vs cublasGemmEx)') -ax1.tick_params(axis='x', rotation=0, labelsize=12) -ax1.set_xlabel('batch size') -ax1.set_xticks(a100['bs']) -ax1.set_xticklabels(a100['bs'], fontdict={'fontsize': 10}) -ax1.grid(alpha=.25, axis='y') - -ax2 = ax1.twinx() -ax2.plot(a100['bs'], a100['flops'], alpha=.4, color='tab:red') -ax2.plot(h100['bs'], h100['flops'], alpha=.4, color='tab:blue') -ax2.plot(v100['bs'], v100['flops'], alpha=.4, color='tab:orange') -ax2.plot(rtx4090['bs'], rtx4090['flops'], alpha=.4, color='tab:green') -ax2.plot(rtx2080['bs'], rtx2080['flops'], alpha=.4, color='tab:purple') -ax2.set_ylabel('TFLOPS') -# ax2.set_xticklabels(x[::60], rotation=90, fontdict={'fontsize':10}) - -# ax2.set_yscale(0.01) - -ax1.legend() - -fig.tight_layout() -plt.savefig('fig.jpg') From 170fa36aae02c2cfc43d99b634dbda204c3017f9 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 3 Sep 2024 17:20:23 +0800 Subject: [PATCH 03/18] update setup.py --- .gitignore | 9 +++++++++ requirements.txt | 3 ++- setup.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2290706..7d4768c 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,12 @@ .cache /build + +# Byte-compiled / optimized / 
DLL files +__pycache__/ +.vscode/ + +# Distribution / packaging +.eggs/ +wheels/ +*.egg-info/ diff --git a/requirements.txt b/requirements.txt index e7b244f..1b0eb4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ --r runtime.txt +-r requirements/build.txt +-r requirements/runtime.txt diff --git a/setup.py b/setup.py index 4061558..1b238db 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup pwd = os.path.dirname(__file__) -version_file = 'lmdeploy/version.py' +version_file = 'turbomind/version.py' def readme(): From 7ee84e6801bb4f33397bcf07659e5f982b4cac11 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 3 Sep 2024 21:03:07 +0800 Subject: [PATCH 04/18] add autoaweq inference as a test example --- example/awq/__init__.py | 0 example/awq/_config.py | 109 +++++++++++++ example/awq/act.py | 12 ++ example/awq/auto.py | 64 ++++++++ example/awq/base.py | 244 +++++++++++++++++++++++++++++ example/awq/gemm.py | 287 +++++++++++++++++++++++++++++++++++ example/awq/llama.py | 81 ++++++++++ example/awq/module.py | 60 ++++++++ example/awq/packing_utils.py | 97 ++++++++++++ example/awq/utils.py | 110 ++++++++++++++ example/generate.py | 44 ++++++ 11 files changed, 1108 insertions(+) create mode 100644 example/awq/__init__.py create mode 100644 example/awq/_config.py create mode 100644 example/awq/act.py create mode 100644 example/awq/auto.py create mode 100644 example/awq/base.py create mode 100644 example/awq/gemm.py create mode 100644 example/awq/llama.py create mode 100644 example/awq/module.py create mode 100644 example/awq/packing_utils.py create mode 100644 example/awq/utils.py create mode 100644 example/generate.py diff --git a/example/awq/__init__.py b/example/awq/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/example/awq/_config.py b/example/awq/_config.py new file mode 100644 index 0000000..a966e5e --- /dev/null +++ b/example/awq/_config.py @@ -0,0 +1,109 @@ +import json +import os +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +from transformers.utils.hub import PushToHubMixin, cached_file + + +@dataclass +class AwqConfig(PushToHubMixin): + quant_method: str = field(default='awq') + zero_point: bool = field(default=True) + q_group_size: int = field(default=128) + w_bit: int = field(default=4) + version: str = field(default='gemm') + config_file_name = 'config.json' + modules_to_not_convert: Optional[List] = None + + @classmethod + def from_dict(cls, quant_config: Dict = {}): + if not quant_config: + quant_config = cls() + else: + quant_config = cls(**quant_config) + quant_config.version = quant_config.version.lower() + + return quant_config + + @classmethod + def from_pretrained(cls, save_dir: str, **kwargs): + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + resume_download = kwargs.pop('resume_download', False) + proxies = kwargs.pop('proxies', None) + local_files_only = kwargs.pop('local_files_only', False) + use_auth_token = kwargs.pop('use_auth_token', None) + revision = kwargs.pop('revision', None) + subfolder = kwargs.pop('subfolder', None) + commit_hash = kwargs.pop('_commit_hash', None) + + if os.path.isdir(save_dir): # Local + resolved_config_file = os.path.join(save_dir, cls.config_file_name) + else: # Remote + resolved_config_file = cached_file( + save_dir, + cls.config_file_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + 
use_auth_token=use_auth_token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, + ) + + quant_config = None + if os.path.exists(resolved_config_file): + with open(resolved_config_file, 'r', encoding='utf-8') as file: + loaded_config = json.loads(file.read()) + + quant_config = loaded_config.get('quantization_config') + + if quant_config is not None: + awq_config = cls.from_transformers_dict(cls, quant_config) + quant_config = cls(**awq_config) + + if quant_config is None: + quant_config = cls() + + return quant_config + + def to_dict(self): + return { + 'zero_point': self.zero_point, + 'q_group_size': self.q_group_size, + 'w_bit': self.w_bit, + 'version': self.version, + 'modules_to_not_convert': self.modules_to_not_convert, + } + + def to_transformers_dict(self): + return { + 'quant_method': self.quant_method, + 'zero_point': self.zero_point, + 'group_size': self.q_group_size, + 'bits': self.w_bit, + 'version': self.version.lower(), + 'modules_to_not_convert': self.modules_to_not_convert, + } + + def from_transformers_dict(self, transformers_dict: Dict): + return { + 'quant_method': + transformers_dict.get('quant_method'), + 'zero_point': + transformers_dict.get('zero_point'), + 'q_group_size': + transformers_dict.get('group_size'), + 'w_bit': + transformers_dict.get('bits'), + 'version': + transformers_dict.get('version'), + 'modules_to_not_convert': + transformers_dict.get('modules_to_not_convert'), + } diff --git a/example/awq/act.py b/example/awq/act.py new file mode 100644 index 0000000..59ca7c4 --- /dev/null +++ b/example/awq/act.py @@ -0,0 +1,12 @@ +import torch.nn as nn + + +class ScaledActivation(nn.Module): + + def __init__(self, module, scales): + super().__init__() + self.act = module + self.scales = nn.Parameter(scales.data) + + def forward(self, x): + return self.act(x) / self.scales.view(1, 1, -1).to(x.device) diff --git a/example/awq/auto.py b/example/awq/auto.py new file mode 100644 index 0000000..a18db0c --- /dev/null +++ b/example/awq/auto.py @@ -0,0 +1,64 @@ +import logging +import os + +from transformers import AutoConfig + +from .base import BaseAWQForCausalLM +from .llama import LlamaAWQForCausalLM + +AWQ_CAUSAL_LM_MODEL_MAP = { + 'llama': LlamaAWQForCausalLM, +} + + +def check_and_get_model_type(model_dir, **model_init_kwargs): + config = AutoConfig.from_pretrained(model_dir, + trust_remote_code=True, + **model_init_kwargs) + if config.model_type not in AWQ_CAUSAL_LM_MODEL_MAP.keys(): + raise TypeError(f"{config.model_type} isn't supported yet.") + model_type = config.model_type + return model_type + + +class AutoAWQForCausalLM: + + def __init__(self): + raise EnvironmentError( + 'You must instantiate AutoAWQForCausalLM with\n' + 'AutoAWQForCausalLM.from_quantized or AutoAWQForCausalLM.' + 'from_pretrained') + + @classmethod + def from_quantized( + self, + quant_path, + max_seq_len=2048, + fuse_layers=True, + batch_size=1, + device_map='balanced', + max_memory=None, + offload_folder=None, + download_kwargs=None, + **config_kwargs, + ) -> BaseAWQForCausalLM: + os.environ['AWQ_BATCH_SIZE'] = str(batch_size) + model_type = check_and_get_model_type(quant_path) + + if config_kwargs.get('max_new_tokens') is not None: + max_seq_len = config_kwargs['max_new_tokens'] + logging.warning( + 'max_new_tokens argument is deprecated... 
gracefully ' + 'setting max_seq_len=max_new_tokens.') + + return AWQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized( + quant_path, + model_type, + max_seq_len, + fuse_layers=fuse_layers, + device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + download_kwargs=download_kwargs, + **config_kwargs, + ) diff --git a/example/awq/base.py b/example/awq/base.py new file mode 100644 index 0000000..9a815c4 --- /dev/null +++ b/example/awq/base.py @@ -0,0 +1,244 @@ +import gc +from typing import Dict, Union + +import torch +import torch.nn as nn +import transformers +from accelerate.big_modeling import (init_empty_weights, + load_checkpoint_and_dispatch) +from tqdm import tqdm +from transformers import AutoConfig, PretrainedConfig, PreTrainedModel +from typing_extensions import Annotated, Doc + +from ._config import AwqConfig +from .act import ScaledActivation +from .gemm import WQLinear_GEMM +from .module import (exclude_layers_to_not_quantize, get_named_linears, + set_op_by_name) + +# from turbomind import Linear +# from turbomind.utils import turbomind_post_init + +# Since we support different `AutoModelForxxx` from transformers +# we need to define a custom mapping dict as below: +TRANSFORMERS_AUTO_MAPPING_DICT = { + 'llama': 'AutoModelForCausalLM', +} + + +class BaseAWQForCausalLM(nn.Module): + + def __init__( + self, + model, + model_type, + is_quantized, + config, + quant_config, + ): + """The base model for all AutoAWQ models. + + Args: + model: The pretrained or quantized model. + model_type: The model type, found in config.json. + is_quantized: Indicates if the current model is quantized + config: The config of the model. + quant_config: The quantization config of the model. + """ + super().__init__() + self.model: PreTrainedModel = model + self.model_type: str = model_type + self.is_quantized: bool = is_quantized + self.search_result = None + self.config: PretrainedConfig = config + self.quant_config: AwqConfig = quant_config + + def to(self, device: Annotated[str, + Doc('The device to move your model to.')]): + """A utility function for moving the model to a device.""" + return self.model.to(device) + + def forward(self, *args, **kwargs): + """A forward function that mimics the torch forward.""" + return self.model(*args, **kwargs) + + def generate(self, *args, **kwargs): + """A generate function that mimics the HF generate function.""" + with torch.inference_mode(): + return self.model.generate(*args, **kwargs) + + # @staticmethod + # def fuse_layers(model): + # pass + + @classmethod + def from_quantized(self, + model_path: str, + model_type: str, + max_seq_len: int, + torch_dtype: torch.dtype = torch.float16, + device_map: Union[str, Dict] = 'balanced', + **config_kwargs: Dict): + """A method for initialization of a quantized model, usually in INT4. + + Args: + model_path (str): The model path + model_type (str): The model type, loaded from config.json. + max_seq_len (int): The maximum sequence cached sequence length of + the model. Larger values may increase loading time and + memory usage. + torch_dtype: The dtype to load the model as. May not work with + other values than float16. + device_map: A device map that will be passed onto the model + loading method from transformers. 
+ **config_kwargs: Additional kwargs that are passed to the config + during initialization + """ + # [STEP 1-2] Load weights path and configs + model_weights_path, config, quant_config = self._load_config( + self, + model_path, + max_seq_len=max_seq_len, + **config_kwargs, + ) + + target_cls_name = TRANSFORMERS_AUTO_MAPPING_DICT[config.model_type] + target_cls = getattr(transformers, target_cls_name) + + # [STEP 3] Load model + with init_empty_weights(): + model = target_cls.from_config( + config=config, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + # Prepare WQLinear layers, replace nn.Linear + self._load_quantized_modules( + self, + model, + quant_config, + quant_config.version, + use_exllama=False, + use_exllama_v2=False, + use_qbits=False, + ) + + model.tie_weights() + + # loads the weights into modules and distributes + # across available devices automatically + load_checkpoint_and_dispatch( + model, + checkpoint=model_weights_path, + device_map=device_map, + no_split_module_classes=[self.layer_type], + dtype=torch_dtype, + ) + + # TODO + # model = turbomind_post_init(model) + + # # Dispatch to devices + # if fuse_layers: + # self.fuse_layers(model) + + model.eval() + + return self( + model, + model_type, + is_quantized=True, + config=config, + quant_config=quant_config, + ) + + def _load_config( + self, + model_path, + max_seq_len=4096, + **config_kwargs, + ): + # [STEP 2] Load config and set sequence length + # TODO: Create BaseAWQConfig class + quant_config = AwqConfig.from_pretrained(model_path) + + # Load model config and set max generation length + if max_seq_len is None and hasattr(self, 'max_seq_len_key'): + config = AutoConfig.from_pretrained(model_path, + trust_remote_code=True, + **config_kwargs) + config.max_seq_len = getattr(config, self.max_seq_len_key, 2048) + # To add the generate support for Multi-modal models as well + if hasattr(config, 'text_config'): + config.text_config.max_seq_len = getattr( + config, self.max_seq_len_key, 2048) + else: + max_seq_len = 2048 if max_seq_len is None else max_seq_len + config = AutoConfig.from_pretrained(model_path, + trust_remote_code=True, + **config_kwargs) + config.max_seq_len = max_seq_len + + return model_path, config, quant_config + + def _load_quantized_modules(self, + model, + quant_config, + version, + use_exllama, + use_exllama_v2, + use_qbits=False): + # Real quantization of weights + assert not (version == 'gemv' and + (use_exllama or use_exllama_v2 or + use_qbits)), 'Exllama kernels only support GEMM version.' 
+ + # Get blocks of model + layers = self.get_model_layers(model) + + for i in tqdm(range(len(layers)), desc='Replacing layers...'): + layer = layers[i] + + # Get every linear layer in a block + named_linears = get_named_linears(layer) + + # Filter out the linear layers we don't want to include + named_linears = exclude_layers_to_not_quantize( + named_linears, quant_config.modules_to_not_convert) + + # Replace activation functions + self._scale_activations(self, layer) + + # Replace nn.Linear with WQLinear + for name, module in named_linears.items(): + assert version == 'gemm' + + q_linear_module = WQLinear_GEMM + # q_linear_module = Linear + q_linear = q_linear_module.from_linear( + module, quant_config.w_bit, quant_config.q_group_size, + True) + q_linear.to(next(layer.parameters()).device) + set_op_by_name(layer, name, q_linear) + + if not use_qbits: + torch.cuda.empty_cache() + gc.collect() + + @staticmethod + def _scale_activations(self, layer): + scale_dict = self.get_act_for_scaling(layer) + + if scale_dict['is_scalable']: + if not isinstance(scale_dict['scale_layer'], ScaledActivation): + param = next(layer.parameters()) + + # get activation scale + scale_like = torch.ones(scale_dict['scale_shape'], + dtype=param.dtype, + device=param.device) + + # scale activation + scaled_act = ScaledActivation(scale_dict['scale_layer'], + scale_like) + set_op_by_name(layer, scale_dict['scale_name'], scaled_act) diff --git a/example/awq/gemm.py b/example/awq/gemm.py new file mode 100644 index 0000000..664b4f5 --- /dev/null +++ b/example/awq/gemm.py @@ -0,0 +1,287 @@ +import warnings + +import torch +import torch.nn as nn +from torch.autograd import Function + +from .packing_utils import dequantize_gemm +from .utils import get_best_device + +try: + import awq_ext # with CUDA kernels (AutoAWQ_kernels) + + AWQ_INSTALLED = True +except Exception as ex: + AWQ_INSTALLED = False + warnings.warn( + f'AutoAWQ could not load GEMM kernels extension. Details: {ex}') + + +# Adapted from https://github.com/compressa-ai/AutoAWQ/tree/dev +class WQLinearMMFunction(Function): + + @staticmethod + # ctx is the first argument to forward + def forward( + ctx, + x, + qweight, + qzeros, + scales, + w_bit=4, + group_size=128, + bias=None, + out_features=0, + ): + # The forward pass can use ctx. + ctx.save_for_backward(x, qweight, qzeros, scales, bias) + ctx.out_features = out_features + + out_shape = x.shape[:-1] + (out_features, ) + x = x.to(torch.float16) + + if AWQ_INSTALLED: + FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 1024 + + if FP16_MATMUL_HEURISTIC_CONDITION: + out = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, + 0, 0, 0, False) + out = torch.matmul(x, out) + else: + out = awq_ext.gemm_forward_cuda(x.reshape(-1, x.shape[-1]), + qweight, scales, qzeros, 8) + else: + out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) + out = torch.matmul(x, out) + + out = out + bias if bias is not None else out + out = out.reshape(out_shape) + + # always want 3D tensor if tensor is 2D + if len(out.shape) == 2: + out = out.unsqueeze(0) + + return out + + @staticmethod + def backward(ctx, grad_output): + input, qweight, qzeros, scales, bias = ctx.saved_tensors + + if not AWQ_INSTALLED: + raise ValueError( + 'auto-awq kernels is needed to be installed to ' + 'use `.backward()`. 
Make sure to install the auto-awq kernels' + ' by following the installation guides in ' + 'https://github.com/casper-hansen/AutoAWQ_kernels') + + # Cast to correct dtype for mixed precision training + weights = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, 1, + 0, 0, + False).to(grad_output.dtype) + + if ctx.needs_input_grad[0]: + # 3D matmul using torch.bmm: + # https://pytorch.org/docs/stable/generated/torch.bmm.html#torch.bmm # noqa + # to propagate gradient across all batch sizes. + batch_size = grad_output.shape[0] + grad_input = grad_output.bmm( + weights.transpose(0, 1).unsqueeze(0).repeat(batch_size, 1, 1)) + + return grad_input, None, None, None, None, None, None, None + + +class WQLinear_GEMM(nn.Module): + + def __init__(self, + w_bit, + group_size, + in_features, + out_features, + bias, + dev, + training=False): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError('Only 4-bit are supported for now.') + + self.in_features = in_features + self.out_features = out_features + self.w_bit = w_bit + self.group_size = group_size if group_size != -1 else in_features + self.training = training + + # quick sanity check (make sure alignment) + assert self.in_features % self.group_size == 0 + assert out_features % (32 // self.w_bit) == 0 + + self.register_buffer( + 'qweight', + torch.zeros( + (in_features, out_features // (32 // self.w_bit)), + dtype=torch.int32, + device=dev, + ), + ) + self.register_buffer( + 'qzeros', + torch.zeros( + (in_features // self.group_size, out_features // + (32 // self.w_bit)), + dtype=torch.int32, + device=dev, + ), + ) + self.register_buffer( + 'scales', + torch.zeros( + (in_features // self.group_size, out_features), + dtype=torch.float16, + device=dev, + ), + ) + if bias: + self.register_buffer( + 'bias', + torch.zeros( + (out_features), + dtype=torch.float16, + device=dev, + ), + ) + else: + self.bias = None + + @classmethod + def from_linear(cls, + linear, + w_bit, + group_size, + init_only=False, + scales=None, + zeros=None): + awq_linear = cls( + w_bit, + group_size, + linear.in_features, + linear.out_features, + linear.bias is not None, + linear.weight.device, + ) + if init_only: # just prepare for loading sd + return awq_linear + + # need scales and zeros info for real quantization + assert scales is not None and zeros is not None + scale_zeros = zeros * scales + + awq_linear.scales = scales.clone().half() + if linear.bias is not None: + awq_linear.bias = linear.bias.clone().half() + + pack_num = 32 // awq_linear.w_bit + + intweight = [] + for idx in range(awq_linear.in_features): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + + scale_zeros[idx // group_size]) / + awq_linear.scales[idx // group_size]).to(torch.int)[:, + None]) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.to(dtype=torch.int32) + + best_device = get_best_device() + + # Avoid: The operator 'aten::__lshift__.Scalar' is not currently + # implemented for the MPS device + if 'mps' in best_device: + intweight = intweight.to('cpu') + + qweight = torch.zeros( + (intweight.shape[0], intweight.shape[1] // 32 * awq_linear.w_bit), + dtype=torch.int32, + device=intweight.device, + ) + + for col in range(intweight.shape[1] // pack_num): + if awq_linear.w_bit == 4: + order_map = [0, 2, 4, 6, 1, 3, 5, 7] + else: + raise NotImplementedError('Only 4-bit are supported for now.') + for i in range(pack_num): + qweight_col = intweight[:, col * pack_num + order_map[i]] + qweight[:, col] |= 
qweight_col << (i * awq_linear.w_bit) + awq_linear.qweight = qweight + + zeros = zeros.to(dtype=torch.int32, device=best_device) + + if 'mps' in best_device: + zeros = zeros.to('cpu') + + qzeros = torch.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * awq_linear.w_bit), + dtype=torch.int32, + device=zeros.device, + ) + + for col in range(zeros.shape[1] // pack_num): + if awq_linear.w_bit == 4: + order_map = [0, 2, 4, 6, 1, 3, 5, 7] + else: + raise NotImplementedError('Only 4-bit are supported for now.') + for i in range(pack_num): + qzero_col = zeros[:, col * pack_num + order_map[i]] + qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit) + awq_linear.qzeros = qzeros + + return awq_linear + + def forward(self, x): + out_shape = x.shape[:-1] + (self.out_features, ) + + input_dtype = x.dtype + if input_dtype != torch.float16: + x = x.half() + + if self.training: + out = WQLinearMMFunction.apply( + x, + self.qweight, + self.qzeros, + self.scales, + self.w_bit, + self.group_size, + self.bias, + self.out_features, + ) + else: + with torch.no_grad(): + out = WQLinearMMFunction.apply( + x, + self.qweight, + self.qzeros, + self.scales, + self.w_bit, + self.group_size, + self.bias, + self.out_features, + ) + + if input_dtype != torch.float16: + out = out.to(dtype=input_dtype) + + return out.reshape(out_shape) + + def extra_repr(self) -> str: + return ( + 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}' + .format( + self.in_features, + self.out_features, + self.bias is not None, + self.w_bit, + self.group_size, + )) diff --git a/example/awq/llama.py b/example/awq/llama.py new file mode 100644 index 0000000..f221e48 --- /dev/null +++ b/example/awq/llama.py @@ -0,0 +1,81 @@ +# from awq.utils.fused_utils import fuse_qkv +# from awq.modules.fused.block import LlamaLikeBlock +# from awq.modules.fused.model import LlamaLikeModel +from transformers.models.llama.modeling_llama import \ + LlamaDecoderLayer as OldLlamaDecoderLayer +from transformers.models.llama.modeling_llama import \ + LlamaForCausalLM as OldLlamaForCausalLM + +from .base import BaseAWQForCausalLM + +# from awq.modules.fused.norm import FasterTransformerRMSNorm + + +class LlamaAWQForCausalLM(BaseAWQForCausalLM): + layer_type = 'LlamaDecoderLayer' + max_seq_len_key = 'max_position_embeddings' + + # @staticmethod + # def fuse_layers(model: OldLlamaForCausalLM): + # fuser = LlamaFuser(model) + # fuser.fuse_transformer() + + @staticmethod + def get_model_layers(model: OldLlamaForCausalLM): + return model.model.layers + + @staticmethod + def get_act_for_scaling(module: OldLlamaDecoderLayer): + return dict(is_scalable=False) + + @staticmethod + def move_embed(model: OldLlamaForCausalLM, device: str): + model.model.embed_tokens = model.model.embed_tokens.to(device) + + @staticmethod + def get_layers_for_scaling(module: OldLlamaDecoderLayer, input_feat, + module_kwargs): + layers = [] + + # attention input + layers.append( + dict( + prev_op=module.input_layernorm, + layers=[ + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ], + inp=input_feat['self_attn.q_proj'], + module2inspect=module.self_attn, + kwargs=module_kwargs, + )) + + # attention out + # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 # noqa + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: # noqa + layers.append( + dict( + prev_op=module.self_attn.v_proj, + layers=[module.self_attn.o_proj], + inp=input_feat['self_attn.o_proj'], + )) + + # linear 1 + layers.append( + 
dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.gate_proj, module.mlp.up_proj], + inp=input_feat['mlp.gate_proj'], + module2inspect=module.mlp, + )) + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.up_proj, + layers=[module.mlp.down_proj], + inp=input_feat['mlp.down_proj'], + )) + + return layers diff --git a/example/awq/module.py b/example/awq/module.py new file mode 100644 index 0000000..aeefb66 --- /dev/null +++ b/example/awq/module.py @@ -0,0 +1,60 @@ +import torch.nn as nn + + +def get_named_linears(module): + return { + name: m + for name, m in module.named_modules() if isinstance(m, nn.Linear) + } + + +def get_op_by_name(module, op_name): + # get the op by its name relative to the module + for name, m in module.named_modules(): + if name == op_name: + return m + raise ValueError(f'Cannot find op {op_name} in module {module}') + + +def set_op_by_name(layer, name, new_module): + levels = name.split('.') + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_op_name(module, op): + # get the name of the op relative to the module + for name, m in module.named_modules(): + if m is op: + return name + raise ValueError(f'Cannot find op {op} in module {module}') + + +def append_str_prefix(x, prefix): + if isinstance(x, str): + return prefix + x + elif isinstance(x, tuple): + return tuple([append_str_prefix(y, prefix) for y in x]) + elif isinstance(x, list): + return [append_str_prefix(y, prefix) for y in x] + else: + return x + + +def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): + if modules_to_not_convert is None: + return linear_layers + + filtered_layers = {} + for name, linear_layer in linear_layers.items(): + if not any(key in name for key in modules_to_not_convert): + filtered_layers[name] = linear_layer + return filtered_layers diff --git a/example/awq/packing_utils.py b/example/awq/packing_utils.py new file mode 100644 index 0000000..a9724fa --- /dev/null +++ b/example/awq/packing_utils.py @@ -0,0 +1,97 @@ +import torch + +AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] +AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] + + +def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): + shifts = torch.arange(0, 32, bits, device=qzeros.device) + + # unpacking columnwise + iweights = torch.bitwise_right_shift( + qweight[:, :, None], + shifts[None, None, :]).to(torch.int8 # smallest dtype available + ) + iweights = iweights.view(iweights.shape[0], -1) + + # unpacking columnwise + if qzeros is not None: + izeros = torch.bitwise_right_shift( + qzeros[:, :, None], + shifts[None, None, :]).to(torch.int8 # smallest dtype available + ) + izeros = izeros.view(izeros.shape[0], -1) + else: + izeros = qzeros + + return iweights, izeros + + +def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): + reverse_order_tensor = torch.arange( + iweights.shape[-1], + dtype=torch.int32, + device=izeros.device, + ) + reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) + reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] + reverse_order_tensor = reverse_order_tensor.view(-1) + + if izeros is not None: + izeros = izeros[:, reverse_order_tensor] + iweights = iweights[:, reverse_order_tensor] + + return iweights, izeros + + +def pack_exllama(iweights: torch.Tensor, izeros: 
torch.Tensor, bits: int): + shifts = torch.arange(0, 32, bits, device=iweights.device) + + # packing rowwise + iweights = iweights.view(iweights.shape[0] // (32 // bits), 32 // bits, -1) + qweight = (torch.bitwise_left_shift( + iweights, shifts[None, :, None]).sum(dim=1).to(torch.int32)) + + # packing columnwise + izeros = izeros.view(-1, izeros.shape[1] // (32 // bits), 32 // bits) + qzeros = (torch.bitwise_left_shift( + izeros, shifts[None, None, :]).sum(dim=-1).to(torch.int32)) + + return qweight, qzeros + + +def unpack_reorder_pack(qweight, qzeros, bits): + # Unpack the qweight and qzeros tensors + iweight, izeros = unpack_awq(qweight, qzeros, bits) + # Reverse the order of the iweight and izeros tensors + iweight, izeros = reverse_awq_order(iweight, izeros, bits) + + # overflow checks + iweight = torch.bitwise_and(iweight, (2**bits) - 1) + izeros = torch.bitwise_and(izeros, (2**bits) - 1) + + # Subtract 1 from the izeros tensor (exllama adds 1 during inference) + # We can remove it if we remove the +1 in the exllama code + izeros = izeros - 1 + # Pack the qweight and qzeros tensors + qweight, qzeros = pack_exllama(iweight, izeros, bits) + + return qweight, qzeros + + +def dequantize_gemm(qweight, qzeros, scales, bits, group_size): + # Unpack the qweight and qzeros tensors + iweight, izeros = unpack_awq(qweight, qzeros, bits) + # Reverse the order of the iweight and izeros tensors + iweight, izeros = reverse_awq_order(iweight, izeros, bits) + + # overflow checks + iweight = torch.bitwise_and(iweight, (2**bits) - 1) + izeros = torch.bitwise_and(izeros, (2**bits) - 1) + + # fp16 weights + scales = scales.repeat_interleave(group_size, dim=0) + izeros = izeros.repeat_interleave(group_size, dim=0) + iweight = (iweight - izeros) * scales + + return iweight diff --git a/example/awq/utils.py b/example/awq/utils.py new file mode 100644 index 0000000..3ba594c --- /dev/null +++ b/example/awq/utils.py @@ -0,0 +1,110 @@ +import gc +import importlib + +import accelerate +import torch + +qbits_available = importlib.util.find_spec( + 'intel_extension_for_transformers') is not None + + +def get_module_by_name_suffix(model, module_name: str): + for name, module in model.named_modules(): + if name.endswith(module_name): + return module + + +def simple_dispatch_model(model, device_map): + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + if '' in device_map: + d = device_map[''] + model = model.to(torch.device(d)) + model.hf_device_map = device_map + return model + + tied_params = accelerate.utils.modeling.find_tied_parameters(model) + if set(device_map.values()) == {'cpu'} or set(device_map.values()) == { + 'cpu', + 'disk', + }: + main_device = 'cpu' + else: + main_device = [ + d for d in device_map.values() if d not in ['cpu', 'disk'] + ][0] + + cpu_offload_group = [(n, d) for n, d in device_map.items() if d == 'cpu'] + prev_hook = None + for idx, (n, d) in enumerate(cpu_offload_group): + m = get_module_by_name_suffix(model, n) + _, prev_hook = accelerate.cpu_offload_with_hook( + m, execution_device=main_device, prev_module_hook=prev_hook) + # set first cpu offload module's prev_module_hook + # to the last cpu offload module's hook + if len(cpu_offload_group) > 1: + get_module_by_name_suffix( + model, + cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook + + for n, d in device_map.items(): + m = get_module_by_name_suffix(model, n) + if d != 'cpu': + d = torch.device(d) + hook = AlignDevicesHook(d, + io_same_device=True, + place_submodules=True) + add_hook_to_module(m, 
hook) + accelerate.utils.modeling.retie_parameters(model, tied_params) + model.hf_device_map = device_map + + return model + + +def set_module_name(model, name, value): + if '.' in name: + parent_name = name.rsplit('.', 1)[0] + child_name = name[len(parent_name) + 1:] + parent = model.get_submodule(parent_name) + else: + parent_name = '' + parent = model + child_name = name + + setattr(parent, child_name, value) + + +def clear_memory(weight=None): + if weight is not None: + del weight + gc.collect() + torch.cuda.empty_cache() + + +def compute_memory_used_pct(device): + memory_used = torch.cuda.max_memory_allocated(device) / (1024**3) + memory_pct = (memory_used / + (torch.cuda.get_device_properties(device).total_memory / + (1024**3)) * 100) + return memory_pct + + +def get_best_device(): + if torch.backends.mps.is_available(): + return 'mps' + elif torch.cuda.is_available(): + return 'cuda:0' + else: + return 'cpu' + + +def get_lowest_memory_device_index(): + device = None + curr_device_memory_pct = 0 + for device_index in range(torch.cuda.device_count()): + device_memory_pct = compute_memory_used_pct(device_index) + if device is None or device_memory_pct < curr_device_memory_pct: + device = device_index + curr_device_memory_pct = device_memory_pct + + return device diff --git a/example/generate.py b/example/generate.py new file mode 100644 index 0000000..cca53ab --- /dev/null +++ b/example/generate.py @@ -0,0 +1,44 @@ +import torch +from awq.auto import AutoAWQForCausalLM +from transformers import AutoTokenizer, TextStreamer + +quant_path = '/mnt/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ' + +# Load model +model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True) + +tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) +streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + +prompt = "You're standing on the surface of the Earth. "\ + 'You walk one mile south, one mile west and one mile north. '\ + 'You end up exactly where you started. Where are you?' + +chat = [ + { + 'role': 'system', + 'content': 'You are a concise assistant that helps answer questions.' 
+ }, + { + 'role': 'user', + 'content': prompt + }, +] + +terminators = [ + tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids('<|eot_id|>') +] + +tokens = tokenizer.apply_chat_template(chat, return_tensors='pt') +tokens = tokens.to(torch.device('cuda')) + +# Generate output +generation_output = model.generate(tokens, + streamer=streamer, + max_new_tokens=64, + eos_token_id=terminators) + +res = tokenizer.decode(generation_output[0].cpu().numpy().tolist()) +print(f'token_ids: {generation_output}') +print(f'output: {res}') From 8cc45809aa1e08f4345403e8361a1a5c8783715f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:24:09 +0800 Subject: [PATCH 05/18] bind turbomind linear --- CMakeLists.txt | 5 +- MANIFEST.in | 5 + example/test_linear.py | 163 ++++++++ generate.sh | 3 +- src/turbomind/CMakeLists.txt | 2 - src/turbomind/api/python/CMakeLists.txt | 4 +- src/turbomind/api/python/bind.cpp | 367 +++++++++++++++++ src/turbomind/api/python/linear.cc | 284 ++++++++++++++ src/turbomind/api/python/linear.h | 50 +++ src/turbomind/kernels/gemm/CMakeLists.txt | 4 +- src/turbomind/utils/cuda_utils.h | 456 ++++++++++++++++++++++ src/turbomind/utils/tensor.h | 86 ++++ turbomind/__init__.py | 4 + turbomind/linear.py | 225 ++++++++++- 14 files changed, 1645 insertions(+), 13 deletions(-) create mode 100644 MANIFEST.in create mode 100644 example/test_linear.py create mode 100644 src/turbomind/api/python/linear.cc create mode 100644 src/turbomind/api/python/linear.h create mode 100644 src/turbomind/utils/cuda_utils.h create mode 100644 src/turbomind/utils/tensor.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f8f984..29299f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,9 +150,8 @@ link_directories( add_subdirectory(src/turbomind) # install python api -if (BUILD_PY_FFI) - install(TARGETS _tm DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) -endif () +install(TARGETS turbomind_kernels DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) + if (MSVC) return() diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..0462a5a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ + +include turbomind/lib/*.so +include turbomind/lib/*.so* +include turbomind/lib/*.dll +include turbomind/lib/*.pyd diff --git a/example/test_linear.py b/example/test_linear.py new file mode 100644 index 0000000..2bd6e46 --- /dev/null +++ b/example/test_linear.py @@ -0,0 +1,163 @@ +import torch +from safetensors import safe_open + +import turbomind as tm + +torch.manual_seed(0) + + +def i32x8_to_i4x8(w): + """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" + assert w.shape[-1] % 8 == 0 + shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) + shape = shape[:-1] + (1, ) + result = torch.zeros(shape, dtype=w.dtype, device=w.device) + mask = torch.tensor([15], dtype=w.dtype, device=w.device) + for i in range(8): + shift = 4 * (7 - i) + result[..., 0] |= (w[..., i] & mask) << shift + result = result.view(w.shape[0], -1) + return result + + +def i4x8_to_i32x8(w): + """split one integer every 4bits into 8 integers (range from 0 to 15)""" + shape = w.shape + (8, ) + result = torch.zeros(shape, dtype=w.dtype, device=w.device) + mask = torch.tensor([15], dtype=w.dtype, device=w.device) + for i in range(8): + shift = 4 * (7 - i) + result[..., i] = (w >> shift) & mask + result = result.view(w.shape[0], -1) + return result + + +# ## test i4x8_to_i32x8 +# value = 1636164468 +# print(hex(value)) +# a = torch.tensor([[value, value], [value, value]], dtype=torch.int32) +# b = i4x8_to_i32x8(a) 
+# print(b)
+# c = i32x8_to_i4x8(b)
+# print(c)
+# cmp = a == c
+# assert torch.sum(cmp) == cmp.numel()
+# exit(0)
+# ## end test
+
+
+def makeup_qweight(in_features: int, out_features: int):
+    assert out_features % 8 == 0
+    qweight = torch.randint(0,
+                            16, (in_features, out_features // 8, 8),
+                            dtype=torch.int32,
+                            device='cuda')
+    print(f'-- makeup qweight: shape {qweight.shape}')
+    print(qweight.view(in_features, -1))
+    qweight = i32x8_to_i4x8(qweight)
+    print(f'-- merge qweight: shape {qweight.shape}')
+    print(qweight)
+    return qweight
+
+
+def makeup_qzeros(in_features: int, out_features: int, group_size: int):
+    assert out_features % 8 == 0
+    assert in_features % group_size == 0 and in_features // group_size >= 1
+
+    qzeros = torch.randint(0,
+                           16,
+                           (in_features // group_size, out_features // 8, 8),
+                           dtype=torch.int32,
+                           device='cuda')
+    print(f'-- makeup qzero: shape {qzeros.shape}')
+    print(qzeros.view(in_features // group_size, -1))
+    qzeros = i32x8_to_i4x8(qzeros)
+    print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}')
+    return qzeros
+
+
+def makeup_scales(in_features: int, out_features: int, group_size: int):
+    assert in_features % group_size == 0 and in_features // group_size >= 1
+    scales = torch.rand((in_features // group_size, out_features),
+                        dtype=torch.float16,
+                        device='cuda')
+    print(f'-- makeup scales: shape {scales.shape}\n{scales}')
+    return scales
+
+
+def dequantize(qweight, qzeros, scales, group_size: int = 128):
+    _qweight = i4x8_to_i32x8(qweight)
+    _qzeros = i4x8_to_i32x8(qzeros)
+    _qzeros = _qzeros.half()
+    weight = _qweight.clone().half()
+    for i in range(qzeros.shape[0]):
+        start = i * group_size
+        end = start + group_size
+        weight[start:end] = (weight[start:end, :] -
+                             _qzeros[i:i + 1, :]) * scales[i:i + 1, :]
+    return weight
+
+
+# in_features = 128
+# out_features = 8
+# group_size = 128
+# qweight = makeup_qweight(in_features, out_features)
+# qzeros = makeup_qzeros(in_features=in_features,
+#                        out_features=out_features,
+#                        group_size=group_size)
+# scales = makeup_scales(in_features,
+#                        out_features=out_features,
+#                        group_size=group_size)
+
+# weight = dequantize(qweight, qzeros, scales, group_size)
+# print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}')
+# ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda')
+# with torch.no_grad():
+#     ref_linear.weight = nn.Parameter(weight.T)
+
+# x = torch.randn(in_features, device=weight.device, dtype=weight.dtype)
+# print(f'input: {x}')
+# print(weight.device, x.device)
+# ref_res = ref_linear(x)
+# print(ref_res)
+
+
+def load_specified_linear_weights():
+    ckpt_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ/model-00001-of-00002.safetensors'  # noqa
+    layer_id = 0
+    # prefix = f'model.layers.{layer_id}.self_attn.q_proj.'
+    prefix = f'model.layers.{layer_id}.mlp.gate_proj.'
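+    # an AWQ checkpoint stores each quantized linear layer as three tensors:
+    # packed 4-bit weights ('qweight'), packed zero points ('qzeros') and
+    # per-group fp16 scales ('scales')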
+ keys = ['qweight', 'qzeros', 'scales'] + tensors = {} + with safe_open(ckpt_path, framework='pt', device='cuda') as f: + for key in keys: + tensors[key] = f.get_tensor(prefix + key) + + return tensors + + +tensors = load_specified_linear_weights() +qweight, qzeros, scales = tensors['qweight'], tensors['qzeros'], tensors[ + 'scales'] + +group_size = 128 +in_features = qweight.shape[0] +out_features = qweight.shape[1] * 8 + +model = tm.Linear(in_features=in_features, + out_features=out_features, + bias=False, + quant_method='awq', + w_bit=4, + group_size=group_size) + +model.qweight = qweight +model.qzeros = qzeros +model.scales = scales + +model.post_init() + +x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) +res = model(x) +# max_diff = max(abs(ref_res - res)) +# ave_diff = sum(abs(ref_res - res)) / ref_res.numel() diff --git a/generate.sh b/generate.sh index 7ed3168..09c51a7 100755 --- a/generate.sh +++ b/generate.sh @@ -1,9 +1,8 @@ #!/bin/sh -cmake -G Ninja .. \ +cmake .. \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-11.8/bin/nvcc \ -DCMAKE_CUDA_FLAGS="-lineinfo" \ -DUSE_NVTX=ON \ -DFETCHCONTENT_UPDATES_DISCONNECTED=ON \ diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index ed840d6..009d0f8 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -1,5 +1,3 @@ - - add_subdirectory(utils) add_subdirectory(kernels/gemm) add_subdirectory(api/python) diff --git a/src/turbomind/api/python/CMakeLists.txt b/src/turbomind/api/python/CMakeLists.txt index 170fb36..bf9acb7 100644 --- a/src/turbomind/api/python/CMakeLists.txt +++ b/src/turbomind/api/python/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
cmake_minimum_required(VERSION 3.8) -project(_turbomind_ext) +project(turbomind_kernels) find_package(pybind11 CONFIG) if(NOT pybind11_FOUND) @@ -12,7 +12,7 @@ if(NOT pybind11_FOUND) find_package(pybind11 CONFIG) endif() -pybind11_add_module(${PROJECT_NAME} bind.cpp) +pybind11_add_module(${PROJECT_NAME} bind.cpp linear.cc) target_link_libraries(${PROJECT_NAME} PRIVATE gemm2) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index e69de29..bd81fd9 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -0,0 +1,367 @@ + +#include "src/turbomind/api/python/dlpack.h" +#include "src/turbomind/api/python/linear.h" +#include "src/turbomind/utils/tensor.h" +#include "src/turbomind/utils/cuda_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +// namespace tm = turbomind; +using namespace pybind11::literals; + +static const char kDlTensorCapsuleName[] = "dltensor"; + +DLDevice getDLDevice(turbomind::Tensor& tensor) +{ + int device_id = 0; + if (tensor.where == turbomind::MEMORY_GPU) { + cudaPointerAttributes ptr_attr; + cudaPointerGetAttributes(&ptr_attr, tensor.data); + device_id = ptr_attr.device; + } + + DLDevice device{kDLCPU, device_id}; + + switch (tensor.where) { + case turbomind::MEMORY_CPU: + device.device_type = DLDeviceType::kDLCPU; + break; + case turbomind::MEMORY_CPU_PINNED: + device.device_type = DLDeviceType::kDLCUDAHost; + break; + case turbomind::MEMORY_GPU: + device.device_type = DLDeviceType::kDLCUDA; + break; + default: + break; + } + + return device; +} + +DLManagedTensor* TurbomindTensorToDLManagedTensor(turbomind::Tensor& tensor) +{ + DLDevice device = getDLDevice(tensor); + + DLDataType data_type{0, 0, 1}; + switch (tensor.type) { + case turbomind::TYPE_BOOL: + data_type.code = DLDataTypeCode::kDLBool; + data_type.bits = 8; + break; + case turbomind::TYPE_UINT8: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 8; + break; + case turbomind::TYPE_UINT16: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 16; + break; + case turbomind::TYPE_UINT32: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 32; + break; + case turbomind::TYPE_UINT64: + data_type.code = DLDataTypeCode::kDLUInt; + data_type.bits = 64; + break; + case turbomind::TYPE_INT8: + case turbomind::TYPE_BYTES: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 8; + break; + case turbomind::TYPE_INT16: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 16; + break; + case turbomind::TYPE_INT32: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 32; + break; + case turbomind::TYPE_INT64: + data_type.code = DLDataTypeCode::kDLInt; + data_type.bits = 64; + break; + case turbomind::TYPE_FP16: + data_type.code = DLDataTypeCode::kDLFloat; + data_type.bits = 16; + break; + case turbomind::TYPE_FP32: + data_type.code = DLDataTypeCode::kDLFloat; + data_type.bits = 32; + break; + case turbomind::TYPE_FP64: + data_type.code = DLDataTypeCode::kDLFloat; + data_type.bits = 64; + break; + case turbomind::TYPE_BF16: + data_type.code = DLDataTypeCode::kDLBfloat; + data_type.bits = 16; + break; + default: + break; + } + DLTensor dl_tensor{const_cast(tensor.data), + device, + (int32_t)(tensor.shape.size()), + data_type, + reinterpret_cast(const_cast(tensor.shape.data())), + (int64_t*)(nullptr), + 0}; + return new DLManagedTensor{dl_tensor, nullptr, 
[](DLManagedTensor* dlmt) { delete dlmt; }}; +} + +turbomind::MemoryType getMemoryType(DLDevice device) +{ + switch (device.device_type) { + case DLDeviceType::kDLCUDAHost: + return turbomind::MemoryType::MEMORY_CPU_PINNED; + case DLDeviceType::kDLCUDA: + return turbomind::MemoryType::MEMORY_GPU; + case DLDeviceType::kDLCPU: + default: + return turbomind::MemoryType::MEMORY_CPU; + } +} + +turbomind::DataType getDataType(DLDataType data_type) +{ + switch (data_type.code) { + case DLDataTypeCode::kDLUInt: + switch (data_type.bits) { + case 8: + return turbomind::TYPE_UINT8; + case 16: + return turbomind::TYPE_UINT16; + case 32: + return turbomind::TYPE_UINT32; + case 64: + return turbomind::TYPE_UINT64; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLInt: + switch (data_type.bits) { + case 8: + return turbomind::TYPE_INT8; + case 16: + return turbomind::TYPE_INT16; + case 32: + return turbomind::TYPE_INT32; + case 64: + return turbomind::TYPE_INT64; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLFloat: + switch (data_type.bits) { + case 16: + return turbomind::TYPE_FP16; + case 32: + return turbomind::TYPE_FP32; + case 64: + return turbomind::TYPE_FP64; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLBfloat: + switch (data_type.bits) { + case 16: + return turbomind::TYPE_BF16; + default: + return turbomind::TYPE_INVALID; + } + break; + case DLDataTypeCode::kDLBool: + return turbomind::TYPE_BOOL; + default: + return turbomind::TYPE_INVALID; + } +} + +std::shared_ptr DLManagedTensorToTurbomindTensor(DLManagedTensor* tensor) +{ + auto& dl_tensor = tensor->dl_tensor; + auto where = getMemoryType(dl_tensor.device); + auto dtype = getDataType(dl_tensor.dtype); + assert(dl_tensor.ndim > 0); + std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); + auto data = dl_tensor.data; + + return std::make_shared(where, dtype, shape, data); +} + +std::shared_ptr TorchTensorToTurbomindTensor(py::object obj) +{ + py::capsule cap = obj.attr("__dlpack__")(); + DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + return DLManagedTensorToTurbomindTensor(dlmt); +} + +PYBIND11_MODULE(turbomind_kernels, m) { + py::enum_(m, "WeightType") + .value("kFP32", turbomind::WeightType::kFP32) + .value("kFP16", turbomind::WeightType::kFP16) + .value("kFP8", turbomind::WeightType::kFP8) + .value("kBF16", turbomind::WeightType::kBF16) + .value("kINT8", turbomind::WeightType::kINT8) + .value("kINT4", turbomind::WeightType::kINT4); + + // data type + py::enum_(m, "DataType") + .value("TYPE_INVALID", turbomind::DataType::TYPE_INVALID) + .value("TYPE_BOOL", turbomind::DataType::TYPE_BOOL) + .value("TYPE_UINT8", turbomind::DataType::TYPE_UINT8) + .value("TYPE_UINT16", turbomind::DataType::TYPE_UINT16) + .value("TYPE_UINT32", turbomind::DataType::TYPE_UINT32) + .value("TYPE_UINT64", turbomind::DataType::TYPE_UINT64) + .value("TYPE_INT8", turbomind::DataType::TYPE_INT8) + .value("TYPE_INT16", turbomind::DataType::TYPE_INT16) + .value("TYPE_INT32", turbomind::DataType::TYPE_INT32) + .value("TYPE_INT64", turbomind::DataType::TYPE_INT64) + .value("TYPE_FP16", turbomind::DataType::TYPE_FP16) + .value("TYPE_FP32", turbomind::DataType::TYPE_FP32) + .value("TYPE_FP64", turbomind::DataType::TYPE_FP64) + .value("TYPE_BYTES", turbomind::DataType::TYPE_BYTES) + .value("TYPE_BF16", turbomind::DataType::TYPE_BF16); + + // memory type + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", 
turbomind::MemoryType::MEMORY_CPU) + .value("MEMORY_CPU_PINNED", turbomind::MemoryType::MEMORY_CPU_PINNED) + .value("MEMORY_GPU", turbomind::MemoryType::MEMORY_GPU); + + // tensor + py::class_>(m, "Tensor") + .def_readonly("where", &turbomind::Tensor::where) + .def_readonly("type", &turbomind::Tensor::type) + .def_readonly("shape", &turbomind::Tensor::shape) + .def_readonly("data", &turbomind::Tensor::data) + .def(py::init([](const turbomind::MemoryType where, + const turbomind::DataType type, + const std::vector& shape, + const long data) { + auto data_ptr = reinterpret_cast(data); + return new turbomind::Tensor(where, type, shape, data_ptr); + })) + .def( + "view", + [](turbomind::Tensor* self, turbomind::DataType new_type) { + return new turbomind::Tensor(self->where, new_type, self->shape, self->data); + }, + "new_type"_a) + .def( + "view", + [](turbomind::Tensor* self, std::vector new_shape) { + return new turbomind::Tensor(self->where, self->type, new_shape, self->data); + }, + "new_shape"_a) + .def( + "copy_from", + [](turbomind::Tensor* self, py::object obj) { + py::capsule cap = obj.attr("__dlpack__")(); + DLManagedTensor* dlmt = + static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + auto src = DLManagedTensorToTurbomindTensor(dlmt); + switch (self->type) { + case turbomind::TYPE_FP16: + case turbomind::TYPE_FP32: + case turbomind::TYPE_INT32: + case turbomind::TYPE_BF16: { + auto num_element = + std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); + auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; + turbomind::TM_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); + cudaMemcpy( + const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + break; + } + default: + turbomind::TM_CHECK(0); + } + }, + "tensor"_a) + .def( + "__dlpack__", + [](turbomind::Tensor* self, long stream) { + DLManagedTensor* dlmt = TurbomindTensorToDLManagedTensor(*self); + return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { + DLManagedTensor* dlmt = + static_cast(PyCapsule_GetPointer(obj, kDlTensorCapsuleName)); + if (dlmt) { + dlmt->deleter(dlmt); + } + else { + // The tensor has been deleted. Clear any error from + // PyCapsule_GetPointer. 
+ PyErr_Clear(); + } + }); + }, + "stream"_a = 0) + .def("__dlpack_device__", [](turbomind::Tensor* self) { + auto device = getDLDevice(*self); + return std::tuple(int(device.device_type), device.device_id); + }); + m.def( + "from_dlpack", + [](py::object obj) { + py::capsule cap = obj.attr("__dlpack__")(); + DLManagedTensor* dlmt = + static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + auto ret = DLManagedTensorToTurbomindTensor(dlmt); + return ret; + }, + "dl_managed_tensor"_a); + // m.def("convert_qweight", [](py::object qweight, int input_dims, int output_dims, bool simt) { + // py::capsule cap = qweight.attr("__dlpack__")(); + // DLManagedTensor* dlmt = + // static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); + // auto _qweight = DLManagedTensorToTurbomindTensor(dlmt); + // _qweight = convert_qweight(_qweight, input_dims, output_dims, simt); + // return *_qweight; + // }); + // m.def("convert_scales_zeros", + // [](py::object scales, py::object qzeros, py::object scales_zeros, int input_dims, int output_dims, int group_size, bool simt) { + // auto cap_scales = scales.attr("__dlpack__")(); + // auto cap_zeros = qzeros.attr("__dlpack__")(); + // auto cap_scales_zeros = scales_zeros.attr("__dlpack__")(); + // auto dlmt_scales = static_cast(PyCapsule_GetPointer(cap_scales.ptr(), kDlTensorCapsuleName)); + // auto dlmt_zeros = static_cast(PyCapsule_GetPointer(cap_zeros.ptr(), kDlTensorCapsuleName)); + // auto dlmt_scales_zeros = static_cast(PyCapsule_GetPointer(cap_scales_zeros.ptr(), kDlTensorCapsuleName)); + // auto _scales = DLManagedTensorToTurbomindTensor(dlmt_scales); + // auto _zeros = DLManagedTensorToTurbomindTensor(dlmt_zeros); + // auto _scales_zeros = DLManagedTensorToTurbomindTensor(dlmt_scales_zeros); + // _scales_zeros = convert_scales_zeros(_scales, _zeros, _scales_zeros, input_dims, output_dims, group_size, simt); + // return *_scales_zeros; + // }); + + // Instantiate turbomind::Linear + py::class_>(m, "Linear") + .def(py::init([](size_t in_features, size_t out_features, int w_bit, int group_size) { + return new turbomind::Linear(in_features, out_features, w_bit, group_size); + })) + .def("post_init", [](turbomind::Linear* linear, py::object qweight, py::object scales, py::object qzeros, + bool simt){ + auto _qweight = TorchTensorToTurbomindTensor(qweight); + auto _scales = TorchTensorToTurbomindTensor(scales); + auto _qzeros = TorchTensorToTurbomindTensor(qzeros); + linear->post_init(_qweight, _scales, _qzeros, simt); + }) + .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { + auto _in = TorchTensorToTurbomindTensor(in); + auto _out = TorchTensorToTurbomindTensor(out); + return linear->forward(_in, _out); + }); +} diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc new file mode 100644 index 0000000..8c68cdd --- /dev/null +++ b/src/turbomind/api/python/linear.cc @@ -0,0 +1,284 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
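+//
+// Linear is implemented with a pimpl: post_init() repacks the packed 4-bit
+// weights into the layout expected by the gemm2 kernels (transposing to the
+// required order, widening to u16, then calling Convert) and fuses the
+// per-group scales and zero points via fuse_scales_and_zeros; forward() then
+// runs a single FP16 x U4 GEMM through gemm::Gemm::Run.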
+ +#include "src/turbomind/kernels/gemm/cast.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/api/python/linear.h" +#include "src/turbomind/utils/cuda_utils.h" +#include +#include +#include + + +namespace turbomind { + +class GemmSinglton { +public: + static GemmSinglton& getInstance() { + static GemmSinglton singleton; + return singleton; + } +private: + GemmSinglton() { + + } + // cublasMMWrapper* cublas_wrapper_; + // gemm::Gemm gemm_; +}; + + +struct Linear::Impl { + + Impl(size_t input_dims, size_t output_dims, int w_bit, int group_size) : + input_dims_(input_dims), output_dims_(output_dims), w_bit_(w_bit), group_size_(group_size) { + workspace_ = {}; + + workspace_.barriers_size = gemm::Gemm::kBarriersSize; + workspace_.partials_size = gemm::Gemm::kPartialsSize; + cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); + cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_); + cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + } + + ~Impl() + { + cudaFreeAsync(workspace_.barriers, stream_); + cudaFreeAsync(workspace_.partials, stream_); + workspace_ = {}; + check_cuda_error(cudaFree(scales_zeros_)); + } + + void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, + bool simt) { + const auto workspace_size = input_dims_ * output_dims_ * sizeof(uint16_t); + void *workspace {}; + check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); + + convert_qweight(workspace, qweight, input_dims_, output_dims_, simt); + convert_scales_zeros(workspace, scales, qzeros, input_dims_, output_dims_, group_size_, simt); + + check_cuda_error(cudaFree(workspace)); + } + + void forward(std::shared_ptr in, std::shared_ptr out) { + TM_CHECK(in->type == TYPE_FP16 && out->type == TYPE_FP16); + TM_CHECK(in->shape.size() == 2 && in->shape[1] == input_dims_); + TM_CHECK(out->shape.size() == 2 && out->shape[0] == in->shape[0] && out->shape[1] == output_dims_); + + using namespace gemm; + + const Operation operation{dispatch_policy_, + Epilogue::kNone, + {QuantType::kNone}, + {QuantType::kDefault, group_size_}, + 0, + nullptr}; + + const MatrixLayout a_desc{ + gemm::DataType::F16, // get_data_type_v, + kRowMajor, + (int)in->shape[0], // row + (int)input_dims_, // col + (int)input_dims_ // input_data.pitch, // input_data.pitch = input_dims_ if input_data.pitch==0 + }; + + const MatrixLayout c_desc{ + gemm::DataType::F16, // get_data_type_v, + kRowMajor, + (int)in->shape[0], // row + (int)output_dims_, // col + (int)output_dims_ + }; + + auto ec = gemm_.Run(operation, + 1.f, + in->data, + a_desc, + nullptr, + {}, + weight_->data, + k_desc_, + scales_zeros_, + q_desc_, + 0.0f, + out->data, + c_desc, + const_cast(out->data), + c_desc, + workspace_, + stream_); + + if (ec) { + // TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); + std::abort(); + } + } + void convert_qweight(void* workspace, std::shared_ptr weight, size_t input_dims, size_t output_dims, bool use_simt) { + // const auto workspace_size = input_dims * output_dims * sizeof(uint16_t); + // void *workspace {}; + // check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); + // std::cout << "where: " << weight->where << ", type: " << weight->type << ", shape: "; + // for (size_t i = 0; i < weight->shape.size(); ++i) { + // std::cout << weight->shape[i] << ", "; + // } + // std::cout << std::endl; + // std::vector _temp(weight->shape[0] * weight->shape[1]); + // 
cudaMemcpy(_temp.data(), weight->data, _temp.size() * sizeof(int), cudaMemcpyDeviceToHost); + // int row = 4095; + // for (size_t i = 0; i < _temp.size() && i < 100; i++) { + // std::cout << _temp[row * weight->shape[1] + i] << ", "; + // } + // cudaDeviceSynchronize(); + + using namespace gemm; + auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); + + + // std::cout << "oder_b: " << int(order_b) << ", input_dims: " << input_dims << ", output_dims: " << output_dims << std::endl; + if (order_b == kColMajor) { + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight->data, input_dims, output_dims); + cudaMemcpy(const_cast(weight->data), workspace, input_dims * output_dims / 2, cudaMemcpyDefault); + } + + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight->data, input_dims * output_dims); + sync_check_cuda_error(); + + if constexpr (0) { + std::vector tmp(input_dims * output_dims); + cudaMemcpy(tmp.data(), workspace, sizeof(uint16_t) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << *it << " "; + } + } + + MatrixLayout w_desc{ + gemm::DataType::F16, + order_b, + (int)input_dims, // k + (int)output_dims, // n + order_b == kRowMajor ? (int)output_dims : (int)input_dims, + }; + + k_desc_ = w_desc; + k_desc_.type = gemm::DataType::U4; + k_desc_.pack = pack_b; + + cudaMemset(const_cast(weight->data), 0, input_dims * output_dims / 2); + + TM_CHECK(Convert(workspace, w_desc, const_cast(weight->data), k_desc_, 0) == 0); + sync_check_cuda_error(); + + cudaDeviceSynchronize(); + + if constexpr (0) { + std::vector tmp(input_dims * output_dims / 8); + cudaMemcpy(tmp.data(), weight->data, sizeof(uint32_t) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + } + + weight_ = weight; + } + + void convert_scales_zeros(void* workspace, + std::shared_ptr scales, + std::shared_ptr qzeros, + size_t input_dims, + size_t output_dims, + int group_size, + bool use_simt) { + const auto scale_count = input_dims / group_size * output_dims; + + using namespace gemm; + auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); + + fuse_scales_and_zeros((half*)workspace, (const half*)scales->data, (half*)qzeros->data, scale_count); + sync_check_cuda_error(); + + cudaDeviceSynchronize(); + + check_cuda_error(cudaMalloc((half**)&scales_zeros_, scale_count * 2)); + + MatrixLayout s_desc{ + gemm::DataType::U32, + order_v, + (int)input_dims / group_size, // k + (int)output_dims, // n + (int)output_dims, + }; + + q_desc_ = s_desc; + q_desc_.pack = pack_v; + + TM_CHECK(Convert(workspace, s_desc, scales_zeros_, q_desc_, 0) == 0); + sync_check_cuda_error(); + + // if constexpr (0) { + // std::vector tmp(scale_count * 2); + // cudaMemcpy(tmp.data(), scales_zeros_, sizeof(half) * tmp.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + // // for (const auto& x: tmp) { + // // std::cout << (float)x << " "; + // // } + // int i = 0; + // for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + // 
std::cout << std::hex << *it << " "; + // } + // i = 0; + // std::cout << "\n"; + // for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + // std::cout << std::hex << *it << " "; + // } + // } + } + +private: + // cublasMMWrapper* cublas_wrapper_; + gemm::Gemm gemm_; + gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; + gemm::Workspace workspace_; + cudaStream_t stream_{}; + + size_t input_dims_; + size_t output_dims_; + int w_bit_; + int group_size_; + + std::shared_ptr weight_; + half* scales_zeros_; + + gemm::MatrixLayout k_desc_; + gemm::MatrixLayout q_desc_; +}; + +Linear::Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size) { + impl_ = std::make_shared(input_dims, output_dims, w_bit, group_size); +} + +void Linear::post_init(std::shared_ptr qweight, + std::shared_ptr scales, + std::shared_ptr qzeros, + bool simt) { + impl_->post_init(qweight, scales, qzeros, simt); +} + +void Linear::forward(std::shared_ptr in, std::shared_ptr out) +{ + impl_->forward(in, out); +} +} // namespace turbomind diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h new file mode 100644 index 0000000..ee56b3a --- /dev/null +++ b/src/turbomind/api/python/linear.h @@ -0,0 +1,50 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/utils/tensor.h" + +namespace turbomind { + + +enum class WeightType : int +{ + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +std::shared_ptr convert_qweight(std::shared_ptr qweight, + size_t input_dims, + size_t output_dims, + bool use_simt); +std::shared_ptr convert_scales_zeros(std::shared_ptr scales, + std::shared_ptr qzeros, + std::shared_ptr scales_zeros, + size_t input_dims, + size_t output_dims, + int group_size, + bool use_simt); + + +class Linear { +public: + Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); + void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, + bool simt); + void forward(std::shared_ptr in, std::shared_ptr out); + ~Linear() {} + +private: + struct Impl; + std::shared_ptr impl_; +}; +}; diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt index 5484719..6fc634d 100644 --- a/src/turbomind/kernels/gemm/CMakeLists.txt +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -34,7 +34,7 @@ target_compile_options(gemm2 PRIVATE set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -# if (BUILD_TEST) +if (BUILD_TEST) add_executable(gemm_test test/gemm_test.cu test/test_utils.cu @@ -61,4 +61,4 @@ set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) test/reference.cu) target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) endif () -# endif () +endif () diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h new file mode 100644 index 0000000..157fd8e --- /dev/null +++ b/src/turbomind/utils/cuda_utils.h @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// #include "src/turbomind/macro.h" +// #include "src/turbomind/utils/cuda_bf16_wrapper.h" +// #include "src/turbomind/utils/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SPARSITY_ENABLED +#include +#endif + +namespace turbomind { + +#define MAX_CONFIG_NUM 20 +#define COL32_ 32 +// workspace for cublas gemm : 32MB +#define CUBLAS_WORKSPACE_SIZE 33554432 + +typedef struct __align__(4) +{ + half x, y, z, w; +} +half4; + +/* **************************** type definition ***************************** */ + +enum CublasDataType +{ + FLOAT_DATATYPE = 0, + HALF_DATATYPE = 1, + BFLOAT16_DATATYPE = 2, + INT8_DATATYPE = 3, + FP8_DATATYPE = 4 +}; + +enum FtCudaDataType +{ + FP32 = 0, + FP16 = 1, + BF16 = 2, + INT8 = 3, + FP8 = 4 +}; + +enum class OperationType +{ + FP32, + FP16, + BF16, + INT8, + FP8 +}; + +/* **************************** debug tools ********************************* */ +static const char* _cudaGetErrorEnum(cudaError_t error) +{ + return cudaGetErrorString(error); +} + +static const char* _cudaGetErrorEnum(cublasStatus_t error) +{ + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return ""; +} + +template +void check(T result, char const* const func, const char* const file, int const line) +{ + if (result) { + throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + + file + ":" + std::to_string(line) + " \n"); + } +} + +#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) +#define check_cuda_error_2(val, file, line) check((val), #val, file, line) + +inline void syncAndCheck(const char* const file, int const line) +{ + // When FT_DEBUG_LEVEL=DEBUG, must check error + static char* level_name = std::getenv("TM_DEBUG_LEVEL"); + if (level_name != nullptr) { + static std::string level = std::string(level_name); + if (level == "DEBUG") { + cudaDeviceSynchronize(); + cudaError_t result = cudaGetLastError(); + if (result) { + throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + + " " + file + ":" + std::to_string(line) + " \n"); + } + // TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); + } + } + +#ifndef NDEBUG + cudaDeviceSynchronize(); + cudaError_t result = cudaGetLastError(); + if (result) { + throw 
std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + + file + ":" + std::to_string(line) + " \n"); + } +#endif +} + +#define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__) + +#define checkCUDNN(expression) \ + { \ + cudnnStatus_t status = (expression); \ + if (status != CUDNN_STATUS_SUCCESS) { \ + std::cerr << "Error on file " << __FILE__ << " line " << __LINE__ << ": " << cudnnGetErrorString(status) \ + << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } + +[[noreturn]] inline void throwRuntimeError(const char* const file, int const line, std::string const& info = "") +{ + throw std::runtime_error(std::string("[TM][ERROR] ") + info + " Assertion fail: " + file + ":" + + std::to_string(line) + " \n"); +} + +inline void myAssert(bool result, const char* const file, int const line, std::string const& info = "") +{ + if (!result) { + throwRuntimeError(file, line, info); + } +} + +#define TM_CHECK(val) myAssert(val, __FILE__, __LINE__) +#define TM_CHECK_WITH_INFO(val, info) \ + do { \ + bool is_valid_val = (val); \ + if (!is_valid_val) { \ + turbomind::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \ + } \ + } while (0) + +#define TM_THROW(info) throwRuntimeError(__FILE__, __LINE__, info) + +#ifdef SPARSITY_ENABLED +#define CHECK_CUSPARSE(func) \ + { \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + throw std::runtime_error(std::string("[TM][ERROR] CUSPARSE API failed at line ") \ + + std::to_string(__LINE__) + " in file " + __FILE__ + ": " \ + + cusparseGetErrorString(status) + " " + std::to_string(status)); \ + } \ + } +#endif + +/*************Time Handling**************/ +class CudaTimer { +private: + cudaEvent_t event_start_; + cudaEvent_t event_stop_; + cudaStream_t stream_; + +public: + explicit CudaTimer(cudaStream_t stream = 0) + { + stream_ = stream; + } + void start() + { + check_cuda_error(cudaEventCreate(&event_start_)); + check_cuda_error(cudaEventCreate(&event_stop_)); + check_cuda_error(cudaEventRecord(event_start_, stream_)); + } + float stop() + { + float time; + check_cuda_error(cudaEventRecord(event_stop_, stream_)); + check_cuda_error(cudaEventSynchronize(event_stop_)); + check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_)); + check_cuda_error(cudaEventDestroy(event_start_)); + check_cuda_error(cudaEventDestroy(event_stop_)); + return time; + } + ~CudaTimer() {} +}; + +/* ***************************** common utils ****************************** */ + +inline void print_mem_usage(std::string time = "after allocation") +{ + size_t free_bytes, total_bytes; + check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes)); + float free = static_cast(free_bytes) / 1024.0 / 1024.0 / 1024.0; + float total = static_cast(total_bytes) / 1024.0 / 1024.0 / 1024.0; + float used = total - free; + printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used); +} + +inline int getSMVersion() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int sm_major = 0; + int sm_minor = 0; + check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); + check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); + return sm_major * 10 + sm_minor; +} + +inline int getMaxSharedMemoryPerBlock() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int max_shared_memory_size = 0; + check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, 
cudaDevAttrMaxSharedMemoryPerBlock, device)); + return max_shared_memory_size; +} + +inline std::string getDeviceName() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + cudaDeviceProp props; + check_cuda_error(cudaGetDeviceProperties(&props, device)); + return std::string(props.name); +} + +inline int div_up(int a, int n) +{ + return (a + n - 1) / n; +} + +cudaError_t getSetDevice(int i_device, int* o_device = NULL); + +inline int getDevice() +{ + int current_dev_id = 0; + check_cuda_error(cudaGetDevice(¤t_dev_id)); + return current_dev_id; +} + +inline int getDeviceCount() +{ + int count = 0; + check_cuda_error(cudaGetDeviceCount(&count)); + return count; +} + +template +CublasDataType getCublasDataType() +{ + if (std::is_same::value) { + return HALF_DATATYPE; + } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + return BFLOAT16_DATATYPE; + } +#endif + else if (std::is_same::value) { + return FLOAT_DATATYPE; + } + else { + TM_CHECK(false); + return FLOAT_DATATYPE; + } +} + +template +cudaDataType_t getCudaDataType() +{ + if (std::is_same::value) { + return CUDA_R_16F; + } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + return CUDA_R_16BF; + } +#endif + else if (std::is_same::value) { + return CUDA_R_32F; + } + else { + TM_CHECK(false); + return CUDA_R_32F; + } +} + +template +struct getTypeFromCudaDataType { + using Type = float; +}; + +template<> +struct getTypeFromCudaDataType { + using Type = half; +}; + +#ifdef ENABLE_BF16 +template<> +struct getTypeFromCudaDataType { + using Type = __nv_bfloat16; +}; +#endif + +// clang-format off +template struct packed_type; +template <> struct packed_type { using type = float; }; // we don't need to pack float by default +template <> struct packed_type { using type = half2; }; + +#ifdef ENABLE_BF16 +template<> +struct packed_type<__nv_bfloat16> { + using type = __nv_bfloat162; +}; +#endif + +template struct num_elems; +template <> struct num_elems { static constexpr int value = 1; }; +template <> struct num_elems { static constexpr int value = 2; }; +template <> struct num_elems { static constexpr int value = 4; }; +template <> struct num_elems { static constexpr int value = 1; }; +template <> struct num_elems { static constexpr int value = 2; }; +#ifdef ENABLE_BF16 +template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; +template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; +#endif + +template struct packed_as; +template struct packed_as { using type = T; }; +template<> struct packed_as { using type = half2; }; +template<> struct packed_as { using type = float2; }; +template<> struct packed_as { using type = int16_t; }; +template<> struct packed_as { using type = int2; }; +template<> struct packed_as { using type = half; }; +#ifdef ENABLE_BF16 +template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; +template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; +#endif + +inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } +// clang-format on + +template +void compareTwoTensor( + const T1* pred, const T2* ref, const int size, const int print_size = 0, const std::string filename = "") +{ + T1* h_pred = new T1[size]; + T2* h_ref = new T2[size]; + check_cuda_error(cudaMemcpy(h_pred, pred, size * sizeof(T1), cudaMemcpyDeviceToHost)); + check_cuda_error(cudaMemcpy(h_ref, ref, size * 
sizeof(T2), cudaMemcpyDeviceToHost)); + + FILE* fd = nullptr; + if (filename != "") { + fd = fopen(filename.c_str(), "w"); + fprintf(fd, "| %10s | %10s | %10s | %10s | \n", "pred", "ref", "abs_diff", "rel_diff(%)"); + } + + if (print_size > 0) { + printf(" id | pred | ref |abs diff | rel diff (%) |"); + } + float mean_abs_diff = 0.0f; + float mean_rel_diff = 0.0f; + int count = 0; + for (int i = 0; i < size; i++) { + if (i < print_size) { + printf("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |", + i, + (float)h_pred[i], + (float)h_ref[i], + abs((float)h_pred[i] - (float)h_ref[i]), + abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); + } + if ((float)h_pred[i] == 0) { + continue; + } + count += 1; + mean_abs_diff += abs((float)h_pred[i] - (float)h_ref[i]); + mean_rel_diff += abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f; + + if (fd != nullptr) { + fprintf(fd, + "| %10.5f | %10.5f | %10.5f | %11.5f |\n", + (float)h_pred[i], + (float)h_ref[i], + abs((float)h_pred[i] - (float)h_ref[i]), + abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); + } + } + mean_abs_diff = mean_abs_diff / (float)count; + mean_rel_diff = mean_rel_diff / (float)count; + printf("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); + + if (fd != nullptr) { + fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); + fclose(fd); + } + delete[] h_pred; + delete[] h_ref; +} + +bool is_16xx_series(const char* name); + +/* ************************** end of common utils ************************** */ +} // namespace turbomind diff --git a/src/turbomind/utils/tensor.h b/src/turbomind/utils/tensor.h new file mode 100644 index 0000000..e765eeb --- /dev/null +++ b/src/turbomind/utils/tensor.h @@ -0,0 +1,86 @@ +#pragma once + + +#include +#include +#include +#include +#include +// #include +// #include +#include +#include + +namespace turbomind{ + +typedef enum datatype_enum +{ + TYPE_INVALID, + TYPE_BOOL, + TYPE_UINT8, + TYPE_UINT16, + TYPE_UINT32, + TYPE_UINT64, + TYPE_INT8, + TYPE_INT16, + TYPE_INT32, + TYPE_INT64, + TYPE_FP16, + TYPE_FP32, + TYPE_FP64, + TYPE_BYTES, + TYPE_BF16 +} DataType; + +typedef enum memorytype_enum +{ + MEMORY_CPU, + MEMORY_CPU_PINNED, + MEMORY_GPU +} MemoryType; + + +struct Tensor { + MemoryType where; + DataType type; + std::vector shape; + const void* data; + + Tensor(): where(MEMORY_CPU), type(TYPE_INVALID), shape({}), data(nullptr) {} + Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): + where(_where), type(_type), shape(_shape), data(_data) + { + } + + size_t size() const { + if (data == nullptr || shape.size() == 0) { + return 0; + } + return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies()); + } + + size_t sizeBytes() const { + return size() * typeSize(); + } + + size_t typeSize() const { + static const std::unordered_map type_map{{TYPE_BOOL, sizeof(bool)}, + {TYPE_BYTES, sizeof(char)}, + {TYPE_UINT8, sizeof(uint8_t)}, + {TYPE_UINT16, sizeof(uint16_t)}, + {TYPE_UINT32, sizeof(uint32_t)}, + {TYPE_UINT64, sizeof(uint64_t)}, + {TYPE_INT8, sizeof(int8_t)}, + {TYPE_INT16, sizeof(int16_t)}, + {TYPE_INT32, sizeof(int32_t)}, + {TYPE_INT64, sizeof(int64_t)}, +#ifdef ENABLE_BF16 + {TYPE_BF16, sizeof(__nv_bfloat16)}, +#endif + {TYPE_FP16, sizeof(half)}, + {TYPE_FP32, sizeof(float)}, + {TYPE_FP64, sizeof(double)}}; + return type_map.at(type); + } +}; +} // namespace turbomind 
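A quick way to sanity-check the new bindings is to round-trip a CUDA tensor
through the DLPack hooks exposed above. The snippet below is a minimal sketch,
not part of the patch: it assumes the turbomind_kernels extension has been
built and is importable (e.g. after adding turbomind/lib to sys.path, as
turbomind/linear.py does).

    import torch
    import turbomind_kernels as tk

    x = torch.arange(16, dtype=torch.float16, device='cuda')
    t = tk.from_dlpack(x)              # zero-copy view of x as a turbomind Tensor
    print(t.where, t.type, t.shape)    # expected: MEMORY_GPU, TYPE_FP16, [16]
    y = torch.from_dlpack(t)           # back to torch via __dlpack__/__dlpack_device__
    assert y.data_ptr() == x.data_ptr()  # same underlying CUDA buffer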
diff --git a/turbomind/__init__.py b/turbomind/__init__.py index ef101fe..5de11b0 100644 --- a/turbomind/__init__.py +++ b/turbomind/__init__.py @@ -1 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. + +from .linear import Linear + +__all__ = ['Linear'] diff --git a/turbomind/linear.py b/turbomind/linear.py index 0b322a1..3abdc59 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -1,8 +1,229 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging +import os.path as osp +import sys +from typing import List + import torch -# from . import utils +import turbomind + +turbomind_dir = osp.split(turbomind.__file__)[0] +sys.path.append(osp.join(turbomind_dir, 'lib')) + +try: + import turbomind_kernels + TURBOMIND_KERNELS_INSTALLED = True +except Exception as e: + logging.error(f'turbomind_kernels is not installed: {e}') + TURBOMIND_KERNELS_INSTALLED = False + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.uint8 + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +def transpose(x): + return x.t() if x is not None else x + + +def pad_out_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(-1) + assert pad >= 0 + return torch.nn.functional.pad(x, (0, pad), 'constant', 0) + + +def pad_in_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(0) + assert x.dim() == 2 + assert pad >= 0 + return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0) + + +def to_cuda(x: torch.Tensor, *args): + return x.cuda() + + +def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: + assert x.dtype == torch.int32 + xs = [] + for _ in range(8): + xs.append((x & 15).to(dtype)) + x = x >> 4 + return xs + + +def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: + xs = get_u4_slices(x, torch.uint8) + order = [0, 4, 1, 5, 2, 6, 3, 7] + ys = [xs[i] for i in order] + return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) + + +def process_awq_gemm(x: torch.Tensor, kind: str): + x = x.cuda() + if x.dtype == torch.int32: + x = unpack_awq_gemm(x) + if kind in ['qweight', 'qzeros', 'scales']: + x = x.t() + return x + + +def process_gptq(x: torch.Tensor, kind: str): + x = x.cuda() + if x.dtype == torch.int32: + xs = get_u4_slices(x, torch.uint8) + if kind == 'qweight': # (k/8,n) + x = torch.stack(xs, dim=1).view(-1, x.size(-1)) + else: # 'qzeros' (k/g,n/8) + x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 + if kind in ['qweight', 'qzeros', 'scales']: + x = x.t() + return x class Linear(torch.nn.Module): - pass + + def __init__(self, + in_features, + out_features, + bias: bool = False, + quant_method: str = '', + w_bit: int = 4, + group_size: int = 128, + device: str = 'cuda'): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError( + f'Only 4-bit is supported for now, but got {w_bit} bit') + if group_size != 128: + raise NotImplementedError( + f'Only group_size 128 is supported for now, ' + f'but got group_size {group_size}') + if bias: + raise NotImplementedError('bias has not been supported yet') + self.w_bit = w_bit + self.group_size = group_size + self.in_features = in_features + self.out_features = out_features + self.bias = bias + self.quant_method = quant_method + + # quick sanity check (make sure alignment) + assert self.in_features % self.group_size == 0 + assert self.out_features % (32 // self.w_bit) == 0 + + self.register_buffer( + 'qweight', + torch.zeros( + 
(in_features, out_features // (32 // self.w_bit)), + dtype=torch.int32, + device=device, + ), + ) + self.register_buffer( + 'qzeros', + torch.zeros( + (in_features // self.group_size, out_features // + (32 // self.w_bit)), + dtype=torch.int32, + device=device, + ), + ) + self.register_buffer( + 'scales', + torch.zeros( + (in_features // self.group_size, out_features), + dtype=torch.float16, + device=device, + ), + ) + + # if bias: + # self.register_buffer( + # 'bias', + # torch.zeros( + # (out_features), + # dtype=torch.float16, + # device=device, + # ), + # ) + # else: + # self.bias = None + + self.linear = turbomind_kernels.Linear(self.in_features, + self.out_features, self.w_bit, + self.group_size) + + def post_init(self): + assert self.qweight.device.type == 'cuda' + if self.quant_method == 'awq': + self.qweight = unpack_awq_gemm(self.qweight).t() + self.qzeros = unpack_awq_gemm(self.qzeros).t() + self.scales = self.scales.t() + elif self.quant_method == 'gptq': + xs = get_u4_slices(self.qweight, torch.uint8) + self.qweight = torch.stack(xs, dim=1).view(-1, + self.qweight.size(-1)) + xs = get_u4_slices(self.qzeros, torch.uint8) + self.qzeros = torch.stack(xs, dim=-1).view(self.qzeros.size(0), + -1) + 1 + self.qweight = self.qweight.t() + self.qzeros = self.qzeros.t() + self.scales = self.scales.t() + else: + return + + self.qweight = transpose(self.qweight) + self.qzeros = transpose(self.qzeros) + self.scales = transpose(self.scales) + + self.qweight = pack_u4_row(self.qweight) + self.qzeros = self.qzeros.to(torch.half) + + device_id = self.qweight.device.index + properties = torch.cuda.get_device_properties(device_id) + + def is_16xx_series(name): + import re + pattern = r'GTX 16\d\d' + return bool(re.search(pattern, name)) + + simt = is_16xx_series(properties.name) + self.qweight = self.qweight.contiguous() + self.scales = self.scales.contiguous() + self.qzeros = self.qzeros.contiguous() + self.linear.post_init(self.qweight, self.scales, self.qzeros, simt) + + @torch.no_grad() + def forward(self, x): + assert TURBOMIND_KERNELS_INSTALLED, ( + 'turbomind kernels are not installed. 
' + 'Please perform `pip install turbomind` to install turbomind ' + 'kernels.') + input_dtype = x.dtype + if input_dtype != torch.float16: + x = x.half() + x = x.view(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (self.out_features, ) + out = torch.empty( + (x.shape[0], self.out_features), + dtype=torch.float16, + device=x.device, + ) + self.linear.forward(x, out) + out = torch.from_dlpack(out) + print(out) + return out.view(out_shape) + + def __call__(self, x: torch.Tensor): + return self.forward(x) + + def to_half(x: torch.Tensor): + return x.to(torch.half) From e6eaafe42140de3fe3cf730f5dff50805e1e341e Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:25:47 +0800 Subject: [PATCH 06/18] remove example/awq --- example/awq/__init__.py | 0 example/awq/_config.py | 109 ------------- example/awq/act.py | 12 -- example/awq/auto.py | 64 -------- example/awq/base.py | 244 ----------------------------- example/awq/gemm.py | 287 ----------------------------------- example/awq/llama.py | 81 ---------- example/awq/module.py | 60 -------- example/awq/packing_utils.py | 97 ------------ example/awq/utils.py | 110 -------------- 10 files changed, 1064 deletions(-) delete mode 100644 example/awq/__init__.py delete mode 100644 example/awq/_config.py delete mode 100644 example/awq/act.py delete mode 100644 example/awq/auto.py delete mode 100644 example/awq/base.py delete mode 100644 example/awq/gemm.py delete mode 100644 example/awq/llama.py delete mode 100644 example/awq/module.py delete mode 100644 example/awq/packing_utils.py delete mode 100644 example/awq/utils.py diff --git a/example/awq/__init__.py b/example/awq/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/example/awq/_config.py b/example/awq/_config.py deleted file mode 100644 index a966e5e..0000000 --- a/example/awq/_config.py +++ /dev/null @@ -1,109 +0,0 @@ -import json -import os -from dataclasses import dataclass, field -from typing import Dict, List, Optional - -from transformers.utils.hub import PushToHubMixin, cached_file - - -@dataclass -class AwqConfig(PushToHubMixin): - quant_method: str = field(default='awq') - zero_point: bool = field(default=True) - q_group_size: int = field(default=128) - w_bit: int = field(default=4) - version: str = field(default='gemm') - config_file_name = 'config.json' - modules_to_not_convert: Optional[List] = None - - @classmethod - def from_dict(cls, quant_config: Dict = {}): - if not quant_config: - quant_config = cls() - else: - quant_config = cls(**quant_config) - quant_config.version = quant_config.version.lower() - - return quant_config - - @classmethod - def from_pretrained(cls, save_dir: str, **kwargs): - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - local_files_only = kwargs.pop('local_files_only', False) - use_auth_token = kwargs.pop('use_auth_token', None) - revision = kwargs.pop('revision', None) - subfolder = kwargs.pop('subfolder', None) - commit_hash = kwargs.pop('_commit_hash', None) - - if os.path.isdir(save_dir): # Local - resolved_config_file = os.path.join(save_dir, cls.config_file_name) - else: # Remote - resolved_config_file = cached_file( - save_dir, - cls.config_file_name, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - use_auth_token=use_auth_token, - revision=revision, - local_files_only=local_files_only, - subfolder=subfolder, - 
_raise_exceptions_for_missing_entries=False, - _raise_exceptions_for_connection_errors=False, - _commit_hash=commit_hash, - ) - - quant_config = None - if os.path.exists(resolved_config_file): - with open(resolved_config_file, 'r', encoding='utf-8') as file: - loaded_config = json.loads(file.read()) - - quant_config = loaded_config.get('quantization_config') - - if quant_config is not None: - awq_config = cls.from_transformers_dict(cls, quant_config) - quant_config = cls(**awq_config) - - if quant_config is None: - quant_config = cls() - - return quant_config - - def to_dict(self): - return { - 'zero_point': self.zero_point, - 'q_group_size': self.q_group_size, - 'w_bit': self.w_bit, - 'version': self.version, - 'modules_to_not_convert': self.modules_to_not_convert, - } - - def to_transformers_dict(self): - return { - 'quant_method': self.quant_method, - 'zero_point': self.zero_point, - 'group_size': self.q_group_size, - 'bits': self.w_bit, - 'version': self.version.lower(), - 'modules_to_not_convert': self.modules_to_not_convert, - } - - def from_transformers_dict(self, transformers_dict: Dict): - return { - 'quant_method': - transformers_dict.get('quant_method'), - 'zero_point': - transformers_dict.get('zero_point'), - 'q_group_size': - transformers_dict.get('group_size'), - 'w_bit': - transformers_dict.get('bits'), - 'version': - transformers_dict.get('version'), - 'modules_to_not_convert': - transformers_dict.get('modules_to_not_convert'), - } diff --git a/example/awq/act.py b/example/awq/act.py deleted file mode 100644 index 59ca7c4..0000000 --- a/example/awq/act.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch.nn as nn - - -class ScaledActivation(nn.Module): - - def __init__(self, module, scales): - super().__init__() - self.act = module - self.scales = nn.Parameter(scales.data) - - def forward(self, x): - return self.act(x) / self.scales.view(1, 1, -1).to(x.device) diff --git a/example/awq/auto.py b/example/awq/auto.py deleted file mode 100644 index a18db0c..0000000 --- a/example/awq/auto.py +++ /dev/null @@ -1,64 +0,0 @@ -import logging -import os - -from transformers import AutoConfig - -from .base import BaseAWQForCausalLM -from .llama import LlamaAWQForCausalLM - -AWQ_CAUSAL_LM_MODEL_MAP = { - 'llama': LlamaAWQForCausalLM, -} - - -def check_and_get_model_type(model_dir, **model_init_kwargs): - config = AutoConfig.from_pretrained(model_dir, - trust_remote_code=True, - **model_init_kwargs) - if config.model_type not in AWQ_CAUSAL_LM_MODEL_MAP.keys(): - raise TypeError(f"{config.model_type} isn't supported yet.") - model_type = config.model_type - return model_type - - -class AutoAWQForCausalLM: - - def __init__(self): - raise EnvironmentError( - 'You must instantiate AutoAWQForCausalLM with\n' - 'AutoAWQForCausalLM.from_quantized or AutoAWQForCausalLM.' - 'from_pretrained') - - @classmethod - def from_quantized( - self, - quant_path, - max_seq_len=2048, - fuse_layers=True, - batch_size=1, - device_map='balanced', - max_memory=None, - offload_folder=None, - download_kwargs=None, - **config_kwargs, - ) -> BaseAWQForCausalLM: - os.environ['AWQ_BATCH_SIZE'] = str(batch_size) - model_type = check_and_get_model_type(quant_path) - - if config_kwargs.get('max_new_tokens') is not None: - max_seq_len = config_kwargs['max_new_tokens'] - logging.warning( - 'max_new_tokens argument is deprecated... 
gracefully ' - 'setting max_seq_len=max_new_tokens.') - - return AWQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized( - quant_path, - model_type, - max_seq_len, - fuse_layers=fuse_layers, - device_map=device_map, - max_memory=max_memory, - offload_folder=offload_folder, - download_kwargs=download_kwargs, - **config_kwargs, - ) diff --git a/example/awq/base.py b/example/awq/base.py deleted file mode 100644 index 9a815c4..0000000 --- a/example/awq/base.py +++ /dev/null @@ -1,244 +0,0 @@ -import gc -from typing import Dict, Union - -import torch -import torch.nn as nn -import transformers -from accelerate.big_modeling import (init_empty_weights, - load_checkpoint_and_dispatch) -from tqdm import tqdm -from transformers import AutoConfig, PretrainedConfig, PreTrainedModel -from typing_extensions import Annotated, Doc - -from ._config import AwqConfig -from .act import ScaledActivation -from .gemm import WQLinear_GEMM -from .module import (exclude_layers_to_not_quantize, get_named_linears, - set_op_by_name) - -# from turbomind import Linear -# from turbomind.utils import turbomind_post_init - -# Since we support different `AutoModelForxxx` from transformers -# we need to define a custom mapping dict as below: -TRANSFORMERS_AUTO_MAPPING_DICT = { - 'llama': 'AutoModelForCausalLM', -} - - -class BaseAWQForCausalLM(nn.Module): - - def __init__( - self, - model, - model_type, - is_quantized, - config, - quant_config, - ): - """The base model for all AutoAWQ models. - - Args: - model: The pretrained or quantized model. - model_type: The model type, found in config.json. - is_quantized: Indicates if the current model is quantized - config: The config of the model. - quant_config: The quantization config of the model. - """ - super().__init__() - self.model: PreTrainedModel = model - self.model_type: str = model_type - self.is_quantized: bool = is_quantized - self.search_result = None - self.config: PretrainedConfig = config - self.quant_config: AwqConfig = quant_config - - def to(self, device: Annotated[str, - Doc('The device to move your model to.')]): - """A utility function for moving the model to a device.""" - return self.model.to(device) - - def forward(self, *args, **kwargs): - """A forward function that mimics the torch forward.""" - return self.model(*args, **kwargs) - - def generate(self, *args, **kwargs): - """A generate function that mimics the HF generate function.""" - with torch.inference_mode(): - return self.model.generate(*args, **kwargs) - - # @staticmethod - # def fuse_layers(model): - # pass - - @classmethod - def from_quantized(self, - model_path: str, - model_type: str, - max_seq_len: int, - torch_dtype: torch.dtype = torch.float16, - device_map: Union[str, Dict] = 'balanced', - **config_kwargs: Dict): - """A method for initialization of a quantized model, usually in INT4. - - Args: - model_path (str): The model path - model_type (str): The model type, loaded from config.json. - max_seq_len (int): The maximum sequence cached sequence length of - the model. Larger values may increase loading time and - memory usage. - torch_dtype: The dtype to load the model as. May not work with - other values than float16. - device_map: A device map that will be passed onto the model - loading method from transformers. 
- **config_kwargs: Additional kwargs that are passed to the config - during initialization - """ - # [STEP 1-2] Load weights path and configs - model_weights_path, config, quant_config = self._load_config( - self, - model_path, - max_seq_len=max_seq_len, - **config_kwargs, - ) - - target_cls_name = TRANSFORMERS_AUTO_MAPPING_DICT[config.model_type] - target_cls = getattr(transformers, target_cls_name) - - # [STEP 3] Load model - with init_empty_weights(): - model = target_cls.from_config( - config=config, - torch_dtype=torch_dtype, - trust_remote_code=True, - ) - # Prepare WQLinear layers, replace nn.Linear - self._load_quantized_modules( - self, - model, - quant_config, - quant_config.version, - use_exllama=False, - use_exllama_v2=False, - use_qbits=False, - ) - - model.tie_weights() - - # loads the weights into modules and distributes - # across available devices automatically - load_checkpoint_and_dispatch( - model, - checkpoint=model_weights_path, - device_map=device_map, - no_split_module_classes=[self.layer_type], - dtype=torch_dtype, - ) - - # TODO - # model = turbomind_post_init(model) - - # # Dispatch to devices - # if fuse_layers: - # self.fuse_layers(model) - - model.eval() - - return self( - model, - model_type, - is_quantized=True, - config=config, - quant_config=quant_config, - ) - - def _load_config( - self, - model_path, - max_seq_len=4096, - **config_kwargs, - ): - # [STEP 2] Load config and set sequence length - # TODO: Create BaseAWQConfig class - quant_config = AwqConfig.from_pretrained(model_path) - - # Load model config and set max generation length - if max_seq_len is None and hasattr(self, 'max_seq_len_key'): - config = AutoConfig.from_pretrained(model_path, - trust_remote_code=True, - **config_kwargs) - config.max_seq_len = getattr(config, self.max_seq_len_key, 2048) - # To add the generate support for Multi-modal models as well - if hasattr(config, 'text_config'): - config.text_config.max_seq_len = getattr( - config, self.max_seq_len_key, 2048) - else: - max_seq_len = 2048 if max_seq_len is None else max_seq_len - config = AutoConfig.from_pretrained(model_path, - trust_remote_code=True, - **config_kwargs) - config.max_seq_len = max_seq_len - - return model_path, config, quant_config - - def _load_quantized_modules(self, - model, - quant_config, - version, - use_exllama, - use_exllama_v2, - use_qbits=False): - # Real quantization of weights - assert not (version == 'gemv' and - (use_exllama or use_exllama_v2 or - use_qbits)), 'Exllama kernels only support GEMM version.' 
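The loop that follows walks each decoder block from get_model_layers, collects its nn.Linear children with get_named_linears, drops anything listed in quant_config.modules_to_not_convert, and swaps every remaining linear for a WQLinear_GEMM built by from_linear(..., init_only=True) before the checkpoint is dispatched. A minimal, self-contained sketch of that find-and-swap pattern follows; DummyQuantLinear and swap_linears are illustrative stand-ins, not part of the AWQ or turbomind APIs.

import torch.nn as nn


class DummyQuantLinear(nn.Module):
    """Illustrative stand-in for a quantized linear (e.g. WQLinear_GEMM)."""

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.in_features = linear.in_features
        self.out_features = linear.out_features

    def forward(self, x):
        # a real implementation would run the packed int4 GEMM here
        return x.new_zeros(*x.shape[:-1], self.out_features)


def swap_linears(block: nn.Module, skip=()):
    """Replace every nn.Linear in `block` unless its name matches `skip`."""
    for name, module in list(block.named_modules()):
        if isinstance(module, nn.Linear) and not any(k in name for k in skip):
            parent_name, _, child_name = name.rpartition('.')
            parent = block.get_submodule(parent_name) if parent_name else block
            setattr(parent, child_name, DummyQuantLinear(module))


block = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
swap_linears(block)
print(block)  # both Linear layers are now DummyQuantLinear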
- - # Get blocks of model - layers = self.get_model_layers(model) - - for i in tqdm(range(len(layers)), desc='Replacing layers...'): - layer = layers[i] - - # Get every linear layer in a block - named_linears = get_named_linears(layer) - - # Filter out the linear layers we don't want to include - named_linears = exclude_layers_to_not_quantize( - named_linears, quant_config.modules_to_not_convert) - - # Replace activation functions - self._scale_activations(self, layer) - - # Replace nn.Linear with WQLinear - for name, module in named_linears.items(): - assert version == 'gemm' - - q_linear_module = WQLinear_GEMM - # q_linear_module = Linear - q_linear = q_linear_module.from_linear( - module, quant_config.w_bit, quant_config.q_group_size, - True) - q_linear.to(next(layer.parameters()).device) - set_op_by_name(layer, name, q_linear) - - if not use_qbits: - torch.cuda.empty_cache() - gc.collect() - - @staticmethod - def _scale_activations(self, layer): - scale_dict = self.get_act_for_scaling(layer) - - if scale_dict['is_scalable']: - if not isinstance(scale_dict['scale_layer'], ScaledActivation): - param = next(layer.parameters()) - - # get activation scale - scale_like = torch.ones(scale_dict['scale_shape'], - dtype=param.dtype, - device=param.device) - - # scale activation - scaled_act = ScaledActivation(scale_dict['scale_layer'], - scale_like) - set_op_by_name(layer, scale_dict['scale_name'], scaled_act) diff --git a/example/awq/gemm.py b/example/awq/gemm.py deleted file mode 100644 index 664b4f5..0000000 --- a/example/awq/gemm.py +++ /dev/null @@ -1,287 +0,0 @@ -import warnings - -import torch -import torch.nn as nn -from torch.autograd import Function - -from .packing_utils import dequantize_gemm -from .utils import get_best_device - -try: - import awq_ext # with CUDA kernels (AutoAWQ_kernels) - - AWQ_INSTALLED = True -except Exception as ex: - AWQ_INSTALLED = False - warnings.warn( - f'AutoAWQ could not load GEMM kernels extension. Details: {ex}') - - -# Adapted from https://github.com/compressa-ai/AutoAWQ/tree/dev -class WQLinearMMFunction(Function): - - @staticmethod - # ctx is the first argument to forward - def forward( - ctx, - x, - qweight, - qzeros, - scales, - w_bit=4, - group_size=128, - bias=None, - out_features=0, - ): - # The forward pass can use ctx. - ctx.save_for_backward(x, qweight, qzeros, scales, bias) - ctx.out_features = out_features - - out_shape = x.shape[:-1] + (out_features, ) - x = x.to(torch.float16) - - if AWQ_INSTALLED: - FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 1024 - - if FP16_MATMUL_HEURISTIC_CONDITION: - out = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, - 0, 0, 0, False) - out = torch.matmul(x, out) - else: - out = awq_ext.gemm_forward_cuda(x.reshape(-1, x.shape[-1]), - qweight, scales, qzeros, 8) - else: - out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) - out = torch.matmul(x, out) - - out = out + bias if bias is not None else out - out = out.reshape(out_shape) - - # always want 3D tensor if tensor is 2D - if len(out.shape) == 2: - out = out.unsqueeze(0) - - return out - - @staticmethod - def backward(ctx, grad_output): - input, qweight, qzeros, scales, bias = ctx.saved_tensors - - if not AWQ_INSTALLED: - raise ValueError( - 'auto-awq kernels is needed to be installed to ' - 'use `.backward()`. 
Make sure to install the auto-awq kernels' - ' by following the installation guides in ' - 'https://github.com/casper-hansen/AutoAWQ_kernels') - - # Cast to correct dtype for mixed precision training - weights = awq_ext.dequantize_weights_cuda(qweight, scales, qzeros, 1, - 0, 0, - False).to(grad_output.dtype) - - if ctx.needs_input_grad[0]: - # 3D matmul using torch.bmm: - # https://pytorch.org/docs/stable/generated/torch.bmm.html#torch.bmm # noqa - # to propagate gradient across all batch sizes. - batch_size = grad_output.shape[0] - grad_input = grad_output.bmm( - weights.transpose(0, 1).unsqueeze(0).repeat(batch_size, 1, 1)) - - return grad_input, None, None, None, None, None, None, None - - -class WQLinear_GEMM(nn.Module): - - def __init__(self, - w_bit, - group_size, - in_features, - out_features, - bias, - dev, - training=False): - super().__init__() - - if w_bit not in [4]: - raise NotImplementedError('Only 4-bit are supported for now.') - - self.in_features = in_features - self.out_features = out_features - self.w_bit = w_bit - self.group_size = group_size if group_size != -1 else in_features - self.training = training - - # quick sanity check (make sure alignment) - assert self.in_features % self.group_size == 0 - assert out_features % (32 // self.w_bit) == 0 - - self.register_buffer( - 'qweight', - torch.zeros( - (in_features, out_features // (32 // self.w_bit)), - dtype=torch.int32, - device=dev, - ), - ) - self.register_buffer( - 'qzeros', - torch.zeros( - (in_features // self.group_size, out_features // - (32 // self.w_bit)), - dtype=torch.int32, - device=dev, - ), - ) - self.register_buffer( - 'scales', - torch.zeros( - (in_features // self.group_size, out_features), - dtype=torch.float16, - device=dev, - ), - ) - if bias: - self.register_buffer( - 'bias', - torch.zeros( - (out_features), - dtype=torch.float16, - device=dev, - ), - ) - else: - self.bias = None - - @classmethod - def from_linear(cls, - linear, - w_bit, - group_size, - init_only=False, - scales=None, - zeros=None): - awq_linear = cls( - w_bit, - group_size, - linear.in_features, - linear.out_features, - linear.bias is not None, - linear.weight.device, - ) - if init_only: # just prepare for loading sd - return awq_linear - - # need scales and zeros info for real quantization - assert scales is not None and zeros is not None - scale_zeros = zeros * scales - - awq_linear.scales = scales.clone().half() - if linear.bias is not None: - awq_linear.bias = linear.bias.clone().half() - - pack_num = 32 // awq_linear.w_bit - - intweight = [] - for idx in range(awq_linear.in_features): - intweight.append( - torch.round( - (linear.weight.data[:, idx] + - scale_zeros[idx // group_size]) / - awq_linear.scales[idx // group_size]).to(torch.int)[:, - None]) - intweight = torch.cat(intweight, dim=1) - intweight = intweight.t().contiguous() - intweight = intweight.to(dtype=torch.int32) - - best_device = get_best_device() - - # Avoid: The operator 'aten::__lshift__.Scalar' is not currently - # implemented for the MPS device - if 'mps' in best_device: - intweight = intweight.to('cpu') - - qweight = torch.zeros( - (intweight.shape[0], intweight.shape[1] // 32 * awq_linear.w_bit), - dtype=torch.int32, - device=intweight.device, - ) - - for col in range(intweight.shape[1] // pack_num): - if awq_linear.w_bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - qweight_col = intweight[:, col * pack_num + order_map[i]] - qweight[:, col] |= 
qweight_col << (i * awq_linear.w_bit) - awq_linear.qweight = qweight - - zeros = zeros.to(dtype=torch.int32, device=best_device) - - if 'mps' in best_device: - zeros = zeros.to('cpu') - - qzeros = torch.zeros( - (zeros.shape[0], zeros.shape[1] // 32 * awq_linear.w_bit), - dtype=torch.int32, - device=zeros.device, - ) - - for col in range(zeros.shape[1] // pack_num): - if awq_linear.w_bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - qzero_col = zeros[:, col * pack_num + order_map[i]] - qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit) - awq_linear.qzeros = qzeros - - return awq_linear - - def forward(self, x): - out_shape = x.shape[:-1] + (self.out_features, ) - - input_dtype = x.dtype - if input_dtype != torch.float16: - x = x.half() - - if self.training: - out = WQLinearMMFunction.apply( - x, - self.qweight, - self.qzeros, - self.scales, - self.w_bit, - self.group_size, - self.bias, - self.out_features, - ) - else: - with torch.no_grad(): - out = WQLinearMMFunction.apply( - x, - self.qweight, - self.qzeros, - self.scales, - self.w_bit, - self.group_size, - self.bias, - self.out_features, - ) - - if input_dtype != torch.float16: - out = out.to(dtype=input_dtype) - - return out.reshape(out_shape) - - def extra_repr(self) -> str: - return ( - 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}' - .format( - self.in_features, - self.out_features, - self.bias is not None, - self.w_bit, - self.group_size, - )) diff --git a/example/awq/llama.py b/example/awq/llama.py deleted file mode 100644 index f221e48..0000000 --- a/example/awq/llama.py +++ /dev/null @@ -1,81 +0,0 @@ -# from awq.utils.fused_utils import fuse_qkv -# from awq.modules.fused.block import LlamaLikeBlock -# from awq.modules.fused.model import LlamaLikeModel -from transformers.models.llama.modeling_llama import \ - LlamaDecoderLayer as OldLlamaDecoderLayer -from transformers.models.llama.modeling_llama import \ - LlamaForCausalLM as OldLlamaForCausalLM - -from .base import BaseAWQForCausalLM - -# from awq.modules.fused.norm import FasterTransformerRMSNorm - - -class LlamaAWQForCausalLM(BaseAWQForCausalLM): - layer_type = 'LlamaDecoderLayer' - max_seq_len_key = 'max_position_embeddings' - - # @staticmethod - # def fuse_layers(model: OldLlamaForCausalLM): - # fuser = LlamaFuser(model) - # fuser.fuse_transformer() - - @staticmethod - def get_model_layers(model: OldLlamaForCausalLM): - return model.model.layers - - @staticmethod - def get_act_for_scaling(module: OldLlamaDecoderLayer): - return dict(is_scalable=False) - - @staticmethod - def move_embed(model: OldLlamaForCausalLM, device: str): - model.model.embed_tokens = model.model.embed_tokens.to(device) - - @staticmethod - def get_layers_for_scaling(module: OldLlamaDecoderLayer, input_feat, - module_kwargs): - layers = [] - - # attention input - layers.append( - dict( - prev_op=module.input_layernorm, - layers=[ - module.self_attn.q_proj, - module.self_attn.k_proj, - module.self_attn.v_proj, - ], - inp=input_feat['self_attn.q_proj'], - module2inspect=module.self_attn, - kwargs=module_kwargs, - )) - - # attention out - # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 # noqa - if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: # noqa - layers.append( - dict( - prev_op=module.self_attn.v_proj, - layers=[module.self_attn.o_proj], - inp=input_feat['self_attn.o_proj'], - )) - - # linear 1 - 
layers.append( - dict( - prev_op=module.post_attention_layernorm, - layers=[module.mlp.gate_proj, module.mlp.up_proj], - inp=input_feat['mlp.gate_proj'], - module2inspect=module.mlp, - )) - - # linear 2 - layers.append( - dict( - prev_op=module.mlp.up_proj, - layers=[module.mlp.down_proj], - inp=input_feat['mlp.down_proj'], - )) - - return layers diff --git a/example/awq/module.py b/example/awq/module.py deleted file mode 100644 index aeefb66..0000000 --- a/example/awq/module.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch.nn as nn - - -def get_named_linears(module): - return { - name: m - for name, m in module.named_modules() if isinstance(m, nn.Linear) - } - - -def get_op_by_name(module, op_name): - # get the op by its name relative to the module - for name, m in module.named_modules(): - if name == op_name: - return m - raise ValueError(f'Cannot find op {op_name} in module {module}') - - -def set_op_by_name(layer, name, new_module): - levels = name.split('.') - if len(levels) > 1: - mod_ = layer - for l_idx in range(len(levels) - 1): - if levels[l_idx].isdigit(): - mod_ = mod_[int(levels[l_idx])] - else: - mod_ = getattr(mod_, levels[l_idx]) - setattr(mod_, levels[-1], new_module) - else: - setattr(layer, name, new_module) - - -def get_op_name(module, op): - # get the name of the op relative to the module - for name, m in module.named_modules(): - if m is op: - return name - raise ValueError(f'Cannot find op {op} in module {module}') - - -def append_str_prefix(x, prefix): - if isinstance(x, str): - return prefix + x - elif isinstance(x, tuple): - return tuple([append_str_prefix(y, prefix) for y in x]) - elif isinstance(x, list): - return [append_str_prefix(y, prefix) for y in x] - else: - return x - - -def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): - if modules_to_not_convert is None: - return linear_layers - - filtered_layers = {} - for name, linear_layer in linear_layers.items(): - if not any(key in name for key in modules_to_not_convert): - filtered_layers[name] = linear_layer - return filtered_layers diff --git a/example/awq/packing_utils.py b/example/awq/packing_utils.py deleted file mode 100644 index a9724fa..0000000 --- a/example/awq/packing_utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch - -AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] -AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] - - -def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): - shifts = torch.arange(0, 32, bits, device=qzeros.device) - - # unpacking columnwise - iweights = torch.bitwise_right_shift( - qweight[:, :, None], - shifts[None, None, :]).to(torch.int8 # smallest dtype available - ) - iweights = iweights.view(iweights.shape[0], -1) - - # unpacking columnwise - if qzeros is not None: - izeros = torch.bitwise_right_shift( - qzeros[:, :, None], - shifts[None, None, :]).to(torch.int8 # smallest dtype available - ) - izeros = izeros.view(izeros.shape[0], -1) - else: - izeros = qzeros - - return iweights, izeros - - -def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): - reverse_order_tensor = torch.arange( - iweights.shape[-1], - dtype=torch.int32, - device=izeros.device, - ) - reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) - reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] - reverse_order_tensor = reverse_order_tensor.view(-1) - - if izeros is not None: - izeros = izeros[:, reverse_order_tensor] - iweights = iweights[:, reverse_order_tensor] - - return iweights, izeros - - -def pack_exllama(iweights: 
torch.Tensor, izeros: torch.Tensor, bits: int): - shifts = torch.arange(0, 32, bits, device=iweights.device) - - # packing rowwise - iweights = iweights.view(iweights.shape[0] // (32 // bits), 32 // bits, -1) - qweight = (torch.bitwise_left_shift( - iweights, shifts[None, :, None]).sum(dim=1).to(torch.int32)) - - # packing columnwise - izeros = izeros.view(-1, izeros.shape[1] // (32 // bits), 32 // bits) - qzeros = (torch.bitwise_left_shift( - izeros, shifts[None, None, :]).sum(dim=-1).to(torch.int32)) - - return qweight, qzeros - - -def unpack_reorder_pack(qweight, qzeros, bits): - # Unpack the qweight and qzeros tensors - iweight, izeros = unpack_awq(qweight, qzeros, bits) - # Reverse the order of the iweight and izeros tensors - iweight, izeros = reverse_awq_order(iweight, izeros, bits) - - # overflow checks - iweight = torch.bitwise_and(iweight, (2**bits) - 1) - izeros = torch.bitwise_and(izeros, (2**bits) - 1) - - # Subtract 1 from the izeros tensor (exllama adds 1 during inference) - # We can remove it if we remove the +1 in the exllama code - izeros = izeros - 1 - # Pack the qweight and qzeros tensors - qweight, qzeros = pack_exllama(iweight, izeros, bits) - - return qweight, qzeros - - -def dequantize_gemm(qweight, qzeros, scales, bits, group_size): - # Unpack the qweight and qzeros tensors - iweight, izeros = unpack_awq(qweight, qzeros, bits) - # Reverse the order of the iweight and izeros tensors - iweight, izeros = reverse_awq_order(iweight, izeros, bits) - - # overflow checks - iweight = torch.bitwise_and(iweight, (2**bits) - 1) - izeros = torch.bitwise_and(izeros, (2**bits) - 1) - - # fp16 weights - scales = scales.repeat_interleave(group_size, dim=0) - izeros = izeros.repeat_interleave(group_size, dim=0) - iweight = (iweight - izeros) * scales - - return iweight diff --git a/example/awq/utils.py b/example/awq/utils.py deleted file mode 100644 index 3ba594c..0000000 --- a/example/awq/utils.py +++ /dev/null @@ -1,110 +0,0 @@ -import gc -import importlib - -import accelerate -import torch - -qbits_available = importlib.util.find_spec( - 'intel_extension_for_transformers') is not None - - -def get_module_by_name_suffix(model, module_name: str): - for name, module in model.named_modules(): - if name.endswith(module_name): - return module - - -def simple_dispatch_model(model, device_map): - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - if '' in device_map: - d = device_map[''] - model = model.to(torch.device(d)) - model.hf_device_map = device_map - return model - - tied_params = accelerate.utils.modeling.find_tied_parameters(model) - if set(device_map.values()) == {'cpu'} or set(device_map.values()) == { - 'cpu', - 'disk', - }: - main_device = 'cpu' - else: - main_device = [ - d for d in device_map.values() if d not in ['cpu', 'disk'] - ][0] - - cpu_offload_group = [(n, d) for n, d in device_map.items() if d == 'cpu'] - prev_hook = None - for idx, (n, d) in enumerate(cpu_offload_group): - m = get_module_by_name_suffix(model, n) - _, prev_hook = accelerate.cpu_offload_with_hook( - m, execution_device=main_device, prev_module_hook=prev_hook) - # set first cpu offload module's prev_module_hook - # to the last cpu offload module's hook - if len(cpu_offload_group) > 1: - get_module_by_name_suffix( - model, - cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook - - for n, d in device_map.items(): - m = get_module_by_name_suffix(model, n) - if d != 'cpu': - d = torch.device(d) - hook = AlignDevicesHook(d, - io_same_device=True, - place_submodules=True) - 
add_hook_to_module(m, hook) - accelerate.utils.modeling.retie_parameters(model, tied_params) - model.hf_device_map = device_map - - return model - - -def set_module_name(model, name, value): - if '.' in name: - parent_name = name.rsplit('.', 1)[0] - child_name = name[len(parent_name) + 1:] - parent = model.get_submodule(parent_name) - else: - parent_name = '' - parent = model - child_name = name - - setattr(parent, child_name, value) - - -def clear_memory(weight=None): - if weight is not None: - del weight - gc.collect() - torch.cuda.empty_cache() - - -def compute_memory_used_pct(device): - memory_used = torch.cuda.max_memory_allocated(device) / (1024**3) - memory_pct = (memory_used / - (torch.cuda.get_device_properties(device).total_memory / - (1024**3)) * 100) - return memory_pct - - -def get_best_device(): - if torch.backends.mps.is_available(): - return 'mps' - elif torch.cuda.is_available(): - return 'cuda:0' - else: - return 'cpu' - - -def get_lowest_memory_device_index(): - device = None - curr_device_memory_pct = 0 - for device_index in range(torch.cuda.device_count()): - device_memory_pct = compute_memory_used_pct(device_index) - if device is None or device_memory_pct < curr_device_memory_pct: - device = device_index - curr_device_memory_pct = device_memory_pct - - return device From a8c64c4095d5b0f9d49a6227f349da6a64207239 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:31:09 +0800 Subject: [PATCH 07/18] remove useless code --- example/generate.py | 44 ------------------------------ src/turbomind/api/python/bind.cpp | 22 --------------- src/turbomind/api/python/linear.cc | 20 +------------- 3 files changed, 1 insertion(+), 85 deletions(-) delete mode 100644 example/generate.py diff --git a/example/generate.py b/example/generate.py deleted file mode 100644 index cca53ab..0000000 --- a/example/generate.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from awq.auto import AutoAWQForCausalLM -from transformers import AutoTokenizer, TextStreamer - -quant_path = '/mnt/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ' - -# Load model -model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True) - -tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) -streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - -prompt = "You're standing on the surface of the Earth. "\ - 'You walk one mile south, one mile west and one mile north. '\ - 'You end up exactly where you started. Where are you?' - -chat = [ - { - 'role': 'system', - 'content': 'You are a concise assistant that helps answer questions.' 
- }, - { - 'role': 'user', - 'content': prompt - }, -] - -terminators = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids('<|eot_id|>') -] - -tokens = tokenizer.apply_chat_template(chat, return_tensors='pt') -tokens = tokens.to(torch.device('cuda')) - -# Generate output -generation_output = model.generate(tokens, - streamer=streamer, - max_new_tokens=64, - eos_token_id=terminators) - -res = tokenizer.decode(generation_output[0].cpu().numpy().tolist()) -print(f'token_ids: {generation_output}') -print(f'output: {res}') diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index bd81fd9..409fc5d 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -324,28 +324,6 @@ PYBIND11_MODULE(turbomind_kernels, m) { return ret; }, "dl_managed_tensor"_a); - // m.def("convert_qweight", [](py::object qweight, int input_dims, int output_dims, bool simt) { - // py::capsule cap = qweight.attr("__dlpack__")(); - // DLManagedTensor* dlmt = - // static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); - // auto _qweight = DLManagedTensorToTurbomindTensor(dlmt); - // _qweight = convert_qweight(_qweight, input_dims, output_dims, simt); - // return *_qweight; - // }); - // m.def("convert_scales_zeros", - // [](py::object scales, py::object qzeros, py::object scales_zeros, int input_dims, int output_dims, int group_size, bool simt) { - // auto cap_scales = scales.attr("__dlpack__")(); - // auto cap_zeros = qzeros.attr("__dlpack__")(); - // auto cap_scales_zeros = scales_zeros.attr("__dlpack__")(); - // auto dlmt_scales = static_cast(PyCapsule_GetPointer(cap_scales.ptr(), kDlTensorCapsuleName)); - // auto dlmt_zeros = static_cast(PyCapsule_GetPointer(cap_zeros.ptr(), kDlTensorCapsuleName)); - // auto dlmt_scales_zeros = static_cast(PyCapsule_GetPointer(cap_scales_zeros.ptr(), kDlTensorCapsuleName)); - // auto _scales = DLManagedTensorToTurbomindTensor(dlmt_scales); - // auto _zeros = DLManagedTensorToTurbomindTensor(dlmt_zeros); - // auto _scales_zeros = DLManagedTensorToTurbomindTensor(dlmt_scales_zeros); - // _scales_zeros = convert_scales_zeros(_scales, _zeros, _scales_zeros, input_dims, output_dims, group_size, simt); - // return *_scales_zeros; - // }); // Instantiate turbomind::Linear py::class_>(m, "Linear") diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index 8c68cdd..cfb1202 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -109,32 +109,14 @@ struct Linear::Impl { stream_); if (ec) { - // TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); + printf("%s: %d", __PRETTY_FUNCTION__, ec); std::abort(); } } void convert_qweight(void* workspace, std::shared_ptr weight, size_t input_dims, size_t output_dims, bool use_simt) { - // const auto workspace_size = input_dims * output_dims * sizeof(uint16_t); - // void *workspace {}; - // check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); - // std::cout << "where: " << weight->where << ", type: " << weight->type << ", shape: "; - // for (size_t i = 0; i < weight->shape.size(); ++i) { - // std::cout << weight->shape[i] << ", "; - // } - // std::cout << std::endl; - // std::vector _temp(weight->shape[0] * weight->shape[1]); - // cudaMemcpy(_temp.data(), weight->data, _temp.size() * sizeof(int), cudaMemcpyDeviceToHost); - // int row = 4095; - // for (size_t i = 0; i < _temp.size() && i < 100; i++) { - // std::cout << _temp[row * weight->shape[1] + i] << ", "; - // } - // cudaDeviceSynchronize(); - 
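The convert_qweight step below re-packs the AWQ int4 weight into the layout the GEMM kernel expects, transposing it first when the kernel wants column-major B. Since two 4-bit values share one byte, a K x N u4 matrix occupies K * N / 2 bytes, which is why the copy back from the workspace moves input_dims * output_dims / 2 bytes. A rough Python sketch of that nibble-level bookkeeping; the low-nibble-first order is an assumption made for illustration, not necessarily the order transpose_u4 uses internally.

import torch


def unpack_u4(packed: torch.Tensor, rows: int, cols: int) -> torch.Tensor:
    # each byte holds two 4-bit values: low nibble first (assumed layout)
    flat = packed.flatten()
    lo = flat & 0xF
    hi = (flat >> 4) & 0xF
    return torch.stack([lo, hi], dim=-1).reshape(rows, cols)


def pack_u4(vals: torch.Tensor) -> torch.Tensor:
    # pack consecutive pairs of values into single bytes
    flat = vals.flatten().to(torch.uint8)
    lo, hi = flat[0::2], flat[1::2]
    return (lo | (hi << 4)).contiguous()


rows, cols = 4, 8
vals = torch.randint(0, 16, (rows, cols), dtype=torch.uint8)
packed = pack_u4(vals)                      # rows * cols / 2 bytes
assert packed.numel() == rows * cols // 2
# a "u4 transpose" is unpack -> transpose -> repack
packed_t = pack_u4(unpack_u4(packed, rows, cols).t().contiguous())
assert torch.equal(unpack_u4(packed_t, cols, rows), vals.t())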
using namespace gemm; auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); - - // std::cout << "oder_b: " << int(order_b) << ", input_dims: " << input_dims << ", output_dims: " << output_dims << std::endl; if (order_b == kColMajor) { transpose_u4((uint4_t*)workspace, (const uint4_t*)weight->data, input_dims, output_dims); cudaMemcpy(const_cast(weight->data), workspace, input_dims * output_dims / 2, cudaMemcpyDefault); From 9b6e35bd3a564f80c34d4ca13d4a4cfb91955ccf Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 12 Oct 2024 19:42:58 +0800 Subject: [PATCH 08/18] update --- turbomind/linear.py | 50 ++------------------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/turbomind/linear.py b/turbomind/linear.py index 3abdc59..c91380a 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -2,12 +2,13 @@ import logging import os.path as osp import sys -from typing import List import torch import turbomind +from .utils import get_u4_slices, pack_u4_row, unpack_awq_gemm + turbomind_dir = osp.split(turbomind.__file__)[0] sys.path.append(osp.join(turbomind_dir, 'lib')) @@ -19,15 +20,6 @@ TURBOMIND_KERNELS_INSTALLED = False -def pack_u4_row(x: torch.Tensor) -> torch.Tensor: - assert x.dtype == torch.uint8 - xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) - a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) - for t in reversed(xs): - a = (a << 4) | t - return a.squeeze(dim=-1) - - def transpose(x): return x.t() if x is not None else x @@ -49,44 +41,6 @@ def to_cuda(x: torch.Tensor, *args): return x.cuda() -def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: - assert x.dtype == torch.int32 - xs = [] - for _ in range(8): - xs.append((x & 15).to(dtype)) - x = x >> 4 - return xs - - -def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: - xs = get_u4_slices(x, torch.uint8) - order = [0, 4, 1, 5, 2, 6, 3, 7] - ys = [xs[i] for i in order] - return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) - - -def process_awq_gemm(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - x = unpack_awq_gemm(x) - if kind in ['qweight', 'qzeros', 'scales']: - x = x.t() - return x - - -def process_gptq(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - xs = get_u4_slices(x, torch.uint8) - if kind == 'qweight': # (k/8,n) - x = torch.stack(xs, dim=1).view(-1, x.size(-1)) - else: # 'qzeros' (k/g,n/8) - x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 - if kind in ['qweight', 'qzeros', 'scales']: - x = x.t() - return x - - class Linear(torch.nn.Module): def __init__(self, From 2fc6cf751db69b77e5fe13a542c3d55c1808754f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 14 Oct 2024 16:49:24 +0800 Subject: [PATCH 09/18] fix cudmalloc error size --- example/test_linear.py | 36 +++++++-------- generate.sh | 1 + src/turbomind/api/python/linear.cc | 74 ++++++++++++++++++++++-------- turbomind/linear.py | 17 ++++++- 4 files changed, 91 insertions(+), 37 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 2bd6e46..87572bd 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -1,4 +1,5 @@ import torch +import torch.nn as nn from safetensors import safe_open import turbomind as tm @@ -76,7 +77,7 @@ def makup_qzeros(in_features: int, out_features: int, group_size: int): return qzeros -def makup_scales(in_features: int, out_featurse: int, group_size: int): +def makup_scales(in_features: int, out_features: int, group_size: 
int): assert in_features % group_size == 0 and in_features // group_size >= 1 scales = torch.rand((in_features // group_size, out_features), dtype=torch.float16, @@ -106,27 +107,15 @@ def dequantize(qweight, qzeros, scales, group_size: int = 128): # out_features=out_features, # group_size=group_size) # scales = makup_scales(in_features, -# out_featurse=out_features, +# out_features=out_features, # group_size=group_size) -# weight = dequantize(qweight, qzeros, scales, group_size) -# print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') -# ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda') -# with torch.no_grad(): -# ref_linear.weight = nn.Parameter(weight.T) - -# x = torch.randn(in_features, device=weight.device, dtype=weight.dtype) -# print(f'input: {x}') -# print(weight.device, x.device) -# ref_res = ref_linear(x) -# print(ref_res) - def load_specified_linear_weights(): ckpt_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ/model-00001-of-00002.safetensors' # noqa layer_id = 0 # prefix = f'model.layers.{layer_id}.self_attn.q_proj.' - prefix = f'model.layers.{layer_id}.mlp.gate_proj.' + prefix = f'model.layers.{layer_id}.self_attn.o_proj.' keys = ['qweight', 'qzeros', 'scales'] tensors = {} with safe_open(ckpt_path, framework='pt', device='cuda') as f: @@ -144,6 +133,16 @@ def load_specified_linear_weights(): in_features = qweight.shape[0] out_features = qweight.shape[1] * 8 +x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) + +weight = dequantize(qweight, qzeros, scales, group_size) +print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') +ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda') +with torch.no_grad(): + ref_linear.weight = nn.Parameter(weight.T) + ref_res = ref_linear(x) + print(f'nn.linear.res: {ref_res}') + model = tm.Linear(in_features=in_features, out_features=out_features, bias=False, @@ -157,7 +156,8 @@ def load_specified_linear_weights(): model.post_init() -x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) res = model(x) -# max_diff = max(abs(ref_res - res)) -# ave_diff = sum(abs(ref_res - res)) / ref_res.numel() +print(f'tm.linear.res: {res}') +max_diff = torch.max(abs(ref_res - res)) +ave_diff = torch.sum(abs(ref_res - res)) / ref_res.numel() +print(f'max_diff {max_diff}, ave_diff {ave_diff}') diff --git a/generate.sh b/generate.sh index 09c51a7..4083da6 100755 --- a/generate.sh +++ b/generate.sh @@ -5,6 +5,7 @@ cmake .. 
\ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ -DCMAKE_CUDA_FLAGS="-lineinfo" \ -DUSE_NVTX=ON \ + -DBUILD_TEST=ON \ -DFETCHCONTENT_UPDATES_DISCONNECTED=ON \ -DLMDEPLOY_ASAN_ENABLE=OFF \ -DLMDEPLOY_UBSAN_ENABLE=OFF \ diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index cfb1202..aec546b 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -8,6 +8,7 @@ #include #include #include +#include namespace turbomind { @@ -50,6 +51,7 @@ struct Linear::Impl { void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, bool simt) { + cudaDeviceSynchronize(); const auto workspace_size = input_dims_ * output_dims_ * sizeof(uint16_t); void *workspace {}; check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); @@ -184,6 +186,41 @@ struct Linear::Impl { size_t output_dims, int group_size, bool use_simt) { + if constexpr (0) { + std::cout << "scales: " << std::endl; + std::vector<__half> tmp(input_dims / group_size * output_dims); + cudaMemcpy(tmp.data(), scales->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + i = 0; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + } + + if constexpr (0) { + std::cout << "zeros: " << std::endl; + std::vector<__half> tmp(input_dims / group_size * output_dims / 8); + cudaMemcpy(tmp.data(), qzeros->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + i = 0; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + } + + const auto scale_count = input_dims / group_size * output_dims; using namespace gemm; @@ -194,7 +231,7 @@ struct Linear::Impl { cudaDeviceSynchronize(); - check_cuda_error(cudaMalloc((half**)&scales_zeros_, scale_count * 2)); + check_cuda_error(cudaMalloc(&scales_zeros_, sizeof(uint16_t) * scale_count * 2)); MatrixLayout s_desc{ gemm::DataType::U32, @@ -210,23 +247,24 @@ struct Linear::Impl { TM_CHECK(Convert(workspace, s_desc, scales_zeros_, q_desc_, 0) == 0); sync_check_cuda_error(); - // if constexpr (0) { - // std::vector tmp(scale_count * 2); - // cudaMemcpy(tmp.data(), scales_zeros_, sizeof(half) * tmp.size(), cudaMemcpyDefault); - // cudaDeviceSynchronize(); - // // for (const auto& x: tmp) { - // // std::cout << (float)x << " "; - // // } - // int i = 0; - // for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { - // std::cout << std::hex << *it << " "; - // } - // i = 0; - // std::cout << "\n"; - // for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { - // std::cout << std::hex << *it << " "; - // } - // } + if constexpr (0) { + std::vector<__half> tmp(scale_count * 2); + cudaMemcpy(tmp.data(), scales_zeros_, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + // for (const auto& x: tmp) { + // std::cout << (float)x << " "; + // } + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + i = 0; + for (auto it = tmp.rbegin(); i < 1000 && 
it != tmp.rend(); ++it, ++i) { + std::cout << __half2float(*it) << " "; + } + std::cout << std::endl; + } } private: diff --git a/turbomind/linear.py b/turbomind/linear.py index c91380a..4a19913 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -153,6 +153,22 @@ def is_16xx_series(name): self.qweight = self.qweight.contiguous() self.scales = self.scales.contiguous() self.qzeros = self.qzeros.contiguous() + + # _list = self.scales.flatten().cpu().tolist() + # def print_formatted_float(a): + # formatted_list = " ".join(f"{num:.8f}" for num in a) + # print(formatted_list) + # print_formatted_float(_list[0:1000]) + # print_formatted_float(_list[-1000:][::-1]) + + # def print_formatted_int(a): + # formatted_list = ' '.join(f'{num:.0f}' for num in a) + # print(formatted_list) + + # _list = self.qzeros.flatten().cpu().tolist() + # print_formatted_int(_list[0:1000]) + # print_formatted_int(_list[-1000:][::-1]) + self.linear.post_init(self.qweight, self.scales, self.qzeros, simt) @torch.no_grad() @@ -173,7 +189,6 @@ def forward(self, x): ) self.linear.forward(x, out) out = torch.from_dlpack(out) - print(out) return out.view(out_shape) def __call__(self, x: torch.Tensor): From a25827b7ecec500b5ac9ac30a5490ea2c1f7af9c Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 14 Oct 2024 17:36:34 +0800 Subject: [PATCH 10/18] update --- src/turbomind/api/python/linear.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index aec546b..d2bc25c 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -205,7 +205,7 @@ struct Linear::Impl { if constexpr (0) { std::cout << "zeros: " << std::endl; - std::vector<__half> tmp(input_dims / group_size * output_dims / 8); + std::vector<__half> tmp(input_dims / group_size * output_dims); cudaMemcpy(tmp.data(), qzeros->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); int i = 0; @@ -249,7 +249,7 @@ struct Linear::Impl { if constexpr (0) { std::vector<__half> tmp(scale_count * 2); - cudaMemcpy(tmp.data(), scales_zeros_, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaMemcpy(tmp.data(), workspace, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); // for (const auto& x: tmp) { // std::cout << (float)x << " "; From 584c80d79e49fc69ded71a1e08ecb41e9b2a60cd Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 14 Oct 2024 18:08:39 +0800 Subject: [PATCH 11/18] remove useless code --- example/test_linear.py | 96 ++---------------------------------------- 1 file changed, 3 insertions(+), 93 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 87572bd..1293929 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -3,92 +3,14 @@ from safetensors import safe_open import turbomind as tm +from turbomind.utils import unpack_awq_gemm torch.manual_seed(0) -def i32x8_to_i4x8(w): - """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" - assert w.shape[-1] % 8 == 0 - shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) - shape = shape[:-1] + (1, ) - result = torch.zeros(shape, dtype=w.dtype, device=w.device) - mask = torch.tensor([15], dtype=w.dtype, device=w.device) - for i in range(8): - shift = 4 * (7 - i) - result[..., 0] |= (w[..., i] & mask) << shift - result = result.view(w.shape[0], -1) - return result - - -def i4x8_to_i32x8(w): - """split one integer every 4bits into 8 integers (range from 0 to 15)""" - shape = w.shape + (8, ) - 
result = torch.zeros(shape, dtype=w.dtype, device=w.device) - mask = torch.tensor([15], dtype=w.dtype, device=w.device) - for i in range(8): - shift = 4 * (7 - i) - result[..., i] = (w >> shift) & mask - result = result.view(w.shape[0], -1) - return result - - -# ## test i4x8_to_i32x8 -# value = 1636164468 -# print(hex(value)) -# a = torch.tensor([[value, value], [value, value]], dtype=torch.int32) -# b = i4x8_to_i32x8(a) -# print(b) -# c = i32x8_to_i4x8(b) -# print(c) -# cmp = a == c -# assert torch.sum(cmp) == cmp.numel() -# exit(0) -# ## end test - - -def makeup_qweight(in_features: int, out_features: int): - assert out_features % 8 == 0 - qweight = torch.randint(0, - 16, (in_features, out_features // 8, 8), - dtype=torch.int32, - device='cuda') - print(f'-- makeup qweight: shape {qweight.shape}') - print(qweight.view(in_features, -1)) - qweight = i32x8_to_i4x8(qweight) - print(f'-- merge qweight: shape {qweight.shape}') - print(qweight) - return qweight - - -def makup_qzeros(in_features: int, out_features: int, group_size: int): - assert out_features % 8 == 0 - assert in_features % group_size == 0 and in_features // group_size >= 1 - - qzeros = torch.randint(0, - 16, - (in_features // group_size, out_features // 8, 8), - dtype=torch.int32, - device='cuda') - print(f'-- makeup qzero: shape {qzeros.shape}') - print(qzeros.view(in_features // group_size, -1)) - qzeros = i32x8_to_i4x8(qzeros) - print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}') - return qzeros - - -def makup_scales(in_features: int, out_features: int, group_size: int): - assert in_features % group_size == 0 and in_features // group_size >= 1 - scales = torch.rand((in_features // group_size, out_features), - dtype=torch.float16, - device='cuda') - print(f'-- makeup scales: shape {scales.shape}\n{scales}') - return scales - - def dequantize(qweight, qzeros, scales, group_size: int = 128): - _qweight = i4x8_to_i32x8(qweight) - _qzeros = i4x8_to_i32x8(qzeros) + _qweight = unpack_awq_gemm(qweight) + _qzeros = unpack_awq_gemm(qzeros) _qzeros = _qzeros.half() weight = _qweight.clone().half() for i in range(qzeros.shape[0]): @@ -99,18 +21,6 @@ def dequantize(qweight, qzeros, scales, group_size: int = 128): return weight -# in_features = 128 -# out_features = 8 -# group_size = 128 -# qweight = makeup_qweight(in_features, out_features) -# qzeros = makup_qzeros(in_features=in_features, -# out_features=out_features, -# group_size=group_size) -# scales = makup_scales(in_features, -# out_features=out_features, -# group_size=group_size) - - def load_specified_linear_weights(): ckpt_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ/model-00001-of-00002.safetensors' # noqa layer_id = 0 From 3136c42c73acfaa91e7579dccce34942c99ff19e Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 11:49:39 +0800 Subject: [PATCH 12/18] change the data type of zeros, scales, in and out from std::shared_ptr to reference --- src/turbomind/api/python/bind.cpp | 4 +-- src/turbomind/api/python/linear.cc | 43 ++++++++++++------------------ src/turbomind/api/python/linear.h | 16 ++--------- 3 files changed, 21 insertions(+), 42 deletions(-) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 409fc5d..276cd12 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -335,11 +335,11 @@ PYBIND11_MODULE(turbomind_kernels, m) { auto _qweight = TorchTensorToTurbomindTensor(qweight); auto _scales = TorchTensorToTurbomindTensor(scales); auto _qzeros = 
TorchTensorToTurbomindTensor(qzeros); - linear->post_init(_qweight, _scales, _qzeros, simt); + linear->post_init(_qweight, *_scales, *_qzeros, simt); }) .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { auto _in = TorchTensorToTurbomindTensor(in); auto _out = TorchTensorToTurbomindTensor(out); - return linear->forward(_in, _out); + return linear->forward(*_in, *_out); }); } diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index d2bc25c..6641110 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -49,9 +49,7 @@ struct Linear::Impl { check_cuda_error(cudaFree(scales_zeros_)); } - void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, - bool simt) { - cudaDeviceSynchronize(); + void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt) { const auto workspace_size = input_dims_ * output_dims_ * sizeof(uint16_t); void *workspace {}; check_cuda_error(cudaMalloc((void**)&workspace, workspace_size)); @@ -62,10 +60,10 @@ struct Linear::Impl { check_cuda_error(cudaFree(workspace)); } - void forward(std::shared_ptr in, std::shared_ptr out) { - TM_CHECK(in->type == TYPE_FP16 && out->type == TYPE_FP16); - TM_CHECK(in->shape.size() == 2 && in->shape[1] == input_dims_); - TM_CHECK(out->shape.size() == 2 && out->shape[0] == in->shape[0] && out->shape[1] == output_dims_); + void forward(const Tensor& in, Tensor& out) { + TM_CHECK(in.type == TYPE_FP16 && out.type == TYPE_FP16); + TM_CHECK(in.shape.size() == 2 && in.shape[1] == input_dims_); + TM_CHECK(out.shape.size() == 2 && out.shape[0] == in.shape[0] && out.shape[1] == output_dims_); using namespace gemm; @@ -79,7 +77,7 @@ struct Linear::Impl { const MatrixLayout a_desc{ gemm::DataType::F16, // get_data_type_v, kRowMajor, - (int)in->shape[0], // row + (int)in.shape[0], // row (int)input_dims_, // col (int)input_dims_ // input_data.pitch, // input_data.pitch = input_dims_ if input_data.pitch==0 }; @@ -87,14 +85,14 @@ struct Linear::Impl { const MatrixLayout c_desc{ gemm::DataType::F16, // get_data_type_v, kRowMajor, - (int)in->shape[0], // row + (int)in.shape[0], // row (int)output_dims_, // col (int)output_dims_ }; auto ec = gemm_.Run(operation, 1.f, - in->data, + in.data, a_desc, nullptr, {}, @@ -103,9 +101,9 @@ struct Linear::Impl { scales_zeros_, q_desc_, 0.0f, - out->data, + out.data, c_desc, - const_cast(out->data), + const_cast(out.data), c_desc, workspace_, stream_); @@ -180,8 +178,8 @@ struct Linear::Impl { } void convert_scales_zeros(void* workspace, - std::shared_ptr scales, - std::shared_ptr qzeros, + const Tensor& scales, + const Tensor& qzeros, size_t input_dims, size_t output_dims, int group_size, @@ -189,7 +187,7 @@ struct Linear::Impl { if constexpr (0) { std::cout << "scales: " << std::endl; std::vector<__half> tmp(input_dims / group_size * output_dims); - cudaMemcpy(tmp.data(), scales->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaMemcpy(tmp.data(), scales.data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); int i = 0; for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { @@ -206,7 +204,7 @@ struct Linear::Impl { if constexpr (0) { std::cout << "zeros: " << std::endl; std::vector<__half> tmp(input_dims / group_size * output_dims); - cudaMemcpy(tmp.data(), qzeros->data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); + cudaMemcpy(tmp.data(), qzeros.data, sizeof(__half) * tmp.size(), cudaMemcpyDefault); 
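For reference, the granularity behind these buffers: every output column carries one (scale, zero) pair per group_size input channels, so there are input_dims / group_size * output_dims pairs in total, the scale_count defined just below and handed to fuse_scales_and_zeros. A small sketch of that arithmetic and of the per-group dequantization it supports, with made-up shapes:

import torch

in_features, out_features, group_size = 256, 8, 128
q = torch.randint(0, 16, (in_features, out_features), dtype=torch.uint8)
z = torch.randint(0, 16, (in_features // group_size, out_features), dtype=torch.uint8)
s = torch.rand(in_features // group_size, out_features, dtype=torch.float16)

scale_count = in_features // group_size * out_features  # pairs to fuse
# each group of `group_size` input rows shares one (scale, zero) pair
w = (q.float() - z.repeat_interleave(group_size, dim=0).float()) \
    * s.repeat_interleave(group_size, dim=0).float()
print(scale_count, w.shape)  # 16, torch.Size([256, 8])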
cudaDeviceSynchronize(); int i = 0; for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { @@ -226,7 +224,7 @@ struct Linear::Impl { using namespace gemm; auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); - fuse_scales_and_zeros((half*)workspace, (const half*)scales->data, (half*)qzeros->data, scale_count); + fuse_scales_and_zeros((half*)workspace, (const half*)scales.data, (half*)qzeros.data, scale_count); sync_check_cuda_error(); cudaDeviceSynchronize(); @@ -251,9 +249,6 @@ struct Linear::Impl { std::vector<__half> tmp(scale_count * 2); cudaMemcpy(tmp.data(), workspace, sizeof(__half) * tmp.size(), cudaMemcpyDefault); cudaDeviceSynchronize(); - // for (const auto& x: tmp) { - // std::cout << (float)x << " "; - // } int i = 0; for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { std::cout << __half2float(*it) << " "; @@ -268,7 +263,6 @@ struct Linear::Impl { } private: - // cublasMMWrapper* cublas_wrapper_; gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; gemm::Workspace workspace_; @@ -290,14 +284,11 @@ Linear::Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size) impl_ = std::make_shared(input_dims, output_dims, w_bit, group_size); } -void Linear::post_init(std::shared_ptr qweight, - std::shared_ptr scales, - std::shared_ptr qzeros, - bool simt) { +void Linear::post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt) { impl_->post_init(qweight, scales, qzeros, simt); } -void Linear::forward(std::shared_ptr in, std::shared_ptr out) +void Linear::forward(const Tensor& in, Tensor& out) { impl_->forward(in, out); } diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h index ee56b3a..11e81eb 100644 --- a/src/turbomind/api/python/linear.h +++ b/src/turbomind/api/python/linear.h @@ -22,25 +22,13 @@ enum class WeightType : int kINT4 }; -std::shared_ptr convert_qweight(std::shared_ptr qweight, - size_t input_dims, - size_t output_dims, - bool use_simt); -std::shared_ptr convert_scales_zeros(std::shared_ptr scales, - std::shared_ptr qzeros, - std::shared_ptr scales_zeros, - size_t input_dims, - size_t output_dims, - int group_size, - bool use_simt); - class Linear { public: Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); - void post_init(std::shared_ptr qweight, std::shared_ptr scales, std::shared_ptr qzeros, + void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt); - void forward(std::shared_ptr in, std::shared_ptr out); + void forward(const Tensor& in, Tensor& out); ~Linear() {} private: From 2de746ec3918a65bf82f5a6de0fded49d4daf6f6 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 12:00:56 +0800 Subject: [PATCH 13/18] rename to _turbomind_ext --- CMakeLists.txt | 2 +- src/turbomind/api/python/CMakeLists.txt | 2 +- src/turbomind/api/python/bind.cpp | 3 +-- turbomind/linear.py | 10 +++++----- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29299f2..9ef743b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,7 +150,7 @@ link_directories( add_subdirectory(src/turbomind) # install python api -install(TARGETS turbomind_kernels DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) +install(TARGETS _turbomind_ext DESTINATION ${CMAKE_SOURCE_DIR}/turbomind/lib) if (MSVC) diff --git a/src/turbomind/api/python/CMakeLists.txt b/src/turbomind/api/python/CMakeLists.txt index 
bf9acb7..4bda327 100644 --- a/src/turbomind/api/python/CMakeLists.txt +++ b/src/turbomind/api/python/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. cmake_minimum_required(VERSION 3.8) -project(turbomind_kernels) +project(_turbomind_ext) find_package(pybind11 CONFIG) if(NOT pybind11_FOUND) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 276cd12..8140b46 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -14,7 +14,6 @@ #include namespace py = pybind11; -// namespace tm = turbomind; using namespace pybind11::literals; static const char kDlTensorCapsuleName[] = "dltensor"; @@ -209,7 +208,7 @@ std::shared_ptr TorchTensorToTurbomindTensor(py::object obj) return DLManagedTensorToTurbomindTensor(dlmt); } -PYBIND11_MODULE(turbomind_kernels, m) { +PYBIND11_MODULE(_turbomind_ext, m) { py::enum_(m, "WeightType") .value("kFP32", turbomind::WeightType::kFP32) .value("kFP16", turbomind::WeightType::kFP16) diff --git a/turbomind/linear.py b/turbomind/linear.py index 4a19913..f09f944 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -13,10 +13,10 @@ sys.path.append(osp.join(turbomind_dir, 'lib')) try: - import turbomind_kernels + import _turbomind_ext TURBOMIND_KERNELS_INSTALLED = True except Exception as e: - logging.error(f'turbomind_kernels is not installed: {e}') + logging.error(f'_turbomind_ext is not installed: {e}') TURBOMIND_KERNELS_INSTALLED = False @@ -111,9 +111,9 @@ def __init__(self, # else: # self.bias = None - self.linear = turbomind_kernels.Linear(self.in_features, - self.out_features, self.w_bit, - self.group_size) + self.linear = _turbomind_ext.Linear(self.in_features, + self.out_features, self.w_bit, + self.group_size) def post_init(self): assert self.qweight.device.type == 'cuda' From 66ca496971012fc5d6cef08b3f317e7cc556fa23 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 12:03:51 +0800 Subject: [PATCH 14/18] remove useless code --- turbomind/linear.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/turbomind/linear.py b/turbomind/linear.py index f09f944..e888248 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -20,10 +20,6 @@ TURBOMIND_KERNELS_INSTALLED = False -def transpose(x): - return x.t() if x is not None else x - - def pad_out_dims(x: torch.Tensor, dims: int): pad = dims - x.size(-1) assert pad >= 0 @@ -118,9 +114,9 @@ def __init__(self, def post_init(self): assert self.qweight.device.type == 'cuda' if self.quant_method == 'awq': - self.qweight = unpack_awq_gemm(self.qweight).t() - self.qzeros = unpack_awq_gemm(self.qzeros).t() - self.scales = self.scales.t() + self.qweight = unpack_awq_gemm(self.qweight) + self.qzeros = unpack_awq_gemm(self.qzeros) + self.scales = self.scales elif self.quant_method == 'gptq': xs = get_u4_slices(self.qweight, torch.uint8) self.qweight = torch.stack(xs, dim=1).view(-1, @@ -128,16 +124,12 @@ def post_init(self): xs = get_u4_slices(self.qzeros, torch.uint8) self.qzeros = torch.stack(xs, dim=-1).view(self.qzeros.size(0), -1) + 1 - self.qweight = self.qweight.t() - self.qzeros = self.qzeros.t() - self.scales = self.scales.t() + self.qweight = self.qweight + self.qzeros = self.qzeros + self.scales = self.scales else: return - self.qweight = transpose(self.qweight) - self.qzeros = transpose(self.qzeros) - self.scales = transpose(self.scales) - self.qweight = pack_u4_row(self.qweight) self.qzeros = self.qzeros.to(torch.half) @@ -153,22 +145,6 @@ def 
is_16xx_series(name): self.qweight = self.qweight.contiguous() self.scales = self.scales.contiguous() self.qzeros = self.qzeros.contiguous() - - # _list = self.scales.flatten().cpu().tolist() - # def print_formatted_float(a): - # formatted_list = " ".join(f"{num:.8f}" for num in a) - # print(formatted_list) - # print_formatted_float(_list[0:1000]) - # print_formatted_float(_list[-1000:][::-1]) - - # def print_formatted_int(a): - # formatted_list = ' '.join(f'{num:.0f}' for num in a) - # print(formatted_list) - - # _list = self.qzeros.flatten().cpu().tolist() - # print_formatted_int(_list[0:1000]) - # print_formatted_int(_list[-1000:][::-1]) - self.linear.post_init(self.qweight, self.scales, self.qzeros, simt) @torch.no_grad() From 824c9e89883b4eea7254ba36d3a37263394df911 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 14:32:16 +0800 Subject: [PATCH 15/18] update linear test --- example/test_linear.py | 79 ++++++++++++++++++++++++++++++++++++------ turbomind/linear.py | 30 ++++++++-------- 2 files changed, 85 insertions(+), 24 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 1293929..025f60c 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -8,6 +8,53 @@ torch.manual_seed(0) +def i32x8_to_i4x8(w): + """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" + assert w.shape[-1] % 8 == 0 + shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) + shape = shape[:-1] + (1, ) + result = torch.zeros(shape, dtype=w.dtype, device=w.device) + mask = torch.tensor([15], dtype=w.dtype, device=w.device) + for i in range(8): + shift = 4 * (7 - i) + result[..., 0] |= (w[..., i] & mask) << shift + result = result.view(w.shape[0], -1) + return result + + +def makeup_weights(in_features: int, out_features: int, group_size: int = 128): + # make up qweight + assert out_features % 8 == 0 + qweight = torch.randint(0, + 16, (in_features, out_features // 8, 8), + dtype=torch.int32, + device='cuda') + print(f'-- makeup qweight: shape {qweight.shape}') + print(qweight.view(in_features, -1)) + qweight = i32x8_to_i4x8(qweight) + print(f'-- merge qweight: shape {qweight.shape}') + print(qweight) + + # make up qzeros + assert in_features % group_size == 0 and in_features // group_size >= 1 + qzeros = torch.randint(0, + 16, + (in_features // group_size, out_features // 8, 8), + dtype=torch.int32, + device='cuda') + print(f'-- makeup qzero: shape {qzeros.shape}') + print(qzeros.view(in_features // group_size, -1)) + qzeros = i32x8_to_i4x8(qzeros) + print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}') + + # make up scales + scales = torch.rand((in_features // group_size, out_features), + dtype=torch.float16, + device='cuda') + print(f'-- makeup scales: shape {scales.shape}\n{scales}') + return qweight, qzeros, scales + + def dequantize(qweight, qzeros, scales, group_size: int = 128): _qweight = unpack_awq_gemm(qweight) _qzeros = unpack_awq_gemm(qzeros) @@ -32,18 +79,22 @@ def load_specified_linear_weights(): for key in keys: tensors[key] = f.get_tensor(prefix + key) - return tensors + return tensors['qweight'], tensors['qzeros'], tensors['scales'] -tensors = load_specified_linear_weights() -qweight, qzeros, scales = tensors['qweight'], tensors['qzeros'], tensors[ - 'scales'] +# qweight, qzeros, scales = load_specified_linear_weights() +# in_features = qweight.shape[0] +# out_features = qweight.shape[1] * 8 group_size = 128 -in_features = qweight.shape[0] -out_features = qweight.shape[1] * 8 +batch_size = 16384 +in_features = 16384 +out_features = 
16384 +qweight, qzeros, scales = makeup_weights(in_features, out_features, group_size) -x = torch.randn(in_features, device=qweight.device, dtype=torch.float16) +x = torch.randn((batch_size, in_features), + device=qweight.device, + dtype=torch.float16) weight = dequantize(qweight, qzeros, scales, group_size) print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') @@ -68,6 +119,14 @@ def load_specified_linear_weights(): res = model(x) print(f'tm.linear.res: {res}') -max_diff = torch.max(abs(ref_res - res)) -ave_diff = torch.sum(abs(ref_res - res)) / ref_res.numel() -print(f'max_diff {max_diff}, ave_diff {ave_diff}') +abs_diff = torch.abs(res - ref_res).float() +rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res)) +rtol = 0.01 +atol = 0.0001 +outliers = abs_diff > atol + rtol * torch.abs(ref_res) +abs_diff = torch.sum(abs_diff) / abs_diff.numel() +rel_diff = torch.sum(rel_diff) / rel_diff.numel() +outliers = torch.sum(outliers) / outliers.shape[0] +print(f'abs_diff {abs_diff:4f}, ' + f'rel_diff {rel_diff:4f}, ' + f'outliers {outliers:4f}') diff --git a/turbomind/linear.py b/turbomind/linear.py index e888248..1e9d221 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -14,10 +14,10 @@ try: import _turbomind_ext - TURBOMIND_KERNELS_INSTALLED = True + TURBOMIND_EXTENSION_INSTALLED = True except Exception as e: logging.error(f'_turbomind_ext is not installed: {e}') - TURBOMIND_KERNELS_INSTALLED = False + TURBOMIND_EXTENSION_INSTALLED = False def pad_out_dims(x: torch.Tensor, dims: int): @@ -95,17 +95,17 @@ def __init__(self, ), ) - # if bias: - # self.register_buffer( - # 'bias', - # torch.zeros( - # (out_features), - # dtype=torch.float16, - # device=device, - # ), - # ) - # else: - # self.bias = None + if bias: + self.register_buffer( + 'bias', + torch.zeros( + (out_features), + dtype=torch.float16, + device=device, + ), + ) + else: + self.bias = None self.linear = _turbomind_ext.Linear(self.in_features, self.out_features, self.w_bit, @@ -149,7 +149,7 @@ def is_16xx_series(name): @torch.no_grad() def forward(self, x): - assert TURBOMIND_KERNELS_INSTALLED, ( + assert TURBOMIND_EXTENSION_INSTALLED, ( 'turbomind kernels are not installed. 
' 'Please perform `pip install turbomind` to install turbomind ' 'kernels.') @@ -165,6 +165,8 @@ def forward(self, x): ) self.linear.forward(x, out) out = torch.from_dlpack(out) + if self.bias is not None: + out.add_(self.bias) return out.view(out_shape) def __call__(self, x: torch.Tensor): From a4ecf6a1c41fc27b332b33dbfc353910d5d53dda Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 14:40:38 +0800 Subject: [PATCH 16/18] update --- example/test_linear.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index 025f60c..c2ad4db 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -58,14 +58,15 @@ def makeup_weights(in_features: int, out_features: int, group_size: int = 128): def dequantize(qweight, qzeros, scales, group_size: int = 128): _qweight = unpack_awq_gemm(qweight) _qzeros = unpack_awq_gemm(qzeros) - _qzeros = _qzeros.half() - weight = _qweight.clone().half() + _qzeros = _qzeros.float() + _qweight = _qweight.float() + _scales = scales.float() for i in range(qzeros.shape[0]): start = i * group_size end = start + group_size - weight[start:end] = (weight[start:end, :] - - _qzeros[i:i + 1, :]) * scales[i:i + 1, :] - return weight + _qweight[start:end] = (_qweight[start:end, :] - + _qzeros[i:i + 1, :]) * _scales[i:i + 1, :] + return _qweight.half() def load_specified_linear_weights(): From 7948aee7ef1276ccae1002f74065fa8eab207b99 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 14:48:09 +0800 Subject: [PATCH 17/18] add cudaStream_t param in Linear::Forward --- src/turbomind/api/python/linear.cc | 19 +++++++++---------- src/turbomind/api/python/linear.h | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index 6641110..d05c7d4 100644 --- a/src/turbomind/api/python/linear.cc +++ b/src/turbomind/api/python/linear.cc @@ -36,15 +36,15 @@ struct Linear::Impl { workspace_.barriers_size = gemm::Gemm::kBarriersSize; workspace_.partials_size = gemm::Gemm::kPartialsSize; - cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); - cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_); - cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, 0); + cudaMallocAsync(&workspace_.partials, workspace_.partials_size, 0); + cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, 0); } ~Impl() { - cudaFreeAsync(workspace_.barriers, stream_); - cudaFreeAsync(workspace_.partials, stream_); + cudaFreeAsync(workspace_.barriers, 0); + cudaFreeAsync(workspace_.partials, 0); workspace_ = {}; check_cuda_error(cudaFree(scales_zeros_)); } @@ -60,7 +60,7 @@ struct Linear::Impl { check_cuda_error(cudaFree(workspace)); } - void forward(const Tensor& in, Tensor& out) { + void forward(const Tensor& in, Tensor& out, cudaStream_t stream) { TM_CHECK(in.type == TYPE_FP16 && out.type == TYPE_FP16); TM_CHECK(in.shape.size() == 2 && in.shape[1] == input_dims_); TM_CHECK(out.shape.size() == 2 && out.shape[0] == in.shape[0] && out.shape[1] == output_dims_); @@ -106,7 +106,7 @@ struct Linear::Impl { const_cast(out.data), c_desc, workspace_, - stream_); + stream); if (ec) { printf("%s: %d", __PRETTY_FUNCTION__, ec); @@ -266,7 +266,6 @@ struct Linear::Impl { gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; gemm::Workspace workspace_; - cudaStream_t stream_{}; 
size_t input_dims_; size_t output_dims_; @@ -288,8 +287,8 @@ void Linear::post_init(std::shared_ptr qweight, const Tensor& scales, co impl_->post_init(qweight, scales, qzeros, simt); } -void Linear::forward(const Tensor& in, Tensor& out) +void Linear::forward(const Tensor& in, Tensor& out, cudaStream_t stream) { - impl_->forward(in, out); + impl_->forward(in, out, stream); } } // namespace turbomind diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h index 11e81eb..8b611f2 100644 --- a/src/turbomind/api/python/linear.h +++ b/src/turbomind/api/python/linear.h @@ -28,7 +28,7 @@ class Linear { Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt); - void forward(const Tensor& in, Tensor& out); + void forward(const Tensor& in, Tensor& out, cudaStream_t stream = nullptr); ~Linear() {} private: From 5904bf86f43467655d854db083e3344d39db3efa Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 15 Oct 2024 15:46:17 +0800 Subject: [PATCH 18/18] update --- example/test_linear.py | 6 +++++- src/turbomind/api/python/bind.cpp | 9 +++++---- turbomind/linear.py | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/example/test_linear.py b/example/test_linear.py index c2ad4db..8ccbbbc 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -118,7 +118,11 @@ def load_specified_linear_weights(): model.post_init() -res = model(x) +stream = torch.cuda.Stream() +with torch.cuda.stream(stream): + res = model(x) +stream.synchronize() + print(f'tm.linear.res: {res}') abs_diff = torch.abs(res - ref_res).float() rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res)) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 8140b46..5eadc1e 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -329,16 +329,17 @@ PYBIND11_MODULE(_turbomind_ext, m) { .def(py::init([](size_t in_features, size_t out_features, int w_bit, int group_size) { return new turbomind::Linear(in_features, out_features, w_bit, group_size); })) - .def("post_init", [](turbomind::Linear* linear, py::object qweight, py::object scales, py::object qzeros, + .def("post_init", [](turbomind::Linear* self, py::object qweight, py::object scales, py::object qzeros, bool simt){ auto _qweight = TorchTensorToTurbomindTensor(qweight); auto _scales = TorchTensorToTurbomindTensor(scales); auto _qzeros = TorchTensorToTurbomindTensor(qzeros); - linear->post_init(_qweight, *_scales, *_qzeros, simt); + self->post_init(_qweight, *_scales, *_qzeros, simt); }) - .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { + .def("forward", [](turbomind::Linear* self, py::object in, py::object out, int64_t stream_id = 0) { auto _in = TorchTensorToTurbomindTensor(in); auto _out = TorchTensorToTurbomindTensor(out); - return linear->forward(*_in, *_out); + auto stream = reinterpret_cast(stream_id); + return self->forward(*_in, *_out, stream); }); } diff --git a/turbomind/linear.py b/turbomind/linear.py index 1e9d221..a7fea9b 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -163,7 +163,8 @@ def forward(self, x): dtype=torch.float16, device=x.device, ) - self.linear.forward(x, out) + stream = torch.cuda.current_stream() + self.linear.forward(x, out, stream.cuda_stream) out = torch.from_dlpack(out) if self.bias is not None: out.add_(self.bias)
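
For reference, the packing convention used by i32x8_to_i4x8 in example/test_linear.py (patch 15) stores element 0 of every group of eight 4-bit values in the most significant nibble of the packed int32. The round-trip sketch below is illustrative only and assumes it runs alongside that helper; the i4x8_to_i32x8 name is made up here and is not part of the patches:

    import torch

    def i4x8_to_i32x8(packed: torch.Tensor) -> torch.Tensor:
        """Inverse of i32x8_to_i4x8: split each int32 back into 8 nibbles.

        Element i is read from bit offset 4 * (7 - i); masking with 15 after
        the (arithmetic) right shift discards any sign extension.
        """
        out = torch.empty(tuple(packed.shape) + (8, ),
                          dtype=packed.dtype,
                          device=packed.device)
        for i in range(8):
            out[..., i] = (packed >> (4 * (7 - i))) & 15
        return out.view(packed.shape[0], -1)

    # round trip on random 4-bit values, mirroring makeup_weights
    w = torch.randint(0, 16, (4, 2, 8), dtype=torch.int32)
    packed = i32x8_to_i4x8(w)                  # -> shape (4, 2)
    assert torch.equal(i4x8_to_i32x8(packed), w.view(4, -1))

This only checks the MSB-first layout of the test helper; the repo's unpack_awq_gemm may use a different nibble order, so the sketch is not a substitute for it.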
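
The group-wise loop in dequantize reconstructs weight rows as (q - z_g) * s_g, where g indexes groups of group_size rows, and patch 16 moves that arithmetic to float32 before the final cast back to fp16, presumably to keep the reference free of intermediate fp16 rounding. The same math can be written with broadcasting; this is only a sketch of the arithmetic, with _qweight and _qzeros standing for the unpacked tensors of shape (in_features, out_features) and (in_features // group_size, out_features):

    import torch

    def dequantize_broadcast(_qweight, _qzeros, scales, group_size: int = 128):
        # expand per-group zeros/scales to per-row tensors, then apply
        # (w - z) * s in float32 and cast back to fp16, matching the loop
        z = _qzeros.float().repeat_interleave(group_size, dim=0)
        s = scales.float().repeat_interleave(group_size, dim=0)
        return ((_qweight.float() - z) * s).half()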
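
The accuracy check at the end of example/test_linear.py flags an element as an outlier when abs_diff > atol + rtol * |ref_res|, which is the tolerance rule torch.isclose uses. A compact sketch of an equivalent summary follows; unlike the patch it guards the relative-difference denominator against zero and reports outliers as a fraction of all elements, whereas the patch divides the count by outliers.shape[0] (the batch size), so its value can exceed 1:

    import torch

    def summarize_diff(res, ref, rtol: float = 0.01, atol: float = 1e-4):
        abs_diff = (res.float() - ref.float()).abs()
        denom = torch.maximum(res.float().abs(), ref.float().abs())
        rel_diff = abs_diff / denom.clamp_min(1e-12)
        # same predicate as `abs_diff > atol + rtol * torch.abs(ref_res)`
        outliers = ~torch.isclose(res, ref, rtol=rtol, atol=atol)
        return (abs_diff.mean().item(),
                rel_diff.mean().item(),
                outliers.float().mean().item())

    # e.g. print('abs %.6f  rel %.6f  outliers %.6f' % summarize_diff(res, ref_res))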
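
Patches 17 and 18 thread a cudaStream_t through Linear::forward: torch.cuda.Stream.cuda_stream exposes the raw stream handle as a Python integer, bind.cpp receives it as int64_t and reinterpret_casts it back, and 0 selects the default stream. The sketch below drives the extension object directly with an explicit handle, mirroring what turbomind.Linear.forward now does internally; model is assumed to be the wrapper built and post-initialized in example/test_linear.py (in_features = out_features = 16384), and the batch size is made up:

    import torch

    x = torch.randn((16, 16384), dtype=torch.float16, device='cuda')
    out = torch.empty((16, 16384), dtype=torch.float16, device='cuda')

    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())   # order after the allocations above
    model.linear.forward(x, out, s.cuda_stream)  # int64 handle; 0 = default stream
    s.synchronize()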