aqlm #233

Closed · wants to merge 8 commits

23 changes: 10 additions & 13 deletions .github/workflows/build.yaml
@@ -4,9 +4,10 @@ on:
workflow_dispatch:
push:
branches:
- 'main'
- "main"
- "aqlm"
tags:
- 'v*'
- "v*"

jobs:
build-and-push-image:
@@ -41,8 +42,8 @@ jobs:
- name: Install soci
uses: lerentis/[email protected]
with:
soci-release: 'v0.4.0'
soci-release: "v0.4.0"

- name: Set up Docker Buildx
uses: docker/[email protected]

@@ -51,7 +52,7 @@ jobs:
with:
config-inline: |
version = 2

# persistent data location
root = "/var/lib/kubelet/containerd"

@@ -62,11 +63,8 @@ jobs:
images: |
ghcr.io/predibase/lorax
tags: |
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=sha,prefix=,suffix=,format=short
type=raw,value=latest

type=raw,value=aqlm

- name: Create a hash from tags
env:
tags: ${{ steps.meta.outputs.tags }}
@@ -93,7 +91,7 @@ jobs:
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile # Path to your Dockerfile
file: ./Dockerfile # Path to your Dockerfile
push: false
tags: ${{ steps.meta.outputs.tags }}
outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
@@ -124,7 +122,7 @@ jobs:
echo "Pushing $tag to GHCR"
sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
done

- name: Create and push soci index
env:
tags: ${{ steps.meta.outputs.tags }}
@@ -151,4 +149,3 @@ jobs:

# Delete the SHA image(s) from containerd store
sudo ctr i rm $(sudo ctr i ls -q)

4 changes: 4 additions & 0 deletions launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
Hqq_4bit,
Hqq_3bit,
Hqq_2bit,
Aqlm,
}

impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
Quantization::Hqq_2bit => {
write!(f, "hqq-2bit")
}
Quantization::Aqlm => {
write!(f, "aqlm")
}
}
}
}
1 change: 1 addition & 0 deletions server/lorax_server/cli.py
@@ -21,6 +21,7 @@ class Quantization(str, Enum):
hqq_4bit = "hqq-4bit"
hqq_3bit = "hqq-3bit"
hqq_2bit = "hqq-2bit"
aqlm = "aqlm"


class Dtype(str, Enum):
6 changes: 3 additions & 3 deletions server/lorax_server/models/__init__.py
@@ -234,7 +234,7 @@ def get_model(
trust_remote_code=trust_remote_code,
)

if model_type == "llama":
if model_type in ["llama", "llama_aqlm"]:

Review comment: llama_aqlm and similar custom model types are deprecated. We moved AQLM from custom code to the transformers integration in transformers 4.38.0. Correct me if I'm wrong and it serves a different purpose here.
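
For reference, a minimal sketch of the transformers-native path this comment refers to, assuming transformers >= 4.38 with the aqlm package installed; the checkpoint id below is only an illustrative public AQLM model, not something lorax pins:

    # Sketch: transformers>=4.38 reads the AQLM settings from the checkpoint's
    # quantization config and dispatches to the aqlm kernels automatically.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"  # example AQLM checkpoint
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    inputs = tokenizer("Hello, AQLM!", return_tensors="pt").to(model.device)
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))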

if FLASH_ATTENTION:
return FlashLlama(
model_id,
@@ -306,7 +306,7 @@ def get_model(
trust_remote_code=trust_remote_code,
)

if model_type == "mistral":
if model_type in ["mistral", "mistral_aqlm"]:
if MISTRAL:
return FlashMistral(
model_id,
@@ -320,7 +320,7 @@
)
raise NotImplementedError("Mistral model requires flash attention v2")

if model_type == "mixtral":
if model_type in ["mixtral", "mixtral_aqlm"]:
if MIXTRAL:
return FlashMixtral(
model_id,
2 changes: 2 additions & 0 deletions server/lorax_server/models/flash_llama.py
@@ -91,6 +91,8 @@ def __init__(

if config.quantize in ["gptq", "awq", "eetq"]:
weights._set_gptq_params(model_id)
elif config.quantize == "aqlm":
weights._set_aqlm_params(model_id)

model = FlashLlamaForCausalLM(config, weights)

2 changes: 2 additions & 0 deletions server/lorax_server/models/flash_mistral.py
@@ -376,6 +376,8 @@ def __init__(

if config.quantize in ["gptq", "awq", "eetq"]:
weights._set_gptq_params(model_id)
elif config.quantize == "aqlm":
weights._set_aqlm_params(model_id)

model = FlashMistralForCausalLM(config, weights)

2 changes: 2 additions & 0 deletions server/lorax_server/models/flash_mixtral.py
@@ -383,6 +383,8 @@ def __init__(

if config.quantize in ["gptq", "awq", "eetq"]:
weights._set_gptq_params(model_id)
elif config.quantize == "aqlm":
weights._set_aqlm_params(model_id)

model = FlashMixtralForCausalLM(config, weights)

15 changes: 15 additions & 0 deletions server/lorax_server/utils/layers.py
@@ -40,6 +40,12 @@ def weight(self) -> torch.Tensor:
except ImportError:
HAS_HQQ = False

HAS_AQLM = True
try:
from aqlm import QuantizedLinear
except ImportError:
HAS_AQLM = False

from accelerate import init_empty_weights

from lorax_server.utils.gptq.quant_linear import QuantLinear
@@ -385,6 +391,15 @@ def get_linear(weight, bias, quantize, fan_in_fan_out=False):
layer.bias.data = bias

linear = HQQLinearLayer(layer, quant_config, del_orig=True)
elif quantize == "aqlm":
scales, codebooks, codes, nbits_per_codebook, num_codebooks, out_group_size, in_group_size = weight
linear = QuantizedLinear(scales.shape[1], scales.shape[0], in_group_size, out_group_size, num_codebooks, nbits_per_codebook)
with torch.no_grad():
linear.scales = scales
linear.codebooks = codebooks
linear.codes = codes
if bias is not None:
linear.bias.data = bias
else:
raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
return linear
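
For context, a sketch of what the new aqlm branch in get_linear constructs before the checkpoint tensors are copied in. It assumes the aqlm package is installed; the dimensions and group sizes are illustrative placeholders, not values lorax fixes, and the positional argument order mirrors the call in the diff:

    import torch
    from aqlm import QuantizedLinear  # the same import that HAS_AQLM guards above

    # Placeholder shapes for a common 1x16 AQLM layout: one codebook with
    # 2**16 entries, in_group_size=8, out_group_size=1.
    in_features, out_features = 4096, 4096
    linear = QuantizedLinear(in_features, out_features, 8, 1, 1, 16)

    # get_linear then overwrites these freshly initialized parameters with the
    # sharded scales/codebooks/codes loaded from the checkpoint.
    print(linear.scales.shape, linear.codebooks.shape, linear.codes.shape)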
28 changes: 28 additions & 0 deletions server/lorax_server/utils/weights.py
@@ -204,6 +204,12 @@ def get_multi_weights_col(self, prefixes: List[Union[str, Tuple]], quantize: str

bits, groupsize = self._get_gptq_params()
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
elif quantize == "aqlm":
nbits_per_codebook, num_codebooks, out_group_size, in_group_size = self._get_aqlm_params()
scales = self.get_sharded_list("scales", prefixes, dim=0)
codebooks = self.get_sharded_list("codebooks", prefixes, dim=0)
codes = self.get_sharded_list("codes", prefixes, dim=0)
weight = (scales, codebooks, codes, nbits_per_codebook, num_codebooks, out_group_size, in_group_size)
else:
w = self.get_sharded_list("weight", prefixes, dim=0)
weight = torch.cat(w, dim=dim)
@@ -290,6 +296,12 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
g_idx = None
use_exllama = False
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
elif quantize == "aqlm":
nbits_per_codebook, num_codebooks, out_group_size, in_group_size = self._get_aqlm_params()
scales = self.get_sharded(f"{prefix}.scales", dim=1)
codebooks = self.get_sharded(f"{prefix}.codebooks", dim=1)
codes = self.get_sharded(f"{prefix}.codes", dim=1)
weight = (scales, codebooks, codes, nbits_per_codebook, num_codebooks, out_group_size, in_group_size)
else:
weight = self.get_sharded(f"{prefix}.weight", dim=1)
return weight
@@ -342,6 +354,22 @@ def _set_gptq_params(self, model_id):
self.gptq_groupsize = data["q_group_size"]
except Exception:
pass

def _get_aqlm_params(self) -> Tuple[int, int, int, int]:
return self.nbits_per_codebook, self.num_codebooks, self.out_group_size, self.in_group_size

def _set_aqlm_params(self, model_id):
filename = "config.json"
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(model_id, filename=filename)
with open(filename, "r") as f:
data = json.load(f)
self.nbits_per_codebook = data["aqlm"]["nbits_per_codebook"]
self.num_codebooks = data["aqlm"]["num_codebooks"]
self.out_group_size = data["aqlm"]["out_group_size"]
self.in_group_size = data["aqlm"]["in_group_size"]

def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
block_size = size // world_size
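
For context, an illustrative sketch of the config.json shape that _set_aqlm_params reads. The key names come from the diff; the values are placeholders, and published AQLM checkpoints may instead carry these fields under transformers' quantization_config:

    import json

    # Hypothetical config.json contents matching the keys read by _set_aqlm_params.
    example_config = {
        "model_type": "llama",
        "aqlm": {
            "nbits_per_codebook": 16,
            "num_codebooks": 1,
            "out_group_size": 1,
            "in_group_size": 8,
        },
    }
    print(json.dumps(example_config, indent=2))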
3 changes: 2 additions & 1 deletion server/pyproject.toml
@@ -38,13 +38,14 @@ boto3 = "^1.28.34"
urllib3 = "<=1.26.18"
hqq = { version = "^0.1.2", optional = true }
stanford-stk = { version = "^0.7.0", markers = "sys_platform == 'linux'" }
aqlm = { version = "^1.0.0"}

Review comment: Proper backprop was added in aqlm==1.1.0, and bf16 support was added in aqlm==1.1.2.
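
A minimal sketch of a runtime guard reflecting the versions noted in this comment; the minimums come from the comment itself, not from lorax, and bumping the constraint above to aqlm = { version = "^1.1.2" } would achieve the same thing at install time:

    from importlib.metadata import version

    from packaging.version import Version

    installed = Version(version("aqlm"))
    if installed < Version("1.1.0"):
        raise RuntimeError("aqlm>=1.1.0 is required for proper backprop support")
    if installed < Version("1.1.2"):
        raise RuntimeError("aqlm>=1.1.2 is required for bf16 support")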


[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
bnb = ["bitsandbytes"]
peft = ["peft"]
quantize = ["texttable", "datasets", "accelerate", "hqq"]
quantize = ["texttable", "datasets", "accelerate", "hqq", "aqlm"]

[tool.poetry.group.dev.dependencies]
grpcio-tools = "^1.51.1"