Added Outlines logits processor for JSON schema validation #224

Merged 25 commits on Feb 12, 2024

.github/workflows/tests.yaml (2 additions, 1 deletion)
@@ -22,6 +22,7 @@ jobs:
env:
SCCACHE_GHA_ENABLED: "on"
RUSTC_WRAPPER: /usr/local/bin/sccache
+ RUST_BACKTRACE: 1
SCCACHE: 0.3.3

steps:
@@ -33,7 +34,7 @@
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
- toolchain: 1.70.0
+ toolchain: 1.74.0
override: true
components: rustfmt, clippy
- name: Install Protoc
Dockerfile (1 addition, 1 deletion)
@@ -218,7 +218,7 @@ COPY server/Makefile server/Makefile

RUN cd server && \
make gen-server && \
pip install ".[bnb, accelerate, quantize, peft]" --no-cache-dir
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir

# Install router
COPY --from=builder /usr/src/target/release/lorax-router /usr/local/bin/lorax-router
clients/python/lorax/__init__.py (1 addition, 1 deletion)
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.3.0"
__version__ = "0.3.1"

from lorax.client import Client, AsyncClient, MergedAdapters
clients/python/lorax/client.py (18 additions, 2 deletions)
@@ -3,7 +3,7 @@

from aiohttp import ClientSession, ClientTimeout
from pydantic import ValidationError
- from typing import Dict, Optional, List, AsyncIterator, Iterator
+ from typing import Any, Dict, Optional, List, AsyncIterator, Iterator

from lorax.types import (
StreamResponse,
@@ -79,6 +79,7 @@ def generate(
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
+ schema: Optional[Dict[str, Any]] = None,
decoder_input_details: bool = False,
) -> Response:
"""
@@ -124,6 +125,8 @@ def generate(
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+ schema (`Optional[Dict[str, Any]]`):
+     Optional JSON schema to validate the response
decoder_input_details (`bool`):
Return the decoder input token logprobs and ids

@@ -150,20 +153,22 @@
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
+ schema=json.dumps(schema) if schema is not None else None,
decoder_input_details=decoder_input_details,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)

resp = requests.post(
self.base_url,
- json=request.dict(),
+ json=request.dict(by_alias=True),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)

return Response(**payload[0])

def generate_stream(
@@ -185,6 +190,7 @@ def generate_stream(
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
+ schema: Optional[Dict[str, Any]] = None,
) -> Iterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens
@@ -227,6 +233,8 @@ def generate_stream(
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+ schema (`Optional[Dict[str, Any]]`):
+     Optional JSON schema to validate the response

Returns:
Iterator[StreamResponse]: stream of generated tokens
@@ -252,6 +260,7 @@
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
+ schema=json.dumps(schema) if schema is not None else None,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)

@@ -353,6 +362,7 @@ async def generate(
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
+ schema: Optional[Dict[str, Any]] = None,
decoder_input_details: bool = False,
) -> Response:
"""
@@ -398,6 +408,8 @@ async def generate(
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+ schema (`Optional[Dict[str, Any]]`):
+     Optional JSON schema to validate the response
decoder_input_details (`bool`):
Return the decoder input token logprobs and ids

@@ -425,6 +437,7 @@ async def generate(
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
+ schema=json.dumps(schema) if schema is not None else None,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)

@@ -457,6 +470,7 @@ async def generate_stream(
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
+ schema: Optional[Dict[str, Any]] = None,
) -> AsyncIterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens asynchronously
@@ -499,6 +513,8 @@ async def generate_stream(
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+ schema (`Optional[Dict[str, Any]]`):
+     Optional JSON schema to validate the response

Returns:
AsyncIterator[StreamResponse]: stream of generated tokens
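
Taken together, the client changes simply JSON-encode a schema dict and forward it as a request parameter. A minimal usage sketch (the endpoint URL, prompt, and person schema here are illustrative, not from the PR):

```python
from lorax import Client

client = Client("http://127.0.0.1:8080")  # assumes a locally running LoRAX server

# Illustrative schema: an object with two required fields.
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "age"],
}

response = client.generate(
    "Generate a person named Alice who is 30 years old:",
    max_new_tokens=64,
    schema=schema,  # JSON-encoded by the client via json.dumps(...)
)
print(response.generated_text)  # text constrained to match the schema
```

The same schema keyword applies to generate_stream and to both AsyncClient methods, per the hunks above.
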
clients/python/lorax/types.py (3 additions, 1 deletion)
@@ -1,5 +1,5 @@
from enum import Enum
- from pydantic import BaseModel, validator
+ from pydantic import BaseModel, validator, Field
from typing import Optional, List

from lorax.errors import ValidationError
@@ -98,6 +98,8 @@ class Parameters(BaseModel):
details: bool = False
# Get decoder input token logprobs and ids
decoder_input_details: bool = False
+ # Optional JSON schema string to constrain the generated text
+ json_schema: Optional[str] = Field(alias="schema")

@validator("adapter_id")
def valid_adapter_id(cls, v, values):
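
The model field is named json_schema (schema would shadow pydantic's BaseModel.schema method) but is aliased to schema on the wire, which is why the client now serializes with request.dict(by_alias=True). A small sketch of the difference, assuming Pydantic v1 (consistent with the validator import above):

```python
from typing import Optional
from pydantic import BaseModel, Field

class Parameters(BaseModel):
    json_schema: Optional[str] = Field(None, alias="schema")

p = Parameters(schema='{"type": "string"}')  # populated via the alias
print(p.dict())               # {'json_schema': '{"type": "string"}'}
print(p.dict(by_alias=True))  # {'schema': '{"type": "string"}'} (what the server expects)
```
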
clients/python/pyproject.toml (1 addition, 1 deletion)
@@ -3,7 +3,7 @@ name = "lorax-client"
packages = [
{include = "lorax"}
]
version = "0.3.0"
version = "0.3.1"
description = "LoRAX Python Client"
license = "Apache-2.0"
authors = ["Travis Addair <[email protected]>", "Olivier Dehaene <[email protected]>"]
docs/reference/openapi.json (6 additions)
@@ -829,6 +829,12 @@
"default": "false",
"example": true
},
"schema": {
"type": "string",
"default": "null",
"example": "{\"type\": \"string\", \"title\": \"response\"}",
"nullable": true
},
"adapter_id": {
"type": "string",
"nullable": true
proto/generate.proto (2 additions)
@@ -75,6 +75,8 @@ message NextTokenChooserParameters {
bool watermark = 8;
/// adapter to use with lora exchange
string adapter_id = 9;
+ /// JSON schema used for constrained decoding (Outlines)
+ optional string schema = 10;
}

message StoppingCriteriaParameters {
router/client/src/client.rs (1 addition)
@@ -126,6 +126,7 @@ impl Client {
repetition_penalty: 1.2,
watermark: true,
adapter_id: "".to_string(),
+ schema: None,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 2,
router/src/health.rs (1 addition)
@@ -45,6 +45,7 @@ impl Health {
repetition_penalty: 1.0,
watermark: false,
adapter_id: "".to_string(),
+ schema: None,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
router/src/lib.rs (10 additions)
@@ -249,6 +249,13 @@ pub(crate) struct GenerateParameters {
example = "null"
)]
pub seed: Option<u64>,
+ #[serde(default)]
+ #[schema(
+     nullable = true,
+     default = "null",
+     example = "{\"type\": \"string\", \"title\": \"response\"}"
+ )]
+ pub schema: Option<String>,
}

fn default_max_new_tokens() -> u32 {
@@ -277,6 +284,7 @@ fn default_parameters() -> GenerateParameters {
decoder_input_details: false,
apply_chat_template: false,
seed: None,
+ schema: None,
}
}

@@ -582,6 +590,7 @@ impl From<CompletionRequest> for CompatGenerateRequest {
decoder_input_details: req.logprobs.is_some(),
apply_chat_template: false,
seed: None,
+ schema: None,
},
stream: req.stream.unwrap_or(false),
}
@@ -616,6 +625,7 @@ impl From<ChatCompletionRequest> for CompatGenerateRequest {
decoder_input_details: false,
apply_chat_template: true,
seed: None,
+ schema: None,
},
stream: req.stream.unwrap_or(false),
}
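
On the router side, the schema travels as an Option<String> through GenerateParameters into NextTokenChooserParameters. Over HTTP that corresponds to a plain string field in the request body; a sketch of a raw request, assuming a default local deployment and the standard /generate route:

```python
import json
import requests

payload = {
    "inputs": "Respond with a single word:",
    "parameters": {
        "max_new_tokens": 16,
        # The schema is sent as a JSON-encoded string, matching the
        # Option<String> field above and the OpenAPI example earlier.
        "schema": json.dumps({"type": "string", "title": "response"}),
    },
}
resp = requests.post("http://127.0.0.1:8080/generate", json=payload)
print(resp.json()["generated_text"])
```
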
router/src/validation.rs (2 additions)
@@ -147,6 +147,7 @@ impl Validation {
adapter_parameters,
decoder_input_details,
apply_chat_template,
+ schema,
..
} = request.parameters;

@@ -273,6 +274,7 @@ impl Validation {
seed,
watermark,
adapter_id,
+ schema,
};
let stopping_parameters = StoppingCriteriaParameters {
max_new_tokens,
rust-toolchain.toml (1 addition, 1 deletion)
@@ -1,3 +1,3 @@
[toolchain]
channel = "1.70.0"
channel = "1.74.0"
components = ["rustfmt", "clippy"]
server/Makefile (1 addition, 1 deletion)
@@ -20,7 +20,7 @@ install: gen-server
pip install pip --upgrade
pip install torch==2.2.0
pip install -r requirements.txt
pip install -e ".[bnb, accelerate, quantize, peft]"
pip install -e ".[bnb, accelerate, quantize, peft, outlines]"

run-dev:
# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 lorax_server/cli.py serve gpt2
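
The server-side processor that actually consumes the schema ships with the new outlines extra and is not among the files loaded above. Conceptually, Outlines compiles the JSON schema into a finite-state machine over the tokenizer's vocabulary, and a logits processor walks that FSM during decoding, masking every token that would step outside the schema. A rough sketch of the idea, not the PR's implementation; the fsm method names mirror the Outlines FSM interface of that era but should be treated as assumptions:

```python
import math
import torch

class JsonSchemaLogitsProcessor:
    """Mask logits so only tokens accepted by a schema-derived FSM survive."""

    def __init__(self, fsm):
        self.fsm = fsm  # FSM compiled from the JSON schema (e.g. by Outlines)
        self.state = 0  # FSM start state

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        if input_ids.numel() > 0:
            # Advance the FSM with the most recently sampled token.
            self.state = self.fsm.next_state(self.state, input_ids[-1].item())
        allowed = self.fsm.allowed_token_ids(self.state)
        # Drive every disallowed token's logit to -inf before sampling.
        mask = torch.full_like(scores, -math.inf)
        mask[allowed] = 0.0
        return scores + mask
```
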