
Commit

max tokens
DavidKoleczek committed Feb 8, 2025
1 parent a36a40c commit c24d676
Showing 6 changed files with 176 additions and 148 deletions.
286 changes: 140 additions & 146 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "not-again-ai"
-version = "0.16.0"
+version = "0.16.1"
 description = "Designed to once and for all collect all the little things that come up over and over again in AI projects and put them in one place."
 authors = [
     { name = "DaveCoDev", email = "[email protected]" }
@@ -40,7 +40,7 @@ poetry-plugin-export = ">=1.8"
 
 [project.optional-dependencies]
 data = [
-    "playwright>=1.49",
+    "playwright>=1.50",
     "pytest-playwright>=0.7"
 ]
 llm = [
5 changes: 5 additions & 0 deletions src/not_again_ai/llm/chat_completion/providers/ollama_api.py
@@ -28,6 +28,7 @@
     "logit_bias": None,
     "top_logprobs": None,
     "presence_penalty": None,
+    "max_tokens": "num_predict",
 }


@@ -45,6 +46,10 @@ def validate(request: ChatCompletionRequest) -> None:
         logger.warning("Parameter 'stop' needs to be a string and not a list. It will be ignored.")
         request.stop = None
 
+    # Raise an error if both "max_tokens" and "max_completion_tokens" are provided
+    if request.max_tokens is not None and request.max_completion_tokens is not None:
+        raise ValueError("`max_tokens` and `max_completion_tokens` cannot both be provided.")
+
 
 def ollama_chat_completion(
     request: ChatCompletionRequest,
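Not part of the diff: a minimal sketch of how the new `"max_tokens": "num_predict"` mapping might be applied when translating a request into Ollama options. The map name `OLLAMA_PARAMETER_MAP` and the `convert_options` helper are assumptions made for illustration; only the mapping entry itself comes from this commit.

```python
# Illustrative sketch only: OLLAMA_PARAMETER_MAP and convert_options are assumed names.
# The commit itself only adds the "max_tokens" -> "num_predict" entry to the provider's map.
OLLAMA_PARAMETER_MAP: dict[str, str | None] = {
    "max_tokens": "num_predict",  # Ollama's name for the completion-length limit
}


def convert_options(request_fields: dict[str, int | float | None]) -> dict[str, int | float]:
    """Rename set (non-None) request fields to their Ollama option names, dropping the rest."""
    options: dict[str, int | float] = {}
    for name, value in request_fields.items():
        target = OLLAMA_PARAMETER_MAP.get(name)
        if value is None or target is None:
            continue
        options[target] = value
    return options


print(convert_options({"max_tokens": 100, "temperature": None}))  # {'num_predict': 100}
```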
4 changes: 4 additions & 0 deletions src/not_again_ai/llm/chat_completion/providers/openai_api.py
@@ -31,6 +31,10 @@ def validate(request: ChatCompletionRequest) -> None:
     if request.json_mode and request.structured_outputs is not None:
         raise ValueError("json_schema and json_mode cannot be used together.")
 
+    # Raise an error if both "max_tokens" and "max_completion_tokens" are provided
+    if request.max_tokens is not None and request.max_completion_tokens is not None:
+        raise ValueError("`max_tokens` and `max_completion_tokens` cannot both be provided.")
+
 
 def openai_chat_completion(
     request: ChatCompletionRequest,
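The same mutual-exclusion check lands in both the OpenAI and Ollama providers. Below is a rough, self-contained sketch of the behavior it enforces; `TokenLimits` is a hypothetical stand-in for the two relevant fields of `ChatCompletionRequest`, not the real model.

```python
from pydantic import BaseModel, Field


# Hypothetical, trimmed-down stand-in for ChatCompletionRequest: only the two
# fields involved in the new check are modeled here.
class TokenLimits(BaseModel):
    max_tokens: int | None = Field(default=None)
    max_completion_tokens: int | None = Field(default=None)


def validate(request: TokenLimits) -> None:
    # Mirrors the check this commit adds to both provider modules.
    if request.max_tokens is not None and request.max_completion_tokens is not None:
        raise ValueError("`max_tokens` and `max_completion_tokens` cannot both be provided.")


validate(TokenLimits(max_tokens=100))  # OK: only one limit is set
try:
    validate(TokenLimits(max_tokens=100, max_completion_tokens=100))
except ValueError as exc:
    print(exc)  # both limits set -> request is rejected
```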
5 changes: 5 additions & 0 deletions src/not_again_ai/llm/chat_completion/types.py
@@ -118,6 +118,11 @@ class ChatCompletionRequest(BaseModel):
     top_k: int | None = Field(default=None)
     min_p: float | None = Field(default=None)
 
+    max_tokens: int | None = Field(
+        default=None,
+        description="Sometimes `max_completion_tokens` is not correctly supported so we provide this as a fallback.",
+    )
+
 
 class ChatCompletionChoice(BaseModel):
     message: AssistantMessage
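The field description frames `max_tokens` as a fallback for cases where `max_completion_tokens` is not supported. A sketch of how a caller or provider might resolve the effective limit follows; `resolve_token_limit` is purely illustrative and not part of this commit, which only validates that the two fields are not set together.

```python
# Hypothetical helper illustrating the fallback semantics described on the new field.
def resolve_token_limit(max_completion_tokens: int | None, max_tokens: int | None) -> int | None:
    """Prefer max_completion_tokens; use max_tokens only when the newer field is unset."""
    if max_completion_tokens is not None:
        return max_completion_tokens
    return max_tokens


print(resolve_token_limit(None, 100))   # 100 -> fallback used
print(resolve_token_limit(256, None))   # 256 -> preferred field wins
print(resolve_token_limit(None, None))  # None -> no limit requested
```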
20 changes: 20 additions & 0 deletions tests/llm/chat_completion/test_chat_completion.py
@@ -789,6 +789,16 @@ def test_chat_completion_invalid_params(openai_aoai_client_fixture: Callable[...
     print(response.model_dump(mode="json", exclude_none=True))
 
 
+def test_chat_completion_max_tokens(openai_aoai_client_fixture: Callable[..., Any]) -> None:
+    request = ChatCompletionRequest(
+        model="gpt-4o-mini-2024-07-18",
+        messages=[UserMessage(content="What is the capital of France?")],
+        max_tokens=100,
+    )
+    response = chat_completion(request, "openai", openai_aoai_client_fixture)
+    print(response.model_dump(mode="json", exclude_none=True))
+
+
 # region OpenAI
 @pytest.fixture(
     params=[
@@ -1059,4 +1069,14 @@ def test_chat_completion_ollama_vision_multiple_messages(ollama_client_fixture:
     print(response.model_dump(mode="json", exclude_none=True))
 
 
+def test_chat_completion_ollama_max_tokens(ollama_client_fixture: Callable[..., Any]) -> None:
+    request = ChatCompletionRequest(
+        model="llama3.2-vision:11b-instruct-q4_K_M",
+        messages=[UserMessage(content="What is the capital of France?")],
+        max_tokens=100,
+    )
+    response = chat_completion(request, "ollama", ollama_client_fixture)
+    print(response.model_dump(mode="json", exclude_none=True))
+
+
 # endregion
