
Commit

max tokens
DavidKoleczek committed Feb 8, 2025
1 parent a36a40c commit c24d676
Showing 6 changed files with 176 additions and 148 deletions.
286 changes: 140 additions & 146 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "not-again-ai"
-version = "0.16.0"
+version = "0.16.1"
 description = "Designed to once and for all collect all the little things that come up over and over again in AI projects and put them in one place."
 authors = [
     { name = "DaveCoDev", email = "[email protected]" }
@@ -40,7 +40,7 @@ poetry-plugin-export = ">=1.8"
 
 [project.optional-dependencies]
 data = [
-    "playwright>=1.49",
+    "playwright>=1.50",
     "pytest-playwright>=0.7"
 ]
 llm = [
5 changes: 5 additions & 0 deletions src/not_again_ai/llm/chat_completion/providers/ollama_api.py
@@ -28,6 +28,7 @@
     "logit_bias": None,
     "top_logprobs": None,
     "presence_penalty": None,
+    "max_tokens": "num_predict",
 }


@@ -45,6 +46,10 @@ def validate(request: ChatCompletionRequest) -> None:
         logger.warning("Parameter 'stop' needs to be a string and not a list. It will be ignored.")
         request.stop = None
 
+    # Raise an error if both "max_tokens" and "max_completion_tokens" are provided
+    if request.max_tokens is not None and request.max_completion_tokens is not None:
+        raise ValueError("`max_tokens` and `max_completion_tokens` cannot both be provided.")
+
 
 def ollama_chat_completion(
     request: ChatCompletionRequest,
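Not part of the diff: a minimal sketch of how the new `"max_tokens": "num_predict"` mapping might be applied when translating a request into Ollama options. The map name `OLLAMA_PARAMETER_MAP` and the `convert_options` helper are assumptions made for illustration; only the mapping entry itself comes from this commit.

```python
# Illustrative sketch only: OLLAMA_PARAMETER_MAP and convert_options are assumed names.
# The commit itself only adds the "max_tokens" -> "num_predict" entry to the provider's map.
OLLAMA_PARAMETER_MAP: dict[str, str | None] = {
    "max_tokens": "num_predict",  # Ollama's name for the completion-length limit
}


def convert_options(request_fields: dict[str, int | float | None]) -> dict[str, int | float]:
    """Rename set (non-None) request fields to their Ollama option names, dropping the rest."""
    options: dict[str, int | float] = {}
    for name, value in request_fields.items():
        target = OLLAMA_PARAMETER_MAP.get(name)
        if value is None or target is None:
            continue
        options[target] = value
    return options


print(convert_options({"max_tokens": 100, "temperature": None}))  # {'num_predict': 100}
```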
4 changes: 4 additions & 0 deletions src/not_again_ai/llm/chat_completion/providers/openai_api.py
@@ -31,6 +31,10 @@ def validate(request: ChatCompletionRequest) -> None:
     if request.json_mode and request.structured_outputs is not None:
         raise ValueError("json_schema and json_mode cannot be used together.")
 
+    # Raise an error if both "max_tokens" and "max_completion_tokens" are provided
+    if request.max_tokens is not None and request.max_completion_tokens is not None:
+        raise ValueError("`max_tokens` and `max_completion_tokens` cannot both be provided.")
+
 
 def openai_chat_completion(
     request: ChatCompletionRequest,
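The same mutual-exclusion check lands in both the OpenAI and Ollama providers. Below is a rough, self-contained sketch of the behavior it enforces; `TokenLimits` is a hypothetical stand-in for the two relevant fields of `ChatCompletionRequest`, not the real model.

```python
from pydantic import BaseModel, Field


# Hypothetical, trimmed-down stand-in for ChatCompletionRequest: only the two
# fields involved in the new check are modeled here.
class TokenLimits(BaseModel):
    max_tokens: int | None = Field(default=None)
    max_completion_tokens: int | None = Field(default=None)


def validate(request: TokenLimits) -> None:
    # Mirrors the check this commit adds to both provider modules.
    if request.max_tokens is not None and request.max_completion_tokens is not None:
        raise ValueError("`max_tokens` and `max_completion_tokens` cannot both be provided.")


validate(TokenLimits(max_tokens=100))  # OK: only one limit is set
try:
    validate(TokenLimits(max_tokens=100, max_completion_tokens=100))
except ValueError as exc:
    print(exc)  # both limits set -> request is rejected
```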
5 changes: 5 additions & 0 deletions src/not_again_ai/llm/chat_completion/types.py
@@ -118,6 +118,11 @@ class ChatCompletionRequest(BaseModel):
     top_k: int | None = Field(default=None)
     min_p: float | None = Field(default=None)
 
+    max_tokens: int | None = Field(
+        default=None,
+        description="Sometimes `max_completion_tokens` is not correctly supported so we provide this as a fallback.",
+    )
+
 
 class ChatCompletionChoice(BaseModel):
     message: AssistantMessage
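The field description frames `max_tokens` as a fallback for cases where `max_completion_tokens` is not supported. A sketch of how a caller or provider might resolve the effective limit follows; `resolve_token_limit` is purely illustrative and not part of this commit, which only validates that the two fields are not set together.

```python
# Hypothetical helper illustrating the fallback semantics described on the new field.
def resolve_token_limit(max_completion_tokens: int | None, max_tokens: int | None) -> int | None:
    """Prefer max_completion_tokens; use max_tokens only when the newer field is unset."""
    if max_completion_tokens is not None:
        return max_completion_tokens
    return max_tokens


print(resolve_token_limit(None, 100))   # 100 -> fallback used
print(resolve_token_limit(256, None))   # 256 -> preferred field wins
print(resolve_token_limit(None, None))  # None -> no limit requested
```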
20 changes: 20 additions & 0 deletions tests/llm/chat_completion/test_chat_completion.py
@@ -789,6 +789,16 @@ def test_chat_completion_invalid_params(openai_aoai_client_fixture: Callable[...
     print(response.model_dump(mode="json", exclude_none=True))
 
 
+def test_chat_completion_max_tokens(openai_aoai_client_fixture: Callable[..., Any]) -> None:
+    request = ChatCompletionRequest(
+        model="gpt-4o-mini-2024-07-18",
+        messages=[UserMessage(content="What is the capital of France?")],
+        max_tokens=100,
+    )
+    response = chat_completion(request, "openai", openai_aoai_client_fixture)
+    print(response.model_dump(mode="json", exclude_none=True))
+
+
 # region OpenAI
 @pytest.fixture(
     params=[
@@ -1059,4 +1069,14 @@ def test_chat_completion_ollama_vision_multiple_messages(ollama_client_fixture:
     print(response.model_dump(mode="json", exclude_none=True))
 
 
+def test_chat_completion_ollama_max_tokens(ollama_client_fixture: Callable[..., Any]) -> None:
+    request = ChatCompletionRequest(
+        model="llama3.2-vision:11b-instruct-q4_K_M",
+        messages=[UserMessage(content="What is the capital of France?")],
+        max_tokens=100,
+    )
+    response = chat_completion(request, "ollama", ollama_client_fixture)
+    print(response.model_dump(mode="json", exclude_none=True))
+
+
 # endregion
