Pydantic v2 Migration #54

Merged · 7 commits · Feb 26, 2024
2 changes: 1 addition & 1 deletion .devcontainer/Dockerfile
@@ -1,2 +1,2 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 ffmpeg
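
The base image moves from CUDA 11.8 to 12.1. A quick sanity check from inside the rebuilt container, assuming a cu121 build of PyTorch is installed (illustrative, not part of the PR):

import torch

print(torch.__version__)          # e.g. "2.1.2+cu121" for a CUDA 12.1 wheel
print(torch.version.cuda)         # expected to report "12.1"
print(torch.cuda.is_available())  # True when a GPU is visible to the container
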
12 changes: 6 additions & 6 deletions aana/api/api_generation.py
@@ -7,7 +7,7 @@
from fastapi.responses import StreamingResponse
from mobius_pipeline.node.socket import Socket
from mobius_pipeline.pipeline.pipeline import Pipeline
-from pydantic import BaseModel, Field, ValidationError, create_model, parse_raw_as
+from pydantic import BaseModel, Field, ValidationError, create_model

from aana.api.app import custom_exception_handler
from aana.api.responses import AanaJSONResponse
@@ -237,9 +237,9 @@ def get_file_upload_field(
continue

# check if pydantic model has file_upload field and it's set to True
-file_upload_enabled = getattr(data_model.Config, "file_upload", False)
-file_upload_description = getattr(
-    data_model.Config, "file_upload_description", ""
+file_upload_enabled = data_model.model_config.get("file_upload", False)
+file_upload_description = data_model.model_config.get(
+    "file_upload_description", ""
)

if file_upload_enabled and file_upload_field is None:
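
For context, a minimal sketch of the pattern this hunk migrates to: Pydantic v2 replaces the class-based Config with a model_config dict, which the endpoint generator reads back with plain .get() calls. The model below is hypothetical:

from pydantic import BaseModel


class VideoInput(BaseModel):
    """Hypothetical input model, for illustration only."""

    path: str | None = None

    # Pydantic v1 read these via getattr(data_model.Config, ...):
    #     class Config:
    #         file_upload = True
    # Pydantic v2 stores configuration in a plain dict, so custom keys
    # like these can ride along and be read back with .get().
    model_config = {
        "file_upload": True,
        "file_upload_description": "Upload a video file.",
    }


print(VideoInput.model_config.get("file_upload", False))  # True
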
@@ -330,7 +330,7 @@ def create_endpoint_func(  # noqa: C901

async def route_func_body(body: str, files: list[UploadFile] | None = None): # noqa: C901
# parse form data as a pydantic model and validate it
-data = parse_raw_as(RequestModel, body)
+data = RequestModel.model_validate_json(body)

# if the input requires file upload, add the files to the data
if file_upload_field and files:
@@ -341,7 +341,7 @@ async def route_func_body(body: str, files: list[UploadFile] | None = None):  # noqa: C901
# data.dict() will convert all nested models to dicts
# and we want to keep them as pydantic models
data_dict = {}
-for field_name in data.__fields__:
+for field_name in data.model_fields:
field_value = getattr(data, field_name)
data_dict[field_name] = field_value

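A hedged sketch of the two v1-to-v2 renames in this hunk: parse_raw_as(Model, raw) becomes Model.model_validate_json(raw), and __fields__ becomes model_fields. Field names below are illustrative:

from pydantic import BaseModel


class RequestModel(BaseModel):
    """Stand-in for the generated request model (hypothetical fields)."""

    prompt: str
    temperature: float = 0.0


body = '{"prompt": "hello"}'

# v1: data = parse_raw_as(RequestModel, body)
data = RequestModel.model_validate_json(body)

# v1: data.__fields__  ->  v2: model_fields (a dict of field name -> FieldInfo)
data_dict = {name: getattr(data, name) for name in RequestModel.model_fields}
print(data_dict)  # {'prompt': 'hello', 'temperature': 0.0}
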
4 changes: 2 additions & 2 deletions aana/api/app.py
@@ -28,7 +28,7 @@ async def validation_exception_handler(request: Request, exc: ValidationError):
error="ValidationError",
message="Validation error",
data=exc.errors(),
-).dict(),
+).model_dump(),
)


@@ -77,7 +77,7 @@ def custom_exception_handler(request: Request | None, exc_raw: Exception):
status_code=status_code,
content=ExceptionResponseModel(
error=error, message=message, data=data, stacktrace=stacktrace
-).dict(),
+).model_dump(),
)


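Both handlers swap .dict() for .model_dump(), the Pydantic v2 spelling. A minimal sketch with a trimmed stand-in model:

from pydantic import BaseModel


class ExceptionResponseModel(BaseModel):
    """Trimmed stand-in for the real response model."""

    error: str
    message: str


resp = ExceptionResponseModel(error="ValidationError", message="Validation error")

# v1: resp.dict()  ->  v2: resp.model_dump()
# .dict() still exists in v2 but emits a deprecation warning at runtime.
print(resp.model_dump())  # {'error': 'ValidationError', 'message': 'Validation error'}
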
2 changes: 1 addition & 1 deletion aana/api/request_handler.py
@@ -11,7 +11,7 @@
# TODO: improve type annotations


-@serve.deployment(route_prefix="/", num_replicas=1, ray_actor_options={"num_cpus": 0.1})
+@serve.deployment(ray_actor_options={"num_cpus": 0.1})
@serve.ingress(app)
class RequestHandler:
"""This class is used to handle requests to the Aana application."""
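The decorator drops route_prefix (and the default num_replicas=1) because recent Ray Serve versions moved the prefix from @serve.deployment to serve.run. A sketch of the likely wiring, assuming Ray Serve 2.x:

from fastapi import FastAPI
from ray import serve

app = FastAPI()


@serve.deployment(ray_actor_options={"num_cpus": 0.1})
@serve.ingress(app)
class RequestHandler:
    """Sketch of the deployment above."""


# The prefix now belongs to the application, not the deployment, and
# num_replicas=1 is simply the default:
serve.run(RequestHandler.bind(), route_prefix="/")
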
3 changes: 2 additions & 1 deletion aana/configs/db.py
@@ -1,11 +1,12 @@
from enum import Enum
from os import PathLike
from pathlib import Path
-from typing import TypeAlias, TypedDict
+from typing import TypeAlias

from alembic import command
from alembic.config import Config
from sqlalchemy import String, TypeDecorator, create_engine
+from typing_extensions import TypedDict

from aana.models.pydantic.media_id import MediaId

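typing_extensions.TypedDict is needed because Pydantic v2 refuses typing.TypedDict on Python < 3.12. A version-tolerant sketch (the DBConfig shape shown is illustrative only):

import sys

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    # On Python < 3.12, Pydantic v2 rejects typing.TypedDict and asks for
    # the typing_extensions version instead.
    from typing_extensions import TypedDict


class DBConfig(TypedDict):
    """Illustrative shape only; the real DBConfig lives in this module."""

    datastore_type: str
    datastore_config: dict
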
9 changes: 5 additions & 4 deletions aana/configs/deployments.py
@@ -22,12 +22,13 @@
model="TheBloke/Llama-2-7b-Chat-AWQ",
dtype="auto",
quantization="awq",
-gpu_memory_reserved=10000,
+gpu_memory_reserved=13000,
+enforce_eager=True,
default_sampling_params=SamplingParams(
temperature=0.0, top_p=1.0, top_k=-1, max_tokens=1024
),
chat_template="llama2",
-).dict(),
+).model_dump(),
),
"hf_blip2_deployment_opt_2_7b": HFBlip2Deployment.options(
num_replicas=1,
@@ -38,7 +39,7 @@
dtype=Dtype.FLOAT16,
batch_size=2,
num_processing_threads=2,
-).dict(),
+).model_dump(),
),
"whisper_deployment_medium": WhisperDeployment.options(
num_replicas=1,
@@ -47,7 +48,7 @@
user_config=WhisperConfig(
model_size=WhisperModelSize.MEDIUM,
compute_type=WhisperComputeType.FLOAT16,
-).dict(),
+).model_dump(),
),
"stablediffusion2_deployment": StableDiffusion2Deployment.options(
num_replicas=1,
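Each entry validates a typed config and serializes it with model_dump() (v1: .dict()) because Ray Serve's user_config must be a plain, JSON-serializable dict. A hedged sketch of one entry, assuming these aana import paths (VLLMDeployment's exact name is inferred from the module):

from aana.deployments.vllm_deployment import VLLMConfig, VLLMDeployment
from aana.models.pydantic.sampling_params import SamplingParams

config = VLLMConfig(
    model="TheBloke/Llama-2-7b-Chat-AWQ",
    dtype="auto",
    quantization="awq",
    gpu_memory_reserved=13000,
    enforce_eager=True,
    default_sampling_params=SamplingParams(
        temperature=0.0, top_p=1.0, top_k=-1, max_tokens=1024
    ),
    chat_template="llama2",
)

# The typed config is validated once, then flattened for Ray Serve.
deployment = VLLMDeployment.options(num_replicas=1, user_config=config.model_dump())
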
6 changes: 3 additions & 3 deletions aana/configs/settings.py
@@ -1,6 +1,6 @@
from pathlib import Path

-from pydantic import BaseSettings
+from pydantic_settings import BaseSettings

from aana.configs.db import DBConfig

@@ -17,8 +17,8 @@ class Settings(BaseSettings):
"""A pydantic model for SDK settings."""

tmp_data_dir: Path = Path("/tmp/aana_data") # noqa: S108
-image_dir = tmp_data_dir / "images"
-video_dir = tmp_data_dir / "videos"
+image_dir: Path = tmp_data_dir / "images"
+video_dir: Path = tmp_data_dir / "videos"
num_workers: int = 2

db_config: DBConfig = {
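Two v2 changes meet in this file: BaseSettings now lives in the separate pydantic-settings package, and every field needs an explicit type annotation. A minimal sketch:

from pathlib import Path

from pydantic_settings import BaseSettings  # v2: moved out of pydantic core


class Settings(BaseSettings):
    tmp_data_dir: Path = Path("/tmp/aana_data")
    # v2 requires the explicit annotation added in this hunk: a bare
    # `image_dir = tmp_data_dir / "images"` raises
    # "A non-annotated attribute was detected" at class creation time.
    image_dir: Path = tmp_data_dir / "images"


print(Settings().image_dir)  # /tmp/aana_data/images
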
3 changes: 2 additions & 1 deletion aana/deployments/hf_blip2_deployment.py
@@ -1,10 +1,11 @@
-from typing import Any, TypedDict
+from typing import Any

import torch
import transformers
from pydantic import BaseModel, Field
from ray import serve
from transformers import Blip2ForConditionalGeneration, Blip2Processor
+from typing_extensions import TypedDict

from aana.deployments.base_deployment import BaseDeployment
from aana.exceptions.general import InferenceException
23 changes: 17 additions & 6 deletions aana/deployments/vllm_deployment.py
@@ -1,20 +1,26 @@
+import contextlib
from collections.abc import AsyncGenerator
-from typing import Any, TypedDict
+from typing import Any

from pydantic import BaseModel, Field
from ray import serve
+from typing_extensions import TypedDict
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.model_executor.utils import set_random_seed

+with contextlib.suppress(ImportError):
+    from vllm.model_executor.utils import (
+        set_random_seed,  # Ignore if we don't have GPU and only run on CPU with test cache
+    )
from vllm.sampling_params import SamplingParams as VLLMSamplingParams
-from vllm.utils import get_gpu_memory, random_uuid
+from vllm.utils import random_uuid

from aana.deployments.base_deployment import BaseDeployment
from aana.exceptions.general import InferenceException, PromptTooLongException
from aana.models.pydantic.chat_message import ChatDialog, ChatMessage
from aana.models.pydantic.sampling_params import SamplingParams
from aana.utils.chat_template import apply_chat_template
-from aana.utils.general import merged_options
+from aana.utils.general import get_gpu_memory, merged_options
from aana.utils.test import test_cache

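The guarded import keeps the module importable on CPU-only machines. The same shape, with the trade-off spelled out (a sketch, not PR code):

import contextlib

# Swallowing the ImportError lets CPU-only environments (e.g. tests running
# against the cached backend) import the module, but set_random_seed is then
# simply undefined, so any caller must stay on the GPU-only code path
# (otherwise NameError replaces the original ImportError).
with contextlib.suppress(ImportError):
    from vllm.model_executor.utils import set_random_seed
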

Expand All @@ -28,6 +34,9 @@ class VLLMConfig(BaseModel):
gpu_memory_reserved (float): the GPU memory reserved for the model in mb
default_sampling_params (SamplingParams): the default sampling parameters.
max_model_len (int): the maximum generated text length in tokens (optional, default: None)
+chat_template (str): the name of the chat template, if not provided, the chat template from the model will be used
+    but some models may not have a chat template (optional, default: None)
+enforce_eager (bool): whether to enforce eager execution (optional, default: False)
"""

model: str
@@ -37,6 +46,7 @@
default_sampling_params: SamplingParams
max_model_len: int | None = Field(default=None)
chat_template: str | None = Field(default=None)
+enforce_eager: bool | None = Field(default=False)


class LLMOutput(TypedDict):
@@ -107,6 +117,7 @@ async def apply_config(self, config: dict[str, Any]):
model=config_obj.model,
dtype=config_obj.dtype,
quantization=config_obj.quantization,
+enforce_eager=config_obj.enforce_eager,
gpu_memory_utilization=self.gpu_memory_utilization,
max_model_len=config_obj.max_model_len,
)
@@ -116,7 +127,7 @@ async def apply_config(self, config: dict[str, Any]):

# create the engine
self.engine = AsyncLLMEngine.from_engine_args(args)
-self.tokenizer = self.engine.engine.tokenizer
+self.tokenizer = self.engine.engine.tokenizer.tokenizer
self.model_config = await self.engine.get_model_config()

@test_cache
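
The extra .tokenizer hop reflects newer vLLM wrapping the engine tokenizer in a TokenizerGroup. A version-tolerant helper one could use instead (an assumption, not what the PR does):

def unwrap_tokenizer(engine_tokenizer):
    """Return the underlying HF tokenizer whether or not vLLM wraps it in a
    TokenizerGroup (newer vLLM versions do); a hedged helper, not PR code."""
    return getattr(engine_tokenizer, "tokenizer", engine_tokenizer)


# usage sketch: self.tokenizer = unwrap_tokenizer(self.engine.engine.tokenizer)
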
@@ -148,7 +159,7 @@ async def generate_stream(
try:
# convert SamplingParams to VLLMSamplingParams
sampling_params_vllm = VLLMSamplingParams(
-**sampling_params.dict(exclude_unset=True)
+**sampling_params.model_dump(exclude_unset=True)
)
# start the request
request_id = random_uuid()
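exclude_unset forwards only the fields the caller actually set, so vLLM's own defaults cover the rest. A minimal sketch with a simplified stand-in model:

from pydantic import BaseModel


class SamplingParams(BaseModel):
    """Simplified stand-in for aana's sampling-params model."""

    temperature: float = 1.0
    top_p: float = 1.0
    max_tokens: int | None = None


params = SamplingParams(temperature=0.2)

# Only the caller-supplied field survives; v1 spelled this
# params.dict(exclude_unset=True).
print(params.model_dump(exclude_unset=True))  # {'temperature': 0.2}
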
7 changes: 4 additions & 3 deletions aana/deployments/whisper_deployment.py
@@ -1,11 +1,12 @@
from collections.abc import AsyncGenerator
from enum import Enum
-from typing import Any, TypedDict, cast
+from typing import Any, cast

import torch
from faster_whisper import WhisperModel
from pydantic import BaseModel, Field
from ray import serve
+from typing_extensions import TypedDict

from aana.deployments.base_deployment import BaseDeployment
from aana.exceptions.general import InferenceException
@@ -161,7 +162,7 @@ async def transcribe(
params = WhisperParams()
media_path: str = str(media.path)
try:
-segments, info = self.model.transcribe(media_path, **params.dict())
+segments, info = self.model.transcribe(media_path, **params.model_dump())
except Exception as e:
raise InferenceException(self.model_name) from e

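Both transcribe paths expand the validated params into keyword arguments. A sketch with a trimmed stand-in (beam_size and language are real faster-whisper options, but the model here is illustrative):

from pydantic import BaseModel


class WhisperParams(BaseModel):
    """Trimmed stand-in for the real WhisperParams model."""

    language: str | None = None
    beam_size: int = 5


params = WhisperParams()

# model_dump() turns the validated model into plain kwargs for
# faster-whisper's transcribe(); v1 spelled this **params.dict().
kwargs = params.model_dump()
print(kwargs)  # {'language': None, 'beam_size': 5}
# e.g. segments, info = model.transcribe(media_path, **kwargs)
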
@@ -196,7 +197,7 @@ async def transcribe_stream(
params = WhisperParams()
media_path: str = str(media.path)
try:
-segments, info = self.model.transcribe(media_path, **params.dict())
+segments, info = self.model.transcribe(media_path, **params.model_dump())
except Exception as e:
raise InferenceException(self.model_name) from e

2 changes: 1 addition & 1 deletion aana/models/db/transcript.py
@@ -69,5 +69,5 @@ def from_asr_output(
language=info.language,
language_confidence=info.language_confidence,
transcript=transcription.text,
-segments=[s.dict() for s in segments],
+segments=[s.model_dump() for s in segments],
)