Chore/postgres #77

Merged
18 commits, merged Jan 6, 2025

Commits
593b781
Configure PostgreSQL database in .env.example and add db service to d…
srijanpatel Jan 5, 2025
ea1846e
use postgres in database.py
srijanpatel Jan 5, 2025
83649cf
use .env.example always as base env in b/e and f/e
srijanpatel Jan 5, 2025
70094b6
Update alembic.ini to use environment variables for PostgreSQL databa…
srijanpatel Jan 5, 2025
ed08f07
drop git ignore, make migration scripts part of the repo
srijanpatel Jan 5, 2025
fc19c17
Add psycopg2-binary to requirements for PostgreSQL support
srijanpatel Jan 5, 2025
908f308
add unique constraint to id field
srijanpatel Jan 5, 2025
fae8f70
use string id for foreign constraint on run model to workflow version…
srijanpatel Jan 5, 2025
58aa667
Add generate_migrations.sh script for automated Alembic migration gen…
srijanpatel Jan 5, 2025
81e3a74
Refactor entrypoint.sh to streamline Alembic migration process by rem…
srijanpatel Jan 6, 2025
f927f90
remove redundant index=True from id fields, use unique constraint only
srijanpatel Jan 6, 2025
b9a83b6
Add initial database migration script for postgres, generated using a…
srijanpatel Jan 6, 2025
5244f56
Update .env.example to reflect PostgreSQL configuration and remove SQ…
srijanpatel Jan 6, 2025
60c233b
use env files directly in docker compose, update prod docker compose
srijanpatel Jan 6, 2025
e83aed9
Merge remote-tracking branch 'origin/main' into chore/postgres
srijanpatel Jan 6, 2025
b18bf4d
Update Docker Compose configurations to use 'on-failure' restart poli…
srijanpatel Jan 6, 2025
0edbfd9
Remove unused 'node_modules' volume from Docker Compose configuration
srijanpatel Jan 6, 2025
087f536
Update README
srijanpatel Jan 6, 2025
14 changes: 7 additions & 7 deletions .env.example
@@ -28,14 +28,14 @@ NGINX_PORT=6080
# ======================
# Database Settings
# ======================
# SQLite Configuration
# Change this to specify a custom SQLite database location. Only specify the directory path.
# The database file name will be db.sqlite and will be created automatically.
SQLITE_DB_PATH=./backend/sqlite/

# PySpur uses PostgreSQL as the database. By default, the database is hosted in a separate container.
# If you want to use an external database, you can provide the connection details here.
# PostgreSQL Configuration
# Specify a complete database connection string to use PostgreSQL instead of SQLite
# DATABASE_URL=postgresql://user:password@host:port/dbname
POSTGRES_DB=pyspur
POSTGRES_USER=pyspur
POSTGRES_PASSWORD=pyspur
POSTGRES_HOST=db
POSTGRES_PORT=5432


# ======================
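The `POSTGRES_*` variables above replace the old SQLite settings and are consumed both by the `db` service in Docker Compose and by the backend. As a quick sanity check (not part of this PR), the connection can be verified with `psycopg2`, which this PR adds to the requirements; the defaults mirror `.env.example`, and `localhost` is assumed when running outside Compose:

```
# Sketch, not part of the PR: verify the PostgreSQL connection using the same
# variables. Assumes psycopg2-binary (added by this PR) is installed and the
# database container (or an external server) is reachable.
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()  # outside Docker Compose, POSTGRES_HOST usually needs to be "localhost"

conn = psycopg2.connect(
    dbname=os.getenv("POSTGRES_DB", "pyspur"),
    user=os.getenv("POSTGRES_USER", "pyspur"),
    password=os.getenv("POSTGRES_PASSWORD", "pyspur"),
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=os.getenv("POSTGRES_PORT", "5432"),
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT version();")
    print(cur.fetchone()[0])
conn.close()
```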
5 changes: 2 additions & 3 deletions README.md
@@ -69,7 +69,7 @@ You can launch PySpur using pre-built docker images in the following steps:
docker compose -f ./docker-compose.prod.yml up --build -d
```

This will start a local instance of PySpur that will store spurs in a local sqlite database (or your database if you provided it in .env file in step 2)
This will start a local instance of PySpur that will store spurs and other state information in a postgres database. A local postgres service is used by default. Override `POSTGRES_*` variables in the `.env` file to use an external postgres database.

4. **Access the portal:**

@@ -105,8 +105,7 @@ The steps for dev setup are same as above, except for step 3: we launch the app
docker compose up --build -d
```

This will start a local instance of PySpur that will store spurs and their runs in a local SQLite file.
Note: For some environments you may want to try: ```sudo docker compose up --build -d```.
This will start a local instance of PySpur that will store spurs and other state information in a postgres database. A local postgres service is used by default. Override `POSTGRES_*` variables in the `.env` file to use an external postgres database.


# 🦙 Using PySpur with Ollama (Local Models)
2 changes: 1 addition & 1 deletion backend/alembic.ini
@@ -61,7 +61,7 @@ version_path_separator = os # Use os.pathsep. Default configuration used for ne
# are written from script.py.mako
# output_encoding = utf-8

sqlalchemy.url = driver://user:pass@localhost/dbname
sqlalchemy.url = postgresql://%(POSTGRES_USER)s:%(POSTGRES_PASSWORD)s@%(POSTGRES_HOST)s:%(POSTGRES_PORT)s/%(POSTGRES_DB)s


[post_write_hooks]
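`sqlalchemy.url` now relies on configparser interpolation, so the `%(POSTGRES_*)s` placeholders must be defined before Alembic reads the option. The repository's `env.py` is not part of this diff; a minimal sketch of the usual pattern, assuming the variables are exported (for example via the Compose env files):

```
# Sketch of the standard env.py pattern, not the repository's exact code:
# push the POSTGRES_* environment variables into the Alembic config so the
# %(...)s placeholders in alembic.ini resolve.
import os

from alembic import context

config = context.config

for var in ("POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_HOST", "POSTGRES_PORT", "POSTGRES_DB"):
    config.set_section_option(config.config_ini_section, var, os.environ.get(var, ""))

# config.get_main_option("sqlalchemy.url") now resolves to
# postgresql://<user>:<password>@<host>:<port>/<db>
```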
8 changes: 4 additions & 4 deletions backend/app/api/workflow_run.py
@@ -34,7 +34,7 @@

async def create_run_model(
workflow_id: str,
workflow_version_id: int,
workflow_version_id: str,
initial_inputs: Dict[str, Dict[str, Any]],
parent_run_id: Optional[str],
run_type: str,
@@ -75,7 +75,7 @@ async def run_workflow_blocking(
initial_inputs = request.initial_inputs or {}
new_run = await create_run_model(
workflow_id,
workflow_version._intid, # type: ignore
workflow_version.id,
initial_inputs,
request.parent_run_id,
run_type,
@@ -132,7 +132,7 @@ async def run_workflow_non_blocking(
initial_inputs = start_run_request.initial_inputs or {}
new_run = await create_run_model(
workflow_id,
workflow_version._intid, # type: ignore
workflow_version.id,
initial_inputs,
start_run_request.parent_run_id,
run_type,
@@ -231,7 +231,7 @@ async def batch_run_workflow_non_blocking(

dataset_id = request.dataset_id
new_run = await create_run_model(
workflow_id, workflow_version._intid, {}, None, "batch", db # type: ignore
workflow_id, workflow_version.id, {}, None, "batch", db
)

# parse the dataset
18 changes: 10 additions & 8 deletions backend/app/database.py
@@ -1,17 +1,19 @@
import os
from dotenv import load_dotenv

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

load_dotenv()

# Get the database URL from the environment
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///sqlite/db.sqlite")

# Create SQLite database file if it doesn't exist and using SQLite
if DATABASE_URL.startswith("sqlite:///"):
db_path = DATABASE_URL.replace("sqlite:///", "")
os.makedirs(os.path.dirname(db_path), exist_ok=True)
if not os.path.exists(db_path):
open(db_path, "a").close()
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST = os.getenv("POSTGRES_HOST")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")
POSTGRES_DB = os.getenv("POSTGRES_DB")

DATABASE_URL = f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"

# Create the SQLAlchemy engine
engine = create_engine(DATABASE_URL)
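The module now builds the connection URL unconditionally from the five `POSTGRES_*` variables; the SQLite file-creation branch is gone. The rest of the file is truncated in this diff, but the usual continuation of this pattern is a session factory plus a request-scoped dependency, roughly as below (a sketch under that assumption, not the file's verbatim contents):

```
# Sketch of the typical continuation; the actual lines are truncated in this diff.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


def get_db():
    """Yield a session bound to the PostgreSQL engine and close it afterwards."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
```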
4 changes: 2 additions & 2 deletions backend/app/models/dataset_model.py
@@ -8,9 +8,9 @@
class DatasetModel(BaseModel):
__tablename__ = "datasets"

_intid: Mapped[int] = mapped_column(Integer, primary_key=True)
_intid: Mapped[int] = mapped_column(Integer, primary_key=True, unique=True)
id: Mapped[str] = mapped_column(
String, Computed("'DS' || _intid"), nullable=False, index=True
String, Computed("'DS' || _intid"), nullable=False, unique=True
)
name: Mapped[str] = mapped_column(String, unique=True, nullable=False)
description: Mapped[Optional[str]] = mapped_column(String)
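The pattern here (and in the other models below) swaps `index=True` for `unique=True` on the computed `id` column: PostgreSQL backs a unique constraint with a unique index, so a separate plain index would be redundant. A hedged way to see the resulting DDL, with the import path assumed from the repository layout:

```
# Sketch, not part of the PR: print the PostgreSQL DDL produced by the
# computed-id + unique-constraint pattern. The import path is assumed.
from sqlalchemy.dialects import postgresql
from sqlalchemy.schema import CreateTable

from app.models.dataset_model import DatasetModel

print(CreateTable(DatasetModel.__table__).compile(dialect=postgresql.dialect()))
# Expected shape: id VARCHAR GENERATED ALWAYS AS ('DS' || _intid) STORED,
# with UNIQUE constraints on _intid and id instead of plain indexes.
```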
2 changes: 1 addition & 1 deletion backend/app/models/eval_run_model.py
@@ -25,7 +25,7 @@ class EvalRunModel(BaseModel):

_intid: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement="auto")
id: Mapped[str] = mapped_column(
String, Computed("'ER' || _intid"), nullable=False, index=True
String, Computed("'ER' || _intid"), nullable=False, unique=True
)
eval_name: Mapped[str] = mapped_column(String, nullable=False)
workflow_id: Mapped[str] = mapped_column(String, nullable=False)
2 changes: 0 additions & 2 deletions backend/app/models/management/alembic/.gitignore

This file was deleted.

150 changes: 150 additions & 0 deletions backend/app/models/management/alembic/versions/000_init_db.py
@@ -0,0 +1,150 @@
"""init_db

Revision ID: 000
Revises:
Create Date: 2025-01-06 00:42:14.253167

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '000'
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('datasets',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'DS' || _intid", ), nullable=False),
sa.Column('name', sa.String(), nullable=False),
sa.Column('description', sa.String(), nullable=True),
sa.Column('file_path', sa.String(), nullable=False),
sa.Column('uploaded_at', sa.DateTime(), nullable=False),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('_intid'),
sa.UniqueConstraint('id'),
sa.UniqueConstraint('name')
)
op.create_table('eval_runs',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'ER' || _intid", ), nullable=False),
sa.Column('eval_name', sa.String(), nullable=False),
sa.Column('workflow_id', sa.String(), nullable=False),
sa.Column('status', sa.Enum('PENDING', 'RUNNING', 'COMPLETED', 'FAILED', name='evalrunstatus'), nullable=False),
sa.Column('output_variable', sa.String(), nullable=False),
sa.Column('num_samples', sa.Integer(), nullable=False),
sa.Column('start_time', sa.DateTime(), nullable=True),
sa.Column('end_time', sa.DateTime(), nullable=True),
sa.Column('results', sa.JSON(), nullable=True),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('id')
)
op.create_table('output_files',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'OF' || _intid", ), nullable=False),
sa.Column('file_name', sa.String(), nullable=False),
sa.Column('file_path', sa.String(), nullable=False),
sa.Column('created_at', sa.DateTime(), nullable=False),
sa.Column('updated_at', sa.DateTime(), nullable=False),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('id')
)
op.create_table('workflows',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'S' || _intid", ), nullable=False),
sa.Column('name', sa.String(), nullable=False),
sa.Column('description', sa.String(), nullable=True),
sa.Column('definition', sa.JSON(), nullable=False),
sa.Column('created_at', sa.DateTime(), nullable=False),
sa.Column('updated_at', sa.DateTime(), nullable=False),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('id'),
sa.UniqueConstraint('name')
)
op.create_table('workflow_versions',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'SV' || _intid", ), nullable=False),
sa.Column('version', sa.Integer(), nullable=False),
sa.Column('workflow_id', sa.String(), nullable=False),
sa.Column('name', sa.String(), nullable=False),
sa.Column('description', sa.String(), nullable=True),
sa.Column('definition', sa.JSON(), nullable=False),
sa.Column('definition_hash', sa.String(), nullable=False),
sa.Column('created_at', sa.DateTime(), nullable=False),
sa.Column('updated_at', sa.DateTime(), nullable=False),
sa.ForeignKeyConstraint(['workflow_id'], ['workflows.id'], ),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('id')
)
op.create_index(op.f('ix_workflow_versions_version'), 'workflow_versions', ['version'], unique=True)
op.create_index(op.f('ix_workflow_versions_workflow_id'), 'workflow_versions', ['workflow_id'], unique=False)
op.create_table('runs',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'R' || _intid", ), nullable=False),
sa.Column('workflow_id', sa.String(), nullable=False),
sa.Column('workflow_version_id', sa.String(), nullable=False),
sa.Column('parent_run_id', sa.String(), nullable=True),
sa.Column('status', sa.Enum('PENDING', 'RUNNING', 'COMPLETED', 'FAILED', name='runstatus'), nullable=False),
sa.Column('run_type', sa.String(), nullable=False),
sa.Column('initial_inputs', sa.JSON(), nullable=True),
sa.Column('input_dataset_id', sa.String(), nullable=True),
sa.Column('start_time', sa.DateTime(), nullable=True),
sa.Column('end_time', sa.DateTime(), nullable=True),
sa.Column('outputs', sa.JSON(), nullable=True),
sa.Column('output_file_id', sa.String(), nullable=True),
sa.ForeignKeyConstraint(['input_dataset_id'], ['datasets.id'], ),
sa.ForeignKeyConstraint(['output_file_id'], ['output_files.id'], ),
sa.ForeignKeyConstraint(['parent_run_id'], ['runs.id'], ),
sa.ForeignKeyConstraint(['workflow_id'], ['workflows.id'], ),
sa.ForeignKeyConstraint(['workflow_version_id'], ['workflow_versions.id'], ),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('id')
)
op.create_index(op.f('ix_runs_input_dataset_id'), 'runs', ['input_dataset_id'], unique=False)
op.create_index(op.f('ix_runs_parent_run_id'), 'runs', ['parent_run_id'], unique=False)
op.create_index(op.f('ix_runs_workflow_id'), 'runs', ['workflow_id'], unique=False)
op.create_index(op.f('ix_runs_workflow_version_id'), 'runs', ['workflow_version_id'], unique=False)
op.create_table('tasks',
sa.Column('_intid', sa.Integer(), nullable=False),
sa.Column('id', sa.String(), sa.Computed("'T' || _intid", ), nullable=False),
sa.Column('run_id', sa.String(), nullable=False),
sa.Column('node_id', sa.String(), nullable=False),
sa.Column('parent_task_id', sa.String(), nullable=True),
sa.Column('status', sa.Enum('PENDING', 'RUNNING', 'COMPLETED', 'FAILED', name='taskstatus'), nullable=False),
sa.Column('inputs', sa.JSON(), nullable=True),
sa.Column('outputs', sa.JSON(), nullable=True),
sa.Column('start_time', sa.DateTime(), nullable=True),
sa.Column('end_time', sa.DateTime(), nullable=True),
sa.Column('subworkflow', sa.JSON(), nullable=True),
sa.Column('subworkflow_output', sa.JSON(), nullable=True),
sa.ForeignKeyConstraint(['parent_task_id'], ['tasks.id'], ),
sa.ForeignKeyConstraint(['run_id'], ['runs.id'], ),
sa.PrimaryKeyConstraint('_intid'),
sa.UniqueConstraint('id')
)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table('tasks')
op.drop_index(op.f('ix_runs_workflow_version_id'), table_name='runs')
op.drop_index(op.f('ix_runs_workflow_id'), table_name='runs')
op.drop_index(op.f('ix_runs_parent_run_id'), table_name='runs')
op.drop_index(op.f('ix_runs_input_dataset_id'), table_name='runs')
op.drop_table('runs')
op.drop_index(op.f('ix_workflow_versions_workflow_id'), table_name='workflow_versions')
op.drop_index(op.f('ix_workflow_versions_version'), table_name='workflow_versions')
op.drop_table('workflow_versions')
op.drop_table('workflows')
op.drop_table('output_files')
op.drop_table('eval_runs')
op.drop_table('datasets')
# ### end Alembic commands ###
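Per the commit messages, `entrypoint.sh` applies this migration on container start and `generate_migrations.sh` regenerates revisions; neither script is shown in this excerpt. A hedged equivalent driven through Alembic's command API, assuming `alembic.ini` lives in `backend/` and the `POSTGRES_*` variables are exported:

```
# Sketch, not part of the PR: apply (or roll back) the migration programmatically.
from alembic import command
from alembic.config import Config

cfg = Config("backend/alembic.ini")  # path assumed from the repository layout
command.upgrade(cfg, "head")         # runs 000_init_db and any later revisions
# command.downgrade(cfg, "base")     # would undo it via downgrade() above
```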
4 changes: 2 additions & 2 deletions backend/app/models/output_file_model.py
@@ -7,9 +7,9 @@
class OutputFileModel(BaseModel):
__tablename__ = "output_files"

_intid: Mapped[int] = mapped_column(Integer, primary_key=True)
_intid: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement="auto")
id: Mapped[str] = mapped_column(
String, Computed("'OF' || _intid"), nullable=False, index=True
String, Computed("'OF' || _intid"), nullable=False, unique=True
)
file_name: Mapped[str] = mapped_column(String, nullable=False)
file_path: Mapped[str] = mapped_column(String, nullable=False)
4 changes: 2 additions & 2 deletions backend/app/models/run_model.py
@@ -28,13 +28,13 @@ class RunModel(BaseModel):

_intid: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement="auto")
id: Mapped[str] = mapped_column(
String, Computed("'R' || _intid"), nullable=False, index=True
String, Computed("'R' || _intid"), nullable=False, unique=True
)
workflow_id: Mapped[str] = mapped_column(
String, ForeignKey("workflows.id"), nullable=False, index=True
)
workflow_version_id: Mapped[int] = mapped_column(
Integer, ForeignKey("workflow_versions._intid"), nullable=False, index=True
String, ForeignKey("workflow_versions.id"), nullable=False, index=True
)
parent_run_id: Mapped[Optional[str]] = mapped_column(
String, ForeignKey("runs.id"), nullable=True, index=True
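The foreign key now targets the public string id on `workflow_versions` rather than the private `_intid`, matching the string-typed `workflow_version_id` used by the API and schemas. A sketch of a join over the new key, with import paths and ids assumed for illustration only:

```
# Sketch, not part of the PR: join runs to workflow versions over the string id.
from sqlalchemy import select

from app.models.run_model import RunModel
from app.models.workflow_version_model import WorkflowVersionModel

stmt = (
    select(RunModel, WorkflowVersionModel)
    .join(WorkflowVersionModel, RunModel.workflow_version_id == WorkflowVersionModel.id)
    .where(RunModel.id == "R1")  # placeholder id in the computed 'R' || _intid format
)
# session.execute(stmt) yields (run, version) pairs once a session is available.
```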
4 changes: 2 additions & 2 deletions backend/app/models/task_model.py
@@ -24,9 +24,9 @@ class TaskStatus(PyEnum):
class TaskModel(BaseModel):
__tablename__ = "tasks"

_intid: Mapped[int] = mapped_column(Integer, primary_key=True)
_intid: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement="auto")
id: Mapped[str] = mapped_column(
String, Computed("'T' || _intid"), nullable=False, index=True
String, Computed("'T' || _intid"), nullable=False, unique=True
)
run_id: Mapped[str] = mapped_column(String, ForeignKey("runs.id"), nullable=False)
node_id: Mapped[str] = mapped_column(String, nullable=False)
4 changes: 2 additions & 2 deletions backend/app/models/workflow_model.py
@@ -16,9 +16,9 @@ class WorkflowModel(BaseModel):

__tablename__ = "workflows"

_intid: Mapped[int] = mapped_column(Integer, primary_key=True)
_intid: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement="auto")
id: Mapped[str] = mapped_column(
String, Computed("'S' || _intid"), nullable=False, index=True
String, Computed("'S' || _intid"), nullable=False, unique=True
)
name: Mapped[str] = mapped_column(String, unique=True, nullable=False)
description: Mapped[Optional[str]] = mapped_column(String)
11 changes: 8 additions & 3 deletions backend/app/models/workflow_version_model.py
@@ -1,4 +1,4 @@
from sqlalchemy import Integer, String, DateTime, JSON, ForeignKey
from sqlalchemy import Integer, String, DateTime, JSON, ForeignKey, Computed
from sqlalchemy.orm import Mapped, mapped_column, relationship
from datetime import datetime, timezone
from typing import List, Optional, Any
@@ -9,8 +9,13 @@
class WorkflowVersionModel(BaseModel):
__tablename__ = "workflow_versions"

_intid: Mapped[int] = mapped_column(Integer, primary_key=True)
version: Mapped[int] = mapped_column(Integer, nullable=False)
_intid: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement="auto")
id: Mapped[str] = mapped_column(
String, Computed("'SV' || _intid"), nullable=False, unique=True
)
version: Mapped[int] = mapped_column(
Integer, nullable=False, index=True, unique=True
)
workflow_id: Mapped[int] = mapped_column(
ForeignKey("workflows.id"), nullable=False, index=True
)
2 changes: 1 addition & 1 deletion backend/app/schemas/run_schemas.py
@@ -15,7 +15,7 @@ class StartRunRequestSchema(BaseModel):
class RunResponseSchema(BaseModel):
id: str
workflow_id: str
workflow_version_id: int
workflow_version_id: str
workflow_version: WorkflowVersionResponseSchema
status: RunStatus
run_type: str