[FEATURE] argilla: add support to distribution #5187

Merged: 60 commits, Jul 19, 2024

Changes from 1 commit

Commits
f62d58a
feat: add dataset support to be created using distribution settings (…
jfcalvo Jul 1, 2024
017001f
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 1, 2024
f084ab7
✨ Remove unused method
damianpumar Jul 4, 2024
c8ef4c6
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 4, 2024
6df5256
feat: improve Records `responses_submitted` relationship to be view o…
jfcalvo Jul 4, 2024
dbae135
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 4, 2024
cf3408c
feat: change metrics to support new distribution task logic (#5140)
jfcalvo Jul 4, 2024
8e8b116
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
frascuchon Jul 5, 2024
808c837
[ENHANCEMENT]: `argilla-server`: allow update distribution for non an…
frascuchon Jul 8, 2024
3d74a33
chore: Add status field to record model
frascuchon Jul 9, 2024
7b7d2f5
feat: Add read-only property 'status' to the record resource
frascuchon Jul 9, 2024
736bfc9
tests: Update tests to reflect the status property
frascuchon Jul 9, 2024
f241e41
fix: wrong filter naming after merge from develop
frascuchon Jul 9, 2024
307b38c
Merge branch 'feat/add-dataset-automatic-task-distribution' into feat…
frascuchon Jul 9, 2024
67d4ee3
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 9, 2024
9b84dcf
chore: Remove message match (depends on python version
frascuchon Jul 9, 2024
08e5757
chore: Add task distribution model
frascuchon Jul 9, 2024
443b9d0
feat: Add support to task distribution
frascuchon Jul 9, 2024
303361a
tests: Update tests with task distribution
frascuchon Jul 9, 2024
43ba10f
chore: Use main TaskDistribution naming
frascuchon Jul 9, 2024
d6c186b
ci: Using feat branch docker image
frascuchon Jul 9, 2024
3e06890
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 9, 2024
aba06c7
Update argilla/src/argilla/_models/_dataset.py
frascuchon Jul 10, 2024
f2238e6
chore: Apply format suggestions
frascuchon Jul 10, 2024
b73004a
Merge branch 'feat/argilla/add-record-status-property' into feat/argi…
frascuchon Jul 10, 2024
2ea0a3e
chore: Export distribution in dataset
frascuchon Jul 10, 2024
b15de8f
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
frascuchon Jul 11, 2024
f497140
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 11, 2024
bec0b0d
feat: add session helper with serializable isolation level (#5165)
jfcalvo Jul 12, 2024
8bf8abb
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 12, 2024
85e847f
[REFACTOR] `argilla-server`: remove deprecated records endpoint (#5206)
frascuchon Jul 12, 2024
1041487
chore: Add task distribution setter for dataset
frascuchon Jul 12, 2024
22263d8
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 12, 2024
c219764
[ENHANCEMENT] `argilla`: add record `status` property (#5184)
frascuchon Jul 12, 2024
ced0220
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 12, 2024
0c85b9d
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 15, 2024
a9375c1
[REFACTOR] cleaning list records endpoints (#5221)
frascuchon Jul 15, 2024
46f2640
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
frascuchon Jul 15, 2024
f77341e
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
frascuchon Jul 15, 2024
b456600
improvement: capture and retry database concurrent update errors (#5227)
jfcalvo Jul 16, 2024
8dd1c7e
chore: update CHANGELOG.md
jfcalvo Jul 16, 2024
4417af6
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 16, 2024
f284720
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 16, 2024
1a50c3a
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 16, 2024
ba3dc49
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
frascuchon Jul 17, 2024
08b29e0
Merge branch 'feat/add-dataset-automatic-task-distribution' into feat…
frascuchon Jul 17, 2024
20ae663
🔀 Update UI for distribution task (#5219)
leiyre Jul 18, 2024
d77e9a8
fixing tests
frascuchon Jul 18, 2024
c9b865b
chore: Add distribution check
frascuchon Jul 18, 2024
9dba7ef
chore: set tools line-height to 88 characters
jfcalvo Jul 18, 2024
103556e
Revert "chore: set tools line-height to 88 characters"
jfcalvo Jul 18, 2024
e7d4b75
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 18, 2024
128e8a0
Merge branch 'develop' into feat/add-dataset-automatic-task-distribution
jfcalvo Jul 18, 2024
504ff7b
[ENHANCEMENT] improve es mappings for responses (#5228)
frascuchon Jul 18, 2024
940a812
Merge branch 'feat/add-dataset-automatic-task-distribution' into feat…
frascuchon Jul 18, 2024
42b80aa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 18, 2024
b0b1846
[Docs] task distribution (#5246)
nataliaElv Jul 19, 2024
46bf786
Merge branch 'develop' into feat/argilla/add-support-to-distribution
frascuchon Jul 19, 2024
b3bca5f
Apply suggestions from code review
frascuchon Jul 19, 2024
0665271
Merge branch 'develop' into feat/argilla/add-support-to-distribution
frascuchon Jul 19, 2024
feat: add dataset support to be created using distribution settings (#5013)

# Description

This PR is the first one related to the distribution task feature, adding the following changes:

* Added a `distribution` JSON column to the `datasets` table:
  * The column is non-nullable, so a value is always required when a dataset is created.
  * Existing datasets are backfilled with the default value `{"strategy": "overlap", "min_submitted": 1}`.
* Added a `distribution` attribute to the `DatasetCreate` schema:
  * `None` is not a valid value.
  * If no value is specified for this attribute, a `DatasetOverlapDistributionCreate` with `min_submitted` set to `1` is used.
* `DatasetOverlapDistributionCreate` only allows values greater than or equal to `1` for the `min_submitted` attribute.
* The context `create_dataset` function now receives a dictionary instead of a `DatasetCreate` schema.
* Moved dataset creation validations to a new `DatasetCreateValidator` class.

Updating the `distribution` attribute of existing datasets will be handled in a separate issue.
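
For illustration, a dataset creation request using these settings could look like the sketch below. The endpoint path, base URL, authentication header, and workspace id are assumptions for illustration, not values taken from this PR:

```python
import requests

# Illustrative sketch only: the endpoint path, base URL, API key header, and
# workspace id below are placeholders, not values from this PR.
response = requests.post(
    "http://localhost:6900/api/v1/datasets",
    headers={"X-Argilla-Api-Key": "argilla.apikey"},
    json={
        "name": "my-dataset",
        "workspace_id": "00000000-0000-0000-0000-000000000000",
        # Omitting "distribution" falls back to {"strategy": "overlap", "min_submitted": 1}.
        "distribution": {"strategy": "overlap", "min_submitted": 2},
    },
)
response.raise_for_status()
print(response.json()["distribution"])
```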

Closes #5005 

**Type of change**

(Please delete options that are not relevant. Remember to title the PR
according to the type of change)

- [ ] Bug fix (non-breaking change which fixes an issue)
- [x] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [ ] Refactor (change restructuring the codebase without changing
functionality)
- [ ] Improvement (change adding some improvement to an existing
functionality)
- [ ] Documentation update

**How Has This Been Tested**

(Please describe the tests that you ran to verify your changes. And
ideally, reference `tests`)

- [x] Adding new tests and passing old ones.
- [x] Check that migration works as expected with old datasets and
SQLite.
- [x] Check that migration works as expected with old datasets and
PostgreSQL.

**Checklist**

- [ ] I added relevant documentation
- [ ] My code follows the style guidelines of this project
- [ ] I did a self-review of my code
- [ ] I made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I filled out [the contributor form](https://tally.so/r/n9XrxK)
(see text above)
- [ ] I have added relevant notes to the CHANGELOG.md file (See
https://keepachangelog.com/)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Paco Aranda <[email protected]>
3 people authored Jul 1, 2024
commit f62d58a2f91e16eb4d02e56c4039a432070349c0
@@ -42,10 +42,8 @@ export class RecordRepository {
  constructor(private readonly axios: NuxtAxiosInstance) {}

  getRecords(criteria: RecordCriteria): Promise<BackendRecords> {
    if (criteria.isFilteringByAdvanceSearch)
      return this.getRecordsByAdvanceSearch(criteria);

    return this.getRecordsByDatasetId(criteria);
    return this.getRecordsByAdvanceSearch(criteria);
    // return this.getRecordsByDatasetId(criteria);
  }

  async getRecord(recordId: string): Promise<BackendRecord> {
@@ -264,6 +262,30 @@ export class RecordRepository {
      };
    }

    body.filters = {
      and: [
        {
          type: "terms",
          scope: {
            entity: "response",
            property: "status",
          },
          values: [status],
        },
      ],
    };

    if (status === "pending") {
      body.filters.and.push({
        type: "terms",
        scope: {
          entity: "record",
          property: "status",
        },
        values: ["pending"],
      });
    }

    if (
      isFilteringByMetadata ||
      isFilteringByResponse ||
7 changes: 6 additions & 1 deletion argilla-server/CHANGELOG.md
@@ -16,12 +16,17 @@ These are the section headers that we use:

## [Unreleased]()

## [2.0.0rc1](https://github.com/argilla-io/argilla/compare/v1.29.0...v2.0.0rc1)
### Added

- Added support to specify `distribution` attribute when creating a dataset. ([#5013](https://github.com/argilla-io/argilla/pull/5013))
- Added support to change `distribution` attribute when updating a dataset. ([#5028](https://github.com/argilla-io/argilla/pull/5028))

### Changed

- Change `responses` table to delete rows on cascade when a user is deleted. ([#5126](https://github.com/argilla-io/argilla/pull/5126))

## [2.0.0rc1](https://github.com/argilla-io/argilla/compare/v1.29.0...v2.0.0rc1)

### Removed

- Removed all API v0 endpoints. ([#4852](https://github.com/argilla-io/argilla/pull/4852))
@@ -0,0 +1,60 @@
# Copyright 2021-present, the Recognai S.L. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""add status column to records table

Revision ID: 237f7c674d74
Revises: 45a12f74448b
Create Date: 2024-06-18 17:59:36.992165

"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "237f7c674d74"
down_revision = "45a12f74448b"
branch_labels = None
depends_on = None


record_status_enum = sa.Enum("pending", "completed", name="record_status_enum")


def upgrade() -> None:
    record_status_enum.create(op.get_bind())

    op.add_column("records", sa.Column("status", record_status_enum, server_default="pending", nullable=False))
    op.create_index(op.f("ix_records_status"), "records", ["status"], unique=False)

    # NOTE: Updating existent records to have "completed" status when they have
    # at least one response with "submitted" status.
    op.execute("""
        UPDATE records
        SET status = 'completed'
        WHERE id IN (
            SELECT DISTINCT record_id
            FROM responses
            WHERE status = 'submitted'
        );
    """)


def downgrade() -> None:
    op.drop_index(op.f("ix_records_status"), table_name="records")
    op.drop_column("records", "status")

    record_status_enum.drop(op.get_bind())
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""add record metadata column
"""add metadata column to records table

Revision ID: 3ff6484f8b37
Revises: ae5522b4c674
@@ -31,12 +31,8 @@


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column("records", sa.Column("metadata", sa.JSON(), nullable=True))
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column("records", "metadata")
    # ### end Alembic commands ###
@@ -0,0 +1,45 @@
# Copyright 2021-present, the Recognai S.L. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""add distribution column to datasets table

Revision ID: 45a12f74448b
Revises: d00f819ccc67
Create Date: 2024-06-13 11:23:43.395093

"""

import json

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "45a12f74448b"
down_revision = "d00f819ccc67"
branch_labels = None
depends_on = None

DISTRIBUTION_VALUE = json.dumps({"strategy": "overlap", "min_submitted": 1})


def upgrade() -> None:
    op.add_column("datasets", sa.Column("distribution", sa.JSON(), nullable=True))
    op.execute(f"UPDATE datasets SET distribution = '{DISTRIBUTION_VALUE}'")
    with op.batch_alter_table("datasets") as batch_op:
        batch_op.alter_column("distribution", nullable=False)


def downgrade() -> None:
    op.drop_column("datasets", "distribution")
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""add allow_extra_metadata column to dataset table
"""add allow_extra_metadata column to datasets table

Revision ID: b8458008b60e
Revises: 7cbcccf8b57a
@@ -31,14 +31,10 @@


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "datasets", sa.Column("allow_extra_metadata", sa.Boolean(), server_default=sa.text("true"), nullable=False)
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column("datasets", "allow_extra_metadata")
    # ### end Alembic commands ###
@@ -189,7 +189,7 @@ async def create_dataset(
):
    await authorize(current_user, DatasetPolicy.create(dataset_create.workspace_id))

    return await datasets.create_dataset(db, dataset_create)
    return await datasets.create_dataset(db, dataset_create.dict())


@router.post("/datasets/{dataset_id}/fields", status_code=status.HTTP_201_CREATED, response_model=Field)
@@ -302,4 +302,4 @@ async def update_dataset(

    await authorize(current_user, DatasetPolicy.update(dataset))

    return await datasets.update_dataset(db, dataset, dataset_update)
    return await datasets.update_dataset(db, dataset, dataset_update.dict(exclude_unset=True))
@@ -64,7 +64,9 @@ async def update_response(
    response = await Response.get_or_raise(
        db,
        response_id,
        options=[selectinload(Response.record).selectinload(Record.dataset).selectinload(Dataset.questions)],
        options=[
            selectinload(Response.record).selectinload(Record.dataset).selectinload(Dataset.questions),
        ],
    )

    await authorize(current_user, ResponsePolicy.update(response))
@@ -83,7 +85,9 @@ async def delete_response(
    response = await Response.get_or_raise(
        db,
        response_id,
        options=[selectinload(Response.record).selectinload(Record.dataset).selectinload(Dataset.questions)],
        options=[
            selectinload(Response.record).selectinload(Record.dataset).selectinload(Dataset.questions),
        ],
    )

    await authorize(current_user, ResponsePolicy.delete(response))
38 changes: 35 additions & 3 deletions argilla-server/src/argilla_server/api/schemas/v1/datasets.py
@@ -13,11 +13,11 @@
# limitations under the License.

from datetime import datetime
from typing import List, Optional
from typing import List, Literal, Optional, Union
from uuid import UUID

from argilla_server.api.schemas.v1.commons import UpdateSchema
from argilla_server.enums import DatasetStatus
from argilla_server.enums import DatasetDistributionStrategy, DatasetStatus
from argilla_server.pydantic_v1 import BaseModel, Field, constr

try:
@@ -44,6 +44,32 @@
]


class DatasetOverlapDistribution(BaseModel):
    strategy: Literal[DatasetDistributionStrategy.overlap]
    min_submitted: int


DatasetDistribution = DatasetOverlapDistribution


class DatasetOverlapDistributionCreate(BaseModel):
    strategy: Literal[DatasetDistributionStrategy.overlap]
    min_submitted: int = Field(
        ge=1,
        description="Minimum number of submitted responses to consider a record as completed",
    )


DatasetDistributionCreate = DatasetOverlapDistributionCreate


class DatasetOverlapDistributionUpdate(DatasetDistributionCreate):
    pass


DatasetDistributionUpdate = DatasetOverlapDistributionUpdate


class RecordMetrics(BaseModel):
    count: int

@@ -74,6 +100,7 @@ class Dataset(BaseModel):
    guidelines: Optional[str]
    allow_extra_metadata: bool
    status: DatasetStatus
    distribution: DatasetDistribution
    workspace_id: UUID
    last_activity_at: datetime
    inserted_at: datetime
@@ -91,12 +118,17 @@ class DatasetCreate(BaseModel):
    name: DatasetName
    guidelines: Optional[DatasetGuidelines]
    allow_extra_metadata: bool = True
    distribution: DatasetDistributionCreate = DatasetOverlapDistributionCreate(
        strategy=DatasetDistributionStrategy.overlap,
        min_submitted=1,
    )
    workspace_id: UUID


class DatasetUpdate(UpdateSchema):
    name: Optional[DatasetName]
    guidelines: Optional[DatasetGuidelines]
    allow_extra_metadata: Optional[bool]
    distribution: Optional[DatasetDistributionUpdate]

    __non_nullable_fields__ = {"name", "allow_extra_metadata"}
    __non_nullable_fields__ = {"name", "allow_extra_metadata", "distribution"}
5 changes: 3 additions & 2 deletions argilla-server/src/argilla_server/api/schemas/v1/records.py
@@ -23,7 +23,7 @@
from argilla_server.api.schemas.v1.metadata_properties import MetadataPropertyName
from argilla_server.api.schemas.v1.responses import Response, ResponseFilterScope, UserResponseCreate
from argilla_server.api.schemas.v1.suggestions import Suggestion, SuggestionCreate, SuggestionFilterScope
from argilla_server.enums import RecordInclude, RecordSortField, SimilarityOrder, SortOrder
from argilla_server.enums import RecordInclude, RecordSortField, SimilarityOrder, SortOrder, RecordStatus
from argilla_server.pydantic_v1 import BaseModel, Field, StrictStr, root_validator, validator
from argilla_server.pydantic_v1.utils import GetterDict
from argilla_server.search_engine import TextQuery
@@ -66,6 +66,7 @@ def get(self, key: str, default: Any) -> Any:

class Record(BaseModel):
    id: UUID
    status: RecordStatus
    fields: Dict[str, Any]
    metadata: Optional[Dict[str, Any]]
    external_id: Optional[str]
@@ -196,7 +197,7 @@ def _has_relationships(self):

class RecordFilterScope(BaseModel):
    entity: Literal["record"]
    property: Union[Literal[RecordSortField.inserted_at], Literal[RecordSortField.updated_at]]
    property: Union[Literal[RecordSortField.inserted_at], Literal[RecordSortField.updated_at], Literal["status"]]


class Records(BaseModel):
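
Because `RecordFilterScope` now accepts `status` as a record property, API clients can filter for pending records the same way the frontend hunk earlier in this commit does. An illustrative filter payload, with the shape inferred from the diffs in this commit rather than from separate API documentation:

```python
# Shape inferred from RecordFilterScope above and the frontend filter
# construction earlier in this commit; illustrative only.
pending_records_filter = {
    "and": [
        {
            "type": "terms",
            "scope": {"entity": "record", "property": "status"},
            "values": ["pending"],
        }
    ]
}
```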
4 changes: 4 additions & 0 deletions argilla-server/src/argilla_server/bulk/records_bulk.py
@@ -29,6 +29,7 @@
)
from argilla_server.api.schemas.v1.responses import UserResponseCreate
from argilla_server.api.schemas.v1.suggestions import SuggestionCreate
from argilla_server.contexts import distribution
from argilla_server.contexts.accounts import fetch_users_by_ids_as_dict
from argilla_server.contexts.records import (
    fetch_records_by_external_ids_as_dict,
@@ -67,6 +68,7 @@ async def create_records_bulk(self, dataset: Dataset, bulk_create: RecordsBulkCr

        await self._upsert_records_relationships(records, bulk_create.items)
        await _preload_records_relationships_before_index(self._db, records)
        await distribution.update_records_status(self._db, records)
        await self._search_engine.index_records(dataset, records)

        await self._db.commit()
@@ -207,6 +209,7 @@ async def upsert_records_bulk(self, dataset: Dataset, bulk_upsert: RecordsBulkUp

        await self._upsert_records_relationships(records, bulk_upsert.items)
        await _preload_records_relationships_before_index(self._db, records)
        await distribution.update_records_status(self._db, records)
        await self._search_engine.index_records(dataset, records)

        await self._db.commit()
@@ -237,6 +240,7 @@ async def _preload_records_relationships_before_index(db: "AsyncSession", record
        .filter(Record.id.in_([record.id for record in records]))
        .options(
            selectinload(Record.responses).selectinload(Response.user),
            selectinload(Record.responses_submitted),
            selectinload(Record.suggestions).selectinload(Suggestion.question),
            selectinload(Record.vectors),
        )
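
The `distribution.update_records_status` calls above are where the new task distribution logic recomputes each record's `status` once responses are written. That module is not part of this commit; based on the schema defaults and the migration backfill above, a purely illustrative sketch (not the actual argilla-server implementation) might look like:

```python
# Purely illustrative sketch of an overlap-strategy status update. The real
# argilla_server.contexts.distribution module is not shown in this commit, and
# ResponseStatus is an assumption about the existing enums module.
from argilla_server.enums import RecordStatus, ResponseStatus


async def update_records_status(db, records) -> None:
    for record in records:
        min_submitted = record.dataset.distribution["min_submitted"]
        submitted = [r for r in record.responses if r.status == ResponseStatus.submitted]

        record.status = (
            RecordStatus.completed if len(submitted) >= min_submitted else RecordStatus.pending
        )

    await db.flush(records)
```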