kolenaIO · y27choi · Jan 10, 2025 · Jan 8, 2025 · Jan 8, 2025 · Jan 8, 2025
@@ -0,0 +1,72 @@
+# Example Integration: Retrieval Augmented Generation (RAG)
+
+This example integration uses the [Financebench](https://github.com/patronus-ai/financebench) dataset to
+demonstrate testing a RAG system on Kolena.
+
+## Setup
+
+This project uses [uv](https://docs.astral.sh/uv/) for packaging and Python dependency management. To get started,
+install project dependencies from [`pyproject.toml`](./pyproject.toml) by running:
+
+```shell
+uv sync
+```
+
+## Usage
+
+The data for this example integration lives in the publicly accessible S3 bucket `s3://kolena-public-examples`.
+
+First, ensure that the `KOLENA_TOKEN` environment variable is populated in your environment. See our
+[initialization documentation](https://docs.kolena.com/installing-kolena/#initialization) for details.
+
+This project defines two scripts, which are used in the following steps:
+
+1. [`upload_dataset.py`](retrieval_augmented_generation/upload_dataset.py) creates the Financebench dataset on Kolena
+
+To upload the dataset without ground truth, pass the
+`s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl` JSONL file instead:
+
+```shell
+uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl
+```
+
+2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference
+on the Financebench dataset.
+
+The `upload_results.py` script defines command line arguments to select which model to evaluate — run
+using the `--help` flag for more information:
+
+```shell
+$ uv run python3 retrieval_augmented_generation/upload_results.py --help
+usage: upload_results.py [-h] [--dataset-name DATASET_NAME] [--evaluate] [{baseline,qme,query_decomp}]
+
+positional arguments:
+  {baseline,qme,query_decomp}
+                        Name of the model to test.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --dataset-name DATASET_NAME
+                        Optionally specify a custom dataset name to test.
+  --evaluate            Computes metrics on the model results. Requires dataset with ground truth.
+```
+
+3. Label your dataset on [Kolena](https://app.kolena.com/redirect/)
+
+4. Run evaluation by using the `--evaluate` option of the `upload_results.py` script. It will compute metrics on the
+model results and upload the model results including the metrics to Kolena.
+
+## Quality Standards Guide
+
+Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to
+test the RAG models. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide
+for details.
+
+Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for
+this workflow:
+
+### Metrics
+
+1. rate(`result.is_page_retrieved`=true): page-level retrieval rate
+2. rate(`result.is_doc_retrieved`=true): doc-level retrieval rate
+3. `is_correct` using [LLM prompt](https://docs.kolena.com/dataset/advanced-usage/llm-prompt-extraction/)
@@ -0,0 +1,29 @@
+[project]
+name = "retrieval_augmented_generation"
+version = "0.1.0"
+description = "Kolena Datasets Example integration for RAG"
+authors = [
+    { name = "Kolena Engineering", email = "[email protected]" }
+]
+license = "Apache-2.0"
+requires-python = ">=3.8,<3.12"
+
+dependencies = [
+    "fsspec>=2024.12.0",
+    "kolena>=1.50.0,<2",
+    "s3fs>=0.6.0",
+]
+
+[tool.uv]
+dev-dependencies = [
+    "pre-commit>=2.17,<3",
+    "pytest>=7,<8",
+    "pytest-depends>=1.0.1,<8",
+]
+
+[tool.uv.sources]
+kolena = { path = "../../../" }
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
@@ -0,0 +1,13 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,22 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Root of the publicly accessible S3 bucket that hosts the data for this example.
+S3_BUCKET = "s3://kolena-public-examples"
+# Default dataset name for the CLI scripts; also the path segment under S3_BUCKET where the data lives.
+DATASET = "financebench"
+# NOTE(review): mixes a hyphen and an underscore and is not referenced by the other modules shown here — confirm.
+TASK = "retrieval-augmented_generation"
+# Passed as `id_fields` when uploading the dataset and used as the merge key when computing metrics.
+ID_FIELDS = ["financebench_id"]
+# Maps the CLI model choice to the model name, which is also the stem of that model's raw results JSONL file.
+MODEL_NAME = {
+    "baseline": "gpt-4o-baseline",
+    "qme": "gpt-4o-qme",
+    "query_decomp": "gpt-4o-qme-query-decomp",
+}
@@ -0,0 +1,46 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pandas as pd
+from retrieval_augmented_generation.constants import ID_FIELDS
+
+
+def is_doc_retrieved(retrieved_contents: list, doc_name: str) -> bool:
+    return any([doc_name in content.locator for content in retrieved_contents])
+
+
+def is_page_retrieved(retrieved_contents: list, doc_name: str, relevant_pages: list) -> bool:
+    retrieved_pages = [content.page_number for content in retrieved_contents if doc_name in content.locator]
+
+    # NOTE: all relevant pages must be retrieved to be considered correct.
+    return set(relevant_pages).issubset(retrieved_pages)
+
+
+def compute_metrics(df_dataset: pd.DataFrame, df_results: pd.DataFrame) -> pd.DataFrame:
+    ground_truth_columns = ["doc_name", "relevant_pages", "financebench_id"]
+    assert set(ground_truth_columns).issubset(
+        df_dataset.columns,
+    ), f"ground truth columns {ground_truth_columns} cannot be found in dataset dataframe."
+
+    df = df_results.merge(df_dataset[ground_truth_columns], on=ID_FIELDS, how="left")
+
+    metrics = []
+    for record in df.itertuples():
+        metrics.append(
+            dict(
+                is_doc_retrieved=is_doc_retrieved(record.retrieved_contents, record.doc_name),
+                is_page_retrieved=is_page_retrieved(record.retrieved_contents, record.doc_name, record.relevant_pages),
+            ),
+        )
+
+    return pd.DataFrame(metrics)
@@ -0,0 +1,67 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from argparse import ArgumentParser
+from argparse import Namespace
+from typing import Any
+from typing import Optional
+
+import pandas as pd
+from retrieval_augmented_generation.constants import DATASET
+from retrieval_augmented_generation.constants import ID_FIELDS
+from retrieval_augmented_generation.constants import S3_BUCKET
+from retrieval_augmented_generation.utils import to_locator
+
+from kolena.asset import DocumentAsset
+from kolena.dataset import upload_dataset
+
+
+def to_document(evidence: list[dict[str, Any]]) -> Optional[DocumentAsset]:
+    if len(evidence) > 0:
+        return DocumentAsset(to_locator(str(evidence[0]["doc_name"])))
+
+    return None
+
+
+def get_pages(evidence: list[dict[str, Any]]) -> list[int]:
+    pages = [e["evidence_page_num"] + 1 for e in evidence]  # financebench page number starts from 0 index
+    return pages
+
+
+def run(args: Namespace) -> None:
+    df_dataset = pd.read_json(args.dataset_jsonl, lines=True)
+    if "evidence" in df_dataset.columns:
+        df_dataset["document"] = df_dataset["evidence"].apply(to_document)
+        df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages)
+    upload_dataset(args.dataset_name, df_dataset, id_fields=ID_FIELDS)
+
+
+def main() -> None:
+    ap = ArgumentParser()
+    ap.add_argument(
+        "--dataset-jsonl",
+        type=str,
+        default=f"{S3_BUCKET}/{DATASET}/raw/financebench_open_source.jsonl",
+        help="JSONL file specifying dataset. See default JSONL for details",
+    )
+    ap.add_argument(
+        "--dataset-name",
+        type=str,
+        default=DATASET,
+        help="Optionally specify a name of the dataset",
+    )
+    run(ap.parse_args())
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,83 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from argparse import ArgumentParser
+from argparse import Namespace
+from typing import Any
+
+import pandas as pd
+from retrieval_augmented_generation.constants import DATASET
+from retrieval_augmented_generation.constants import MODEL_NAME
+from retrieval_augmented_generation.constants import S3_BUCKET
+from retrieval_augmented_generation.metrics import compute_metrics
+from retrieval_augmented_generation.utils import to_locator
+
+from kolena.asset import DocumentAsset
+from kolena.dataset import download_dataset
+from kolena.dataset import upload_results
+
+
+def to_documents(retrieved_contents: list[dict[str, Any]]) -> list:
+    if not retrieved_contents:
+        return []
+
+    documents = []
+    for content in retrieved_contents:
+        documents.append(
+            DocumentAsset(
+                locator=to_locator(content["doc_name"]),
+                content=content["content"],  # type: ignore[call-arg]
+                page_number=content["page_number"],  # type: ignore[call-arg]
+            ),
+        )
+
+    return documents
+
+
+def run(args: Namespace) -> None:
+    model_name = MODEL_NAME[args.model]
+    df_results = pd.read_json(f"{S3_BUCKET}/{DATASET}/results/raw/{model_name}.jsonl", lines=True)
+    df_results["retrieved_contents"] = df_results["retrieved_contents"].apply(to_documents)
+    if args.evaluate:
+        df_dataset = download_dataset(args.dataset_name)
+        df_metrics = compute_metrics(df_dataset, df_results)
+        df_results = pd.concat([df_results, df_metrics], axis=1)
+    upload_results(args.dataset_name, model_name, df_results)
+
+
+def main() -> None:
+    ap = ArgumentParser()
+    ap.add_argument(
+        "model",
+        type=str,
+        default="baseline",
+        nargs="?",
+        choices=list(MODEL_NAME.keys()),
+        help="Name of the model to test.",
+    )
+    ap.add_argument(
+        "--dataset-name",
+        type=str,
+        default=DATASET,
+        help="Optionally specify a custom dataset name to test.",
+    )
+    ap.add_argument(
+        "--evaluate",
+        action="store_true",
+        help="Computes metrics on the model results. Requires dataset with ground truth.",
+    )
+    run(ap.parse_args())
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,19 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from retrieval_augmented_generation.constants import DATASET
+from retrieval_augmented_generation.constants import S3_BUCKET
+
+
+def to_locator(filename: str) -> str:
+    return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf"