From b774a8d36aca35ce684fae1aa0de65a54890a683 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Wed, 8 Jan 2025 09:06:56 -0500 Subject: [PATCH 1/9] Upload dataset script --- .../retrieval_augmented_generation/README.md | 54 +++++++++++++++ .../pyproject.toml | 28 ++++++++ .../__init__.py | 13 ++++ .../constants.py | 17 +++++ .../upload_dataset.py | 68 +++++++++++++++++++ 5 files changed, 180 insertions(+) create mode 100644 examples/dataset/retrieval_augmented_generation/README.md create mode 100644 examples/dataset/retrieval_augmented_generation/pyproject.toml create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md new file mode 100644 index 000000000..27e83d5ac --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/README.md @@ -0,0 +1,54 @@ +# Example Integration: Retrieval Augmented Generation (RAG) + +This example integration uses the [Financebench](https://github.com/patronus-ai/financebench) dataset to +demonstrate testing RAG system on Kolena. + +## Setup + +This project uses [uv](https://docs.astral.sh/uv/) for packaging and Python dependency management. To get started, +install project dependencies from [`pyproject.toml`](./pyproject.toml) by running: + +```shell +uv sync +``` + +## Usage + +The data for this example integration lives in the publicly accessible S3 bucket `s3://kolena-public-examples`. + +First, ensure that the `KOLENA_TOKEN` environment variable is populated in your environment. See our +[initialization documentation](https://docs.kolena.com/installing-kolena/#initialization) for details. 
+ +This project defines three scripts that perform the following operations: + +1. [`upload_dataset.py`](retrieval_augmented_generation/upload_dataset.py) creates the Financebench dataset on Kolena + +2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference +on the Financebench dataset. + +3. [`evaluate.py`](retrieval_augmented_generation/evaluate.py) tests a RAG system on the Financebench dataset. This +script requires ground truth annotations. Make sure you have annotated your dataset on Kolena before running this script. + +The `upload_results.py` and `evaluate.py` script defines command line arguments to select which model to evaluate — run +using the `--help` flag for more information: + +```shell +$ uv run python3 retrieval_augmented_generation/upload_results.py --help +usage: upload_results.py [-h] [--dataset DATASET] {ann,logreg} + +positional arguments: + {ann,logreg} Name of the model to test. + +optional arguments: + -h, --help show this help message and exit + --dataset DATASET Optionally specify a custom dataset name to test. +``` + +## Quality Standards Guide + +Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to +test the rain forecast models. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide +for details. 
+ +Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for +this workflow: diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml new file mode 100644 index 000000000..d4528ba02 --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = "retrieval_augmented_generation" +version = "0.1.0" +description = "Kolena Datasets Example integration for RAG" +authors = [ + { name = "Kolena Engineering", email = "eng@kolena.com" } +] +license = "Apache-2.0" +requires-python = ">=3.8,<3.12" + +dependencies = [ + "kolena", + "numpy>=1.24.4", +] + +[tool.uv] +dev-dependencies = [ + "pre-commit>=2.17,<3", + "pytest>=7,<8", + "pytest-depends>=1.0.1,<8", +] + +[tool.uv.sources] +kolena = { path = "../../../" } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py new file mode 100644 index 000000000..5f584e024 --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py new file mode 100644 index 000000000..da34f68e5 --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py @@ -0,0 +1,17 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +S3_BUCKET = "s3://kolena-public-examples" +DATASET = "financebench" +TASK = "retrieval-augmented_generation" +ID_FIELDS = ["financebench_id"] diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py new file mode 100644 index 000000000..4a0a82cdc --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py @@ -0,0 +1,68 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from argparse import ArgumentParser +from argparse import Namespace +from typing import Optional + +import pandas as pd +from retrieval_augmented_generation.constants import DATASET +from retrieval_augmented_generation.constants import ID_FIELDS +from retrieval_augmented_generation.constants import S3_BUCKET + +from kolena.asset import DocumentAsset +from kolena.dataset import upload_dataset + + +def to_locator(filename: str) -> str: + return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf" + + +def to_document(evidence: list[dict[str, str]]) -> Optional[DocumentAsset]: + if len(evidence) > 0: + return DocumentAsset(to_locator(evidence[0]["doc_name"])) + + return None + + +def get_pages(evidence: list[dict[str, str]]) -> str: + pages = [e["evidence_page_num"] for e in evidence] + return ", ".join(map(str, pages)) + + +def run(args: Namespace) -> None: + df_dataset = pd.read_json(args.dataset_jsonl, lines=True) + df_dataset["document"] = df_dataset["evidence"].apply(to_document) + df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages) + upload_dataset(args.dataset_name, df_dataset, id_fields=ID_FIELDS) + + +def main() -> None: + ap = ArgumentParser() + ap.add_argument( + "--dataset-jsonl", + type=str, + default=f"{S3_BUCKET}/{DATASET}/raw/financebench_open_source.jsonl", + help="JSONL file specifying dataset. 
See default JSONL for details", + ) + ap.add_argument( + "--dataset-name", + type=str, + default=DATASET, + help="Optionally specify a name of the dataset", + ) + run(ap.parse_args()) + + +if __name__ == "__main__": + main() From 84c47a3a75f5bd2736eca0f521a4ff7d135688e5 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Wed, 8 Jan 2025 09:09:43 -0500 Subject: [PATCH 2/9] Uses latest kolena package --- examples/dataset/retrieval_augmented_generation/pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml index d4528ba02..672e81bca 100644 --- a/examples/dataset/retrieval_augmented_generation/pyproject.toml +++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml @@ -9,8 +9,7 @@ license = "Apache-2.0" requires-python = ">=3.8,<3.12" dependencies = [ - "kolena", - "numpy>=1.24.4", + "kolena>=1.50.0,<2", ] [tool.uv] From 6621bcee6b6f3657a80f99752eab67e913ec18a5 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Wed, 8 Jan 2025 12:07:43 -0500 Subject: [PATCH 3/9] Added upload results and allow uploading dataset without GT --- .../retrieval_augmented_generation/README.md | 7 ++ .../pyproject.toml | 2 + .../constants.py | 5 ++ .../upload_dataset.py | 10 ++- .../upload_results.py | 71 +++++++++++++++++++ .../retrieval_augmented_generation/utils.py | 19 +++++ 6 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md index 27e83d5ac..4ec6d5529 100644 --- a/examples/dataset/retrieval_augmented_generation/README.md +++ b/examples/dataset/retrieval_augmented_generation/README.md 
@@ -23,6 +23,13 @@ This project defines three scripts that perform the following operations: 1. [`upload_dataset.py`](retrieval_augmented_generation/upload_dataset.py) creates the Financebench dataset on Kolena +To run it without ground truth, use `s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl` +dataset jsonl file instead: + +```shell +uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl +``` + 2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference on the Financebench dataset. diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml index 672e81bca..bc4830152 100644 --- a/examples/dataset/retrieval_augmented_generation/pyproject.toml +++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml @@ -9,7 +9,9 @@ license = "Apache-2.0" requires-python = ">=3.8,<3.12" dependencies = [ + "fsspec>=2024.12.0", "kolena>=1.50.0,<2", + "s3fs>=0.6.0", ] [tool.uv] diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py index da34f68e5..fd3bf8135 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py @@ -15,3 +15,8 @@ DATASET = "financebench" TASK = "retrieval-augmented_generation" ID_FIELDS = ["financebench_id"] +MODEL_NAME = { + "baseline": "gpt-4o-baseline", + "qme": "gpt-4o-qme", + "query_decomp": "gpt-4o-qme-query-decomp", +} diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py 
b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py index 4a0a82cdc..cc10c6276 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py @@ -19,15 +19,12 @@ from retrieval_augmented_generation.constants import DATASET from retrieval_augmented_generation.constants import ID_FIELDS from retrieval_augmented_generation.constants import S3_BUCKET +from retrieval_augmented_generation.utils import to_locator from kolena.asset import DocumentAsset from kolena.dataset import upload_dataset -def to_locator(filename: str) -> str: - return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf" - - def to_document(evidence: list[dict[str, str]]) -> Optional[DocumentAsset]: if len(evidence) > 0: return DocumentAsset(to_locator(evidence[0]["doc_name"])) @@ -42,8 +39,9 @@ def get_pages(evidence: list[dict[str, str]]) -> str: def run(args: Namespace) -> None: df_dataset = pd.read_json(args.dataset_jsonl, lines=True) - df_dataset["document"] = df_dataset["evidence"].apply(to_document) - df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages) + if "evidence" in df_dataset.columns: + df_dataset["document"] = df_dataset["evidence"].apply(to_document) + df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages) upload_dataset(args.dataset_name, df_dataset, id_fields=ID_FIELDS) diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py new file mode 100644 index 000000000..e51ed6942 --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py @@ -0,0 +1,71 @@ +# Copyright 2021-2024 Kolena Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from argparse import ArgumentParser +from argparse import Namespace + +import pandas as pd +from retrieval_augmented_generation.constants import DATASET +from retrieval_augmented_generation.constants import MODEL_NAME +from retrieval_augmented_generation.constants import S3_BUCKET +from retrieval_augmented_generation.utils import to_locator + +from kolena.asset import DocumentAsset +from kolena.dataset import upload_results + + +def to_documents(retrieved_contents: list[dict[str, str]]) -> list: + if not retrieved_contents: + return [] + + documents = [] + for content in retrieved_contents: + documents.append( + DocumentAsset( + locator=to_locator(content["doc_name"]), + content=content["content"], # type: ignore[call-arg] + page_number=content["page_number"], # type: ignore[call-arg] + ), + ) + + return documents + + +def run(args: Namespace) -> None: + model_name = MODEL_NAME[args.model] + df_results = pd.read_json(f"{S3_BUCKET}/{DATASET}/results/raw/{model_name}.jsonl", lines=True) + df_results["retrieved_contents"] = df_results["retrieved_contents"].apply(to_documents) + upload_results(args.dataset_name, model_name, df_results) + + +def main() -> None: + ap = ArgumentParser() + ap.add_argument( + "model", + type=str, + default="baseline", + nargs="?", + choices=list(MODEL_NAME.keys()), + help="Name of the model to test.", + ) + ap.add_argument( + "--dataset-name", + type=str, + default=DATASET, + help="Optionally 
specify a custom dataset name to test.", + ) + run(ap.parse_args()) + + +if __name__ == "__main__": + main() diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py new file mode 100644 index 000000000..756e3aa73 --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py @@ -0,0 +1,19 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from retrieval_augmented_generation.constants import DATASET +from retrieval_augmented_generation.constants import S3_BUCKET + + +def to_locator(filename: str) -> str: + return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf" From 8af92d6c78f62fe8afec7cc864764f962522d743 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Wed, 8 Jan 2025 15:19:40 -0500 Subject: [PATCH 4/9] Compute metrics in upload_results script --- .../retrieval_augmented_generation/README.md | 27 +++++++---- .../retrieval_augmented_generation/metrics.py | 46 +++++++++++++++++++ .../upload_dataset.py | 11 +++-- .../upload_results.py | 14 +++++- 4 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md index 4ec6d5529..c3725e1bd 100644 --- a/examples/dataset/retrieval_augmented_generation/README.md +++ b/examples/dataset/retrieval_augmented_generation/README.md @@ -33,24 +33,29 @@ uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s 2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference on the Financebench dataset. -3. [`evaluate.py`](retrieval_augmented_generation/evaluate.py) tests a RAG system on the Financebench dataset. This -script requires ground truth annotations. Make sure you have annotated your dataset on Kolena before running this script. 
- -The `upload_results.py` and `evaluate.py` script defines command line arguments to select which model to evaluate — run +The `upload_results.py` script defines command line arguments to select which model to evaluate — run using the `--help` flag for more information: ```shell $ uv run python3 retrieval_augmented_generation/upload_results.py --help -usage: upload_results.py [-h] [--dataset DATASET] {ann,logreg} +usage: upload_results.py [-h] [--dataset-name DATASET_NAME] [--evaluate] [{baseline,qme,query_decomp}] positional arguments: - {ann,logreg} Name of the model to test. + {baseline,qme,query_decomp} + Name of the model to test. optional arguments: - -h, --help show this help message and exit - --dataset DATASET Optionally specify a custom dataset name to test. + -h, --help show this help message and exit + --dataset-name DATASET_NAME + Optionally specify a custom dataset name to test. + --evaluate Computes metrics on the model results. Requires dataset with ground truth. ``` +3. Label your dataset on [Kolena](https://app.kolena.com/redirect/) + +4. Run evaluation by using `--evaluate` option from the `upload_results.py` script. It will compute metrics on the +model results and upload the model results including the metrics to Kolena. + ## Quality Standards Guide Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to @@ -59,3 +64,9 @@ for details. Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for this workflow: + +### Metrics + +1. rate(`result.is_page_retrieved`=true): page-level retrieval rate +2. rate(`result.is_doc_retrieved`=true): doc-level retrieval rate +3. 
`is_correct` using [LLM prompt](https://docs.kolena.com/dataset/advanced-usage/llm-prompt-extraction/) diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py new file mode 100644 index 000000000..5887aa601 --- /dev/null +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py @@ -0,0 +1,46 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +from retrieval_augmented_generation.constants import ID_FIELDS + + +def is_doc_retrieved(retrieved_contents: list, doc_name: str) -> bool: + return any([doc_name in content.locator for content in retrieved_contents]) + + +def is_page_retrieved(retrieved_contents: list, doc_name: str, relevant_pages: list) -> bool: + retrieved_pages = [content.page_number for content in retrieved_contents if doc_name in content.locator] + + # NOTE: all relevant pages must be retrieved to be considered correct. + return set(relevant_pages).issubset(retrieved_pages) + + +def compute_metrics(df_dataset: pd.DataFrame, df_results: pd.DataFrame) -> pd.DataFrame: + ground_truth_columns = ["doc_name", "relevant_pages", "financebench_id"] + assert set(ground_truth_columns).issubset( + df_dataset.columns, + ), f"ground truth columns {ground_truth_columns} cannot be found in dataset dataframe." 
+ + df = df_results.merge(df_dataset[ground_truth_columns], on=ID_FIELDS, how="left") + + metrics = [] + for record in df.itertuples(): + metrics.append( + dict( + is_doc_retrieved=is_doc_retrieved(record.retrieved_contents, record.doc_name), + is_page_retrieved=is_page_retrieved(record.retrieved_contents, record.doc_name, record.relevant_pages), + ), + ) + + return pd.DataFrame(metrics) diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py index cc10c6276..07b242656 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py @@ -13,6 +13,7 @@ # limitations under the License. from argparse import ArgumentParser from argparse import Namespace +from typing import Any from typing import Optional import pandas as pd @@ -25,16 +26,16 @@ from kolena.dataset import upload_dataset -def to_document(evidence: list[dict[str, str]]) -> Optional[DocumentAsset]: +def to_document(evidence: list[dict[str, Any]]) -> Optional[DocumentAsset]: if len(evidence) > 0: - return DocumentAsset(to_locator(evidence[0]["doc_name"])) + return DocumentAsset(to_locator(str(evidence[0]["doc_name"]))) return None -def get_pages(evidence: list[dict[str, str]]) -> str: - pages = [e["evidence_page_num"] for e in evidence] - return ", ".join(map(str, pages)) +def get_pages(evidence: list[dict[str, Any]]) -> list[int]: + pages = [e["evidence_page_num"] + 1 for e in evidence] # financebench page number starts from 0 index + return pages def run(args: Namespace) -> None: diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py index e51ed6942..07003df22 
100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py @@ -13,18 +13,21 @@ # limitations under the License. from argparse import ArgumentParser from argparse import Namespace +from typing import Any import pandas as pd from retrieval_augmented_generation.constants import DATASET from retrieval_augmented_generation.constants import MODEL_NAME from retrieval_augmented_generation.constants import S3_BUCKET +from retrieval_augmented_generation.metrics import compute_metrics from retrieval_augmented_generation.utils import to_locator from kolena.asset import DocumentAsset +from kolena.dataset import download_dataset from kolena.dataset import upload_results -def to_documents(retrieved_contents: list[dict[str, str]]) -> list: +def to_documents(retrieved_contents: list[dict[str, Any]]) -> list: if not retrieved_contents: return [] @@ -45,6 +48,10 @@ def run(args: Namespace) -> None: model_name = MODEL_NAME[args.model] df_results = pd.read_json(f"{S3_BUCKET}/{DATASET}/results/raw/{model_name}.jsonl", lines=True) df_results["retrieved_contents"] = df_results["retrieved_contents"].apply(to_documents) + if args.evaluate: + df_dataset = download_dataset(args.dataset_name) + df_metrics = compute_metrics(df_dataset, df_results) + df_results = pd.concat([df_results, df_metrics], axis=1) upload_results(args.dataset_name, model_name, df_results) @@ -64,6 +71,11 @@ def main() -> None: default=DATASET, help="Optionally specify a custom dataset name to test.", ) + ap.add_argument( + "--evaluate", + action="store_true", + help="Computes metrics on the model results. 
Requires dataset with ground truth.", + ) run(ap.parse_args()) From 4486e0f0bda814c04765744642fb7a7b0984678c Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Thu, 9 Jan 2025 09:55:22 -0500 Subject: [PATCH 5/9] remove relative dependency --- examples/dataset/retrieval_augmented_generation/pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml index bc4830152..5d73afd79 100644 --- a/examples/dataset/retrieval_augmented_generation/pyproject.toml +++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml @@ -21,9 +21,6 @@ dev-dependencies = [ "pytest-depends>=1.0.1,<8", ] -[tool.uv.sources] -kolena = { path = "../../../" } - [build-system] requires = ["hatchling"] build-backend = "hatchling.build" From 505567ebbd4ea4789bc4b4d62036963b724d927d Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Thu, 9 Jan 2025 10:37:26 -0500 Subject: [PATCH 6/9] Added citation + updated dependency list --- .../retrieval_augmented_generation/README.md | 13 +++++++++++++ .../retrieval_augmented_generation/pyproject.toml | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md index c3725e1bd..3d04386b6 100644 --- a/examples/dataset/retrieval_augmented_generation/README.md +++ b/examples/dataset/retrieval_augmented_generation/README.md @@ -70,3 +70,16 @@ this workflow: 1. rate(`result.is_page_retrieved`=true): page-level retrieval rate 2. rate(`result.is_doc_retrieved`=true): doc-level retrieval rate 3. 
`is_correct` using [LLM prompt](https://docs.kolena.com/dataset/advanced-usage/llm-prompt-extraction/) + +## Citation + +``` +@misc{islam2023financebench, + title={FinanceBench: A New Benchmark for Financial Question Answering}, + author={Pranab Islam and Anand Kannappan and Douwe Kiela and Rebecca Qian and Nino Scherrer and Bertie Vidgen}, + year={2023}, + eprint={2311.11944}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml index 5d73afd79..7ccaa0b8f 100644 --- a/examples/dataset/retrieval_augmented_generation/pyproject.toml +++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml @@ -9,9 +9,8 @@ license = "Apache-2.0" requires-python = ">=3.8,<3.12" dependencies = [ - "fsspec>=2024.12.0", "kolena>=1.50.0,<2", - "s3fs>=0.6.0", + "s3fs>=2024.10.0", ] [tool.uv] From 66017afd229bba216be2b4e3b3415c1d98ad8fc3 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Thu, 9 Jan 2025 10:56:58 -0500 Subject: [PATCH 7/9] Updated README.md with more information + updated copyright year --- .../retrieval_augmented_generation/README.md | 29 +++++++++++++++++-- .../__init__.py | 2 +- .../constants.py | 2 +- .../retrieval_augmented_generation/metrics.py | 2 +- .../upload_dataset.py | 2 +- .../upload_results.py | 2 +- .../retrieval_augmented_generation/utils.py | 2 +- 7 files changed, 33 insertions(+), 8 deletions(-) diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md index 3d04386b6..bc7f94b0d 100644 --- a/examples/dataset/retrieval_augmented_generation/README.md +++ b/examples/dataset/retrieval_augmented_generation/README.md @@ -27,12 +27,37 @@ To run it without ground truth, use `s3://kolena-public-examples/financebench/ra dataset jsonl file instead: ```shell -uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl 
s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl +uv run python3 retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl ``` 2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference on the Financebench dataset. +There are three example RAG systems (`baseline`, `qme`, and `query_decomp`) from which we have collected inferences. +The inferences are stored in jsonl format and uploaded to the s3 bucket. +[Here](https://kolena-public-examples.s3.us-west-2.amazonaws.com/financebench/results/raw/gpt-4o-baseline.jsonl) is a +link to download `baseline` system's inference jsonl file as an example. +An inference to a question is formatted in the following JSON: +``` +{ + "retrieved_contents":[ + { + "content":"...", + "doc_name":"3M_2017_10K", + "page_number":48 + }, + { + "content":"...", + "doc_name":"3M_2018_10K", + "page_number":47 + } + ], + "answer":"Answer from RAG goes here", + "query_time":8.1, + "financebench_id":"financebench_id_03029" +} +``` + The `upload_results.py` script defines command line arguments to select which model to evaluate — run using the `--help` flag for more information: @@ -59,7 +84,7 @@ model results and upload the model results including the metrics to Kolena. ## Quality Standards Guide Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to -test the rain forecast models. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide +test the RAG systems. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide for details. 
Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py index 5f584e024..c6d3f3da1 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 Kolena Inc. +# Copyright 2021-2025 Kolena Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py index fd3bf8135..10303e53c 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 Kolena Inc. +# Copyright 2021-2025 Kolena Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py index 5887aa601..3e1614c1f 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 Kolena Inc. +# Copyright 2021-2025 Kolena Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py index 07b242656..19ba857ca 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 Kolena Inc. +# Copyright 2021-2025 Kolena Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py index 07003df22..457cf8fa9 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 Kolena Inc. +# Copyright 2021-2025 Kolena Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py index 756e3aa73..58d78358d 100644 --- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py +++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 Kolena Inc. +# Copyright 2021-2025 Kolena Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 29aeca69abb2969ccf546ea67968ec9a9ef81cc7 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Fri, 10 Jan 2025 10:41:52 -0500 Subject: [PATCH 8/9] Pinning pandera version on problematic integration test --- examples/workflow/speaker_diarization/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/workflow/speaker_diarization/pyproject.toml b/examples/workflow/speaker_diarization/pyproject.toml index c09a8d852..b363c20f0 100644 --- a/examples/workflow/speaker_diarization/pyproject.toml +++ b/examples/workflow/speaker_diarization/pyproject.toml @@ -14,6 +14,7 @@ pyannote-core = "^5.0.0" scipy = "^1.11.3" jiwer = "^3.0.3" numpy = "^1.19" +pandera = ">=0.22.1,<1" [tool.poetry.group.dev.dependencies] pre-commit = "^2.17" From 3d31b9cf178f7fa0d4cc95a6f66e8eed35842915 Mon Sep 17 00:00:00 2001 From: Yoohee Choi Date: Fri, 10 Jan 2025 11:11:29 -0500 Subject: [PATCH 9/9] Pinned multimethod on workflow/speaker_diarization example --- examples/workflow/speaker_diarization/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/workflow/speaker_diarization/pyproject.toml b/examples/workflow/speaker_diarization/pyproject.toml index b363c20f0..72d5a3a85 100644 --- a/examples/workflow/speaker_diarization/pyproject.toml +++ b/examples/workflow/speaker_diarization/pyproject.toml @@ -14,7 +14,7 @@ pyannote-core = "^5.0.0" scipy = "^1.11.3" jiwer = "^3.0.3" numpy = "^1.19" -pandera = ">=0.22.1,<1" +multimethod = "^1.10,<2" [tool.poetry.group.dev.dependencies] pre-commit = "^2.17"