From b774a8d36aca35ce684fae1aa0de65a54890a683 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Wed, 8 Jan 2025 09:06:56 -0500
Subject: [PATCH 1/9] Upload dataset script

---
 .../retrieval_augmented_generation/README.md  | 54 +++++++++++++++
 .../pyproject.toml                            | 28 ++++++++
 .../__init__.py                               | 13 ++++
 .../constants.py                              | 17 +++++
 .../upload_dataset.py                         | 68 +++++++++++++++++++
 5 files changed, 180 insertions(+)
 create mode 100644 examples/dataset/retrieval_augmented_generation/README.md
 create mode 100644 examples/dataset/retrieval_augmented_generation/pyproject.toml
 create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py
 create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
 create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py

diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md
new file mode 100644
index 000000000..27e83d5ac
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/README.md
@@ -0,0 +1,54 @@
+# Example Integration: Retrieval Augmented Generation (RAG)
+
+This example integration uses the [Financebench](https://github.com/patronus-ai/financebench) dataset to
+demonstrate testing a RAG system on Kolena.
+
+## Setup
+
+This project uses [uv](https://docs.astral.sh/uv/) for packaging and Python dependency management. To get started,
+install project dependencies from [`pyproject.toml`](./pyproject.toml) by running:
+
+```shell
+uv sync
+```
+
+## Usage
+
+The data for this example integration lives in the publicly accessible S3 bucket `s3://kolena-public-examples`.
+
+First, ensure that the `KOLENA_TOKEN` environment variable is populated in your environment. See our
+[initialization documentation](https://docs.kolena.com/installing-kolena/#initialization) for details.
+
+This project defines three scripts that perform the following operations:
+
+1. [`upload_dataset.py`](retrieval_augmented_generation/upload_dataset.py) creates the Financebench dataset on Kolena
+
+2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference
+on the Financebench dataset.
+
+3. [`evaluate.py`](retrieval_augmented_generation/evaluate.py) tests a RAG system on the Financebench dataset. This
+script requires ground truth annotations. Make sure you have annotated your dataset on Kolena before running this script.
+
+The `upload_results.py` and `evaluate.py` script defines command line arguments to select which model to evaluate — run
+using the `--help` flag for more information:
+
+```shell
+$ uv run python3 retrieval_augmented_generation/upload_results.py --help
+usage: upload_results.py [-h] [--dataset DATASET] {ann,logreg}
+
+positional arguments:
+  {ann,logreg}       Name of the model to test.
+
+optional arguments:
+  -h, --help         show this help message and exit
+  --dataset DATASET  Optionally specify a custom dataset name to test.
+```
+
+## Quality Standards Guide
+
+Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to
+test the rain forecast models. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide
+for details.
+
+Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for
+this workflow:
diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml
new file mode 100644
index 000000000..d4528ba02
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml
@@ -0,0 +1,28 @@
+[project]
+name = "retrieval_augmented_generation"
+version = "0.1.0"
+description = "Kolena Datasets Example integration for RAG"
+authors = [
+    { name = "Kolena Engineering", email = "eng@kolena.com" }
+]
+license = "Apache-2.0"
+requires-python = ">=3.8,<3.12"
+
+dependencies = [
+    "kolena",
+    "numpy>=1.24.4",
+]
+
+[tool.uv]
+dev-dependencies = [
+    "pre-commit>=2.17,<3",
+    "pytest>=7,<8",
+    "pytest-depends>=1.0.1,<8",
+]
+
+[tool.uv.sources]
+kolena = { path = "../../../" }
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py
new file mode 100644
index 000000000..5f584e024
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
new file mode 100644
index 000000000..da34f68e5
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
@@ -0,0 +1,17 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+S3_BUCKET = "s3://kolena-public-examples"
+DATASET = "financebench"
+TASK = "retrieval-augmented_generation"
+ID_FIELDS = ["financebench_id"]
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
new file mode 100644
index 000000000..4a0a82cdc
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
@@ -0,0 +1,68 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from argparse import ArgumentParser
+from argparse import Namespace
+from typing import Optional
+
+import pandas as pd
+from retrieval_augmented_generation.constants import DATASET
+from retrieval_augmented_generation.constants import ID_FIELDS
+from retrieval_augmented_generation.constants import S3_BUCKET
+
+from kolena.asset import DocumentAsset
+from kolena.dataset import upload_dataset
+
+
+def to_locator(filename: str) -> str:
+    return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf"
+
+
+def to_document(evidence: list[dict[str, str]]) -> Optional[DocumentAsset]:
+    if len(evidence) > 0:
+        return DocumentAsset(to_locator(evidence[0]["doc_name"]))
+
+    return None
+
+
+def get_pages(evidence: list[dict[str, str]]) -> str:
+    pages = [e["evidence_page_num"] for e in evidence]
+    return ", ".join(map(str, pages))
+
+
+def run(args: Namespace) -> None:
+    df_dataset = pd.read_json(args.dataset_jsonl, lines=True)
+    df_dataset["document"] = df_dataset["evidence"].apply(to_document)
+    df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages)
+    upload_dataset(args.dataset_name, df_dataset, id_fields=ID_FIELDS)
+
+
+def main() -> None:
+    ap = ArgumentParser()
+    ap.add_argument(
+        "--dataset-jsonl",
+        type=str,
+        default=f"{S3_BUCKET}/{DATASET}/raw/financebench_open_source.jsonl",
+        help="JSONL file specifying dataset. See default JSONL for details",
+    )
+    ap.add_argument(
+        "--dataset-name",
+        type=str,
+        default=DATASET,
+        help="Optionally specify a name of the dataset",
+    )
+    run(ap.parse_args())
+
+
+if __name__ == "__main__":
+    main()

From 84c47a3a75f5bd2736eca0f521a4ff7d135688e5 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Wed, 8 Jan 2025 09:09:43 -0500
Subject: [PATCH 2/9] Uses latest kolena package

---
 examples/dataset/retrieval_augmented_generation/pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml
index d4528ba02..672e81bca 100644
--- a/examples/dataset/retrieval_augmented_generation/pyproject.toml
+++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml
@@ -9,8 +9,7 @@ license = "Apache-2.0"
 requires-python = ">=3.8,<3.12"
 
 dependencies = [
-    "kolena",
-    "numpy>=1.24.4",
+    "kolena>=1.50.0,<2",
 ]
 
 [tool.uv]

From 6621bcee6b6f3657a80f99752eab67e913ec18a5 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Wed, 8 Jan 2025 12:07:43 -0500
Subject: [PATCH 3/9] Added upload results and allow uploading dataset without
 GT

---
 .../retrieval_augmented_generation/README.md  |  7 ++
 .../pyproject.toml                            |  2 +
 .../constants.py                              |  5 ++
 .../upload_dataset.py                         | 10 ++-
 .../upload_results.py                         | 71 +++++++++++++++++++
 .../retrieval_augmented_generation/utils.py   | 19 +++++
 6 files changed, 108 insertions(+), 6 deletions(-)
 create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
 create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py

diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md
index 27e83d5ac..4ec6d5529 100644
--- a/examples/dataset/retrieval_augmented_generation/README.md
+++ b/examples/dataset/retrieval_augmented_generation/README.md
@@ -23,6 +23,13 @@ This project defines three scripts that perform the following operations:
 
 1. [`upload_dataset.py`](retrieval_augmented_generation/upload_dataset.py) creates the Financebench dataset on Kolena
 
+To run it without ground truth, use `s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl`
+dataset jsonl file instead:
+
+```shell
+uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl
+```
+
 2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference
 on the Financebench dataset.
 
diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml
index 672e81bca..bc4830152 100644
--- a/examples/dataset/retrieval_augmented_generation/pyproject.toml
+++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml
@@ -9,7 +9,9 @@ license = "Apache-2.0"
 requires-python = ">=3.8,<3.12"
 
 dependencies = [
+    "fsspec>=2024.12.0",
     "kolena>=1.50.0,<2",
+    "s3fs>=0.6.0",
 ]
 
 [tool.uv]
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
index da34f68e5..fd3bf8135 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
@@ -15,3 +15,8 @@
 DATASET = "financebench"
 TASK = "retrieval-augmented_generation"
 ID_FIELDS = ["financebench_id"]
+MODEL_NAME = {
+    "baseline": "gpt-4o-baseline",
+    "qme": "gpt-4o-qme",
+    "query_decomp": "gpt-4o-qme-query-decomp",
+}
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
index 4a0a82cdc..cc10c6276 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
@@ -19,15 +19,12 @@
 from retrieval_augmented_generation.constants import DATASET
 from retrieval_augmented_generation.constants import ID_FIELDS
 from retrieval_augmented_generation.constants import S3_BUCKET
+from retrieval_augmented_generation.utils import to_locator
 
 from kolena.asset import DocumentAsset
 from kolena.dataset import upload_dataset
 
 
-def to_locator(filename: str) -> str:
-    return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf"
-
-
 def to_document(evidence: list[dict[str, str]]) -> Optional[DocumentAsset]:
     if len(evidence) > 0:
         return DocumentAsset(to_locator(evidence[0]["doc_name"]))
@@ -42,8 +39,9 @@ def get_pages(evidence: list[dict[str, str]]) -> str:
 
 def run(args: Namespace) -> None:
     df_dataset = pd.read_json(args.dataset_jsonl, lines=True)
-    df_dataset["document"] = df_dataset["evidence"].apply(to_document)
-    df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages)
+    if "evidence" in df_dataset.columns:
+        df_dataset["document"] = df_dataset["evidence"].apply(to_document)
+        df_dataset["relevant_pages"] = df_dataset["evidence"].apply(get_pages)
     upload_dataset(args.dataset_name, df_dataset, id_fields=ID_FIELDS)
 
 
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
new file mode 100644
index 000000000..e51ed6942
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
@@ -0,0 +1,71 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from argparse import ArgumentParser
+from argparse import Namespace
+
+import pandas as pd
+from retrieval_augmented_generation.constants import DATASET
+from retrieval_augmented_generation.constants import MODEL_NAME
+from retrieval_augmented_generation.constants import S3_BUCKET
+from retrieval_augmented_generation.utils import to_locator
+
+from kolena.asset import DocumentAsset
+from kolena.dataset import upload_results
+
+
+def to_documents(retrieved_contents: list[dict[str, str]]) -> list:
+    if not retrieved_contents:
+        return []
+
+    documents = []
+    for content in retrieved_contents:
+        documents.append(
+            DocumentAsset(
+                locator=to_locator(content["doc_name"]),
+                content=content["content"],  # type: ignore[call-arg]
+                page_number=content["page_number"],  # type: ignore[call-arg]
+            ),
+        )
+
+    return documents
+
+
+def run(args: Namespace) -> None:
+    model_name = MODEL_NAME[args.model]
+    df_results = pd.read_json(f"{S3_BUCKET}/{DATASET}/results/raw/{model_name}.jsonl", lines=True)
+    df_results["retrieved_contents"] = df_results["retrieved_contents"].apply(to_documents)
+    upload_results(args.dataset_name, model_name, df_results)
+
+
+def main() -> None:
+    ap = ArgumentParser()
+    ap.add_argument(
+        "model",
+        type=str,
+        default="baseline",
+        nargs="?",
+        choices=list(MODEL_NAME.keys()),
+        help="Name of the model to test.",
+    )
+    ap.add_argument(
+        "--dataset-name",
+        type=str,
+        default=DATASET,
+        help="Optionally specify a custom dataset name to test.",
+    )
+    run(ap.parse_args())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py
new file mode 100644
index 000000000..756e3aa73
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py
@@ -0,0 +1,19 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from retrieval_augmented_generation.constants import DATASET
+from retrieval_augmented_generation.constants import S3_BUCKET
+
+
+def to_locator(filename: str) -> str:
+    return f"{S3_BUCKET}/{DATASET}/data/{filename}.pdf"

From 8af92d6c78f62fe8afec7cc864764f962522d743 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Wed, 8 Jan 2025 15:19:40 -0500
Subject: [PATCH 4/9] Compute metrics in upload_results script

---
 .../retrieval_augmented_generation/README.md  | 27 +++++++----
 .../retrieval_augmented_generation/metrics.py | 46 +++++++++++++++++++
 .../upload_dataset.py                         | 11 +++--
 .../upload_results.py                         | 14 +++++-
 4 files changed, 84 insertions(+), 14 deletions(-)
 create mode 100644 examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py

diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md
index 4ec6d5529..c3725e1bd 100644
--- a/examples/dataset/retrieval_augmented_generation/README.md
+++ b/examples/dataset/retrieval_augmented_generation/README.md
@@ -33,24 +33,29 @@ uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s
 2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference
 on the Financebench dataset.
 
-3. [`evaluate.py`](retrieval_augmented_generation/evaluate.py) tests a RAG system on the Financebench dataset. This
-script requires ground truth annotations. Make sure you have annotated your dataset on Kolena before running this script.
-
-The `upload_results.py` and `evaluate.py` script defines command line arguments to select which model to evaluate — run
+The `upload_results.py` script defines command line arguments to select which model to evaluate — run
 using the `--help` flag for more information:
 
 ```shell
 $ uv run python3 retrieval_augmented_generation/upload_results.py --help
-usage: upload_results.py [-h] [--dataset DATASET] {ann,logreg}
+usage: upload_results.py [-h] [--dataset-name DATASET_NAME] [--evaluate] [{baseline,qme,query_decomp}]
 
 positional arguments:
-  {ann,logreg}       Name of the model to test.
+  {baseline,qme,query_decomp}
+                        Name of the model to test.
 
 optional arguments:
-  -h, --help         show this help message and exit
-  --dataset DATASET  Optionally specify a custom dataset name to test.
+  -h, --help            show this help message and exit
+  --dataset-name DATASET_NAME
+                        Optionally specify a custom dataset name to test.
+  --evaluate            Computes metrics on the model results. Requires dataset with ground truth.
 ```
 
+3. Label your dataset on [Kolena](https://app.kolena.com/redirect/)
+
+4. Run evaluation by using `--evaluate` option from the `upload_results.py` script. It will compute metrics on the
+model results and upload the model results including the metrics to Kolena.
+
 ## Quality Standards Guide
 
 Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to
@@ -59,3 +64,9 @@ for details.
 
 Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for
 this workflow:
+
+### Metrics
+
+1. rate(`result.is_page_retrieved`=true): page-level retrieval rate
+2. rate(`result.is_doc_retrieved`=true): doc-level retrieval rate
+3. `is_correct` using [LLM prompt](https://docs.kolena.com/dataset/advanced-usage/llm-prompt-extraction/)
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py
new file mode 100644
index 000000000..5887aa601
--- /dev/null
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py
@@ -0,0 +1,46 @@
+# Copyright 2021-2024 Kolena Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pandas as pd
+from retrieval_augmented_generation.constants import ID_FIELDS
+
+
+def is_doc_retrieved(retrieved_contents: list, doc_name: str) -> bool:
+    return any([doc_name in content.locator for content in retrieved_contents])
+
+
+def is_page_retrieved(retrieved_contents: list, doc_name: str, relevant_pages: list) -> bool:
+    retrieved_pages = [content.page_number for content in retrieved_contents if doc_name in content.locator]
+
+    # NOTE: all relevant pages must be retrieved to be considered correct.
+    return set(relevant_pages).issubset(retrieved_pages)
+
+
+def compute_metrics(df_dataset: pd.DataFrame, df_results: pd.DataFrame) -> pd.DataFrame:
+    ground_truth_columns = ["doc_name", "relevant_pages", "financebench_id"]
+    assert set(ground_truth_columns).issubset(
+        df_dataset.columns,
+    ), f"ground truth columns {ground_truth_columns} cannot be found in dataset dataframe."
+
+    df = df_results.merge(df_dataset[ground_truth_columns], on=ID_FIELDS, how="left")
+
+    metrics = []
+    for record in df.itertuples():
+        metrics.append(
+            dict(
+                is_doc_retrieved=is_doc_retrieved(record.retrieved_contents, record.doc_name),
+                is_page_retrieved=is_page_retrieved(record.retrieved_contents, record.doc_name, record.relevant_pages),
+            ),
+        )
+
+    return pd.DataFrame(metrics)
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
index cc10c6276..07b242656 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from argparse import ArgumentParser
 from argparse import Namespace
+from typing import Any
 from typing import Optional
 
 import pandas as pd
@@ -25,16 +26,16 @@
 from kolena.dataset import upload_dataset
 
 
-def to_document(evidence: list[dict[str, str]]) -> Optional[DocumentAsset]:
+def to_document(evidence: list[dict[str, Any]]) -> Optional[DocumentAsset]:
     if len(evidence) > 0:
-        return DocumentAsset(to_locator(evidence[0]["doc_name"]))
+        return DocumentAsset(to_locator(str(evidence[0]["doc_name"])))
 
     return None
 
 
-def get_pages(evidence: list[dict[str, str]]) -> str:
-    pages = [e["evidence_page_num"] for e in evidence]
-    return ", ".join(map(str, pages))
+def get_pages(evidence: list[dict[str, Any]]) -> list[int]:
+    pages = [e["evidence_page_num"] + 1 for e in evidence]  # financebench page number starts from 0 index
+    return pages
 
 
 def run(args: Namespace) -> None:
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
index e51ed6942..07003df22 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
@@ -13,18 +13,21 @@
 # limitations under the License.
 from argparse import ArgumentParser
 from argparse import Namespace
+from typing import Any
 
 import pandas as pd
 from retrieval_augmented_generation.constants import DATASET
 from retrieval_augmented_generation.constants import MODEL_NAME
 from retrieval_augmented_generation.constants import S3_BUCKET
+from retrieval_augmented_generation.metrics import compute_metrics
 from retrieval_augmented_generation.utils import to_locator
 
 from kolena.asset import DocumentAsset
+from kolena.dataset import download_dataset
 from kolena.dataset import upload_results
 
 
-def to_documents(retrieved_contents: list[dict[str, str]]) -> list:
+def to_documents(retrieved_contents: list[dict[str, Any]]) -> list:
     if not retrieved_contents:
         return []
 
@@ -45,6 +48,10 @@ def run(args: Namespace) -> None:
     model_name = MODEL_NAME[args.model]
     df_results = pd.read_json(f"{S3_BUCKET}/{DATASET}/results/raw/{model_name}.jsonl", lines=True)
     df_results["retrieved_contents"] = df_results["retrieved_contents"].apply(to_documents)
+    if args.evaluate:
+        df_dataset = download_dataset(args.dataset_name)
+        df_metrics = compute_metrics(df_dataset, df_results)
+        df_results = pd.concat([df_results, df_metrics], axis=1)
     upload_results(args.dataset_name, model_name, df_results)
 
 
@@ -64,6 +71,11 @@ def main() -> None:
         default=DATASET,
         help="Optionally specify a custom dataset name to test.",
     )
+    ap.add_argument(
+        "--evaluate",
+        action="store_true",
+        help="Computes metrics on the model results. Requires dataset with ground truth.",
+    )
     run(ap.parse_args())
 
 

From 4486e0f0bda814c04765744642fb7a7b0984678c Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Thu, 9 Jan 2025 09:55:22 -0500
Subject: [PATCH 5/9] remove relative dependency

---
 examples/dataset/retrieval_augmented_generation/pyproject.toml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml
index bc4830152..5d73afd79 100644
--- a/examples/dataset/retrieval_augmented_generation/pyproject.toml
+++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml
@@ -21,9 +21,6 @@ dev-dependencies = [
     "pytest-depends>=1.0.1,<8",
 ]
 
-[tool.uv.sources]
-kolena = { path = "../../../" }
-
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

From 505567ebbd4ea4789bc4b4d62036963b724d927d Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Thu, 9 Jan 2025 10:37:26 -0500
Subject: [PATCH 6/9] Added citation + updated dependency list

---
 .../retrieval_augmented_generation/README.md        | 13 +++++++++++++
 .../retrieval_augmented_generation/pyproject.toml   |  3 +--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md
index c3725e1bd..3d04386b6 100644
--- a/examples/dataset/retrieval_augmented_generation/README.md
+++ b/examples/dataset/retrieval_augmented_generation/README.md
@@ -70,3 +70,16 @@ this workflow:
 1. rate(`result.is_page_retrieved`=true): page-level retrieval rate
 2. rate(`result.is_doc_retrieved`=true): doc-level retrieval rate
 3. `is_correct` using [LLM prompt](https://docs.kolena.com/dataset/advanced-usage/llm-prompt-extraction/)
+
+## Citation
+
+```
+@misc{islam2023financebench,
+      title={FinanceBench: A New Benchmark for Financial Question Answering},
+      author={Pranab Islam and Anand Kannappan and Douwe Kiela and Rebecca Qian and Nino Scherrer and Bertie Vidgen},
+      year={2023},
+      eprint={2311.11944},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
diff --git a/examples/dataset/retrieval_augmented_generation/pyproject.toml b/examples/dataset/retrieval_augmented_generation/pyproject.toml
index 5d73afd79..7ccaa0b8f 100644
--- a/examples/dataset/retrieval_augmented_generation/pyproject.toml
+++ b/examples/dataset/retrieval_augmented_generation/pyproject.toml
@@ -9,9 +9,8 @@ license = "Apache-2.0"
 requires-python = ">=3.8,<3.12"
 
 dependencies = [
-    "fsspec>=2024.12.0",
     "kolena>=1.50.0,<2",
-    "s3fs>=0.6.0",
+    "s3fs>=2024.10.0",
 ]
 
 [tool.uv]

From 66017afd229bba216be2b4e3b3415c1d98ad8fc3 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Thu, 9 Jan 2025 10:56:58 -0500
Subject: [PATCH 7/9] Updated README.md with more information + updated
 copyright year

---
 .../retrieval_augmented_generation/README.md  | 29 +++++++++++++++++--
 .../__init__.py                               |  2 +-
 .../constants.py                              |  2 +-
 .../retrieval_augmented_generation/metrics.py |  2 +-
 .../upload_dataset.py                         |  2 +-
 .../upload_results.py                         |  2 +-
 .../retrieval_augmented_generation/utils.py   |  2 +-
 7 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/examples/dataset/retrieval_augmented_generation/README.md b/examples/dataset/retrieval_augmented_generation/README.md
index 3d04386b6..bc7f94b0d 100644
--- a/examples/dataset/retrieval_augmented_generation/README.md
+++ b/examples/dataset/retrieval_augmented_generation/README.md
@@ -27,12 +27,37 @@ To run it without ground truth, use `s3://kolena-public-examples/financebench/ra
 dataset jsonl file instead:
 
 ```shell
-uv run python retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl
+uv run python3 retrieval_augmented_generation/upload_dataset.py --dataset-jsonl s3://kolena-public-examples/financebench/raw/financebench_without_gt.jsonl
 ```
 
 2. [`upload_results.py`](retrieval_augmented_generation/upload_results.py) uploads a RAG system's raw inference
 on the Financebench dataset.
 
+There are three example RAG systems (`baseline`, `qme`, and `query_decomp`) from which we have collected inferences.
+The inferences are stored in jsonl format and uploaded to the S3 bucket.
+[Here](https://kolena-public-examples.s3.us-west-2.amazonaws.com/financebench/results/raw/gpt-4o-baseline.jsonl) is a
+link to download the `baseline` system's inference jsonl file as an example.
+Each inference for a question is formatted as a JSON object like the following:
+```json
+{
+  "retrieved_contents":[
+    {
+      "content":"...",
+      "doc_name":"3M_2017_10K",
+      "page_number":48
+    },
+    {
+      "content":"...",
+      "doc_name":"3M_2018_10K",
+      "page_number":47
+    }
+  ],
+  "answer":"Answer from RAG goes here",
+  "query_time":8.1,
+  "financebench_id":"financebench_id_03029"
+}
+```
+
 The `upload_results.py` script defines command line arguments to select which model to evaluate — run
 using the `--help` flag for more information:
 
@@ -59,7 +84,7 @@ model results and upload the model results including the metrics to Kolena.
 ## Quality Standards Guide
 
 Once the dataset and results have been uploaded to Kolena, visit [Kolena](https://app.kolena.com/redirect/) to
-test the rain forecast models. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide
+test the RAG systems. See our [QuickStart](https://docs.kolena.com/dataset/quickstart/) guide
 for details.
 
 Here are our [Quality Standards](https://docs.kolena.com/dataset/core-concepts/quality-standard/) recommendations for
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py
index 5f584e024..c6d3f3da1 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 Kolena Inc.
+# Copyright 2021-2025 Kolena Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
index fd3bf8135..10303e53c 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/constants.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 Kolena Inc.
+# Copyright 2021-2025 Kolena Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py
index 5887aa601..3e1614c1f 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/metrics.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 Kolena Inc.
+# Copyright 2021-2025 Kolena Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
index 07b242656..19ba857ca 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_dataset.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 Kolena Inc.
+# Copyright 2021-2025 Kolena Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
index 07003df22..457cf8fa9 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/upload_results.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 Kolena Inc.
+# Copyright 2021-2025 Kolena Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py
index 756e3aa73..58d78358d 100644
--- a/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py
+++ b/examples/dataset/retrieval_augmented_generation/retrieval_augmented_generation/utils.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 Kolena Inc.
+# Copyright 2021-2025 Kolena Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 29aeca69abb2969ccf546ea67968ec9a9ef81cc7 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Fri, 10 Jan 2025 10:41:52 -0500
Subject: [PATCH 8/9] Pinning pandera version on problematic integration test

---
 examples/workflow/speaker_diarization/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/workflow/speaker_diarization/pyproject.toml b/examples/workflow/speaker_diarization/pyproject.toml
index c09a8d852..b363c20f0 100644
--- a/examples/workflow/speaker_diarization/pyproject.toml
+++ b/examples/workflow/speaker_diarization/pyproject.toml
@@ -14,6 +14,7 @@ pyannote-core = "^5.0.0"
 scipy = "^1.11.3"
 jiwer = "^3.0.3"
 numpy = "^1.19"
+pandera = ">=0.22.1,<1"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^2.17"

From 3d31b9cf178f7fa0d4cc95a6f66e8eed35842915 Mon Sep 17 00:00:00 2001
From: Yoohee Choi <yoohee@kolena.io>
Date: Fri, 10 Jan 2025 11:11:29 -0500
Subject: [PATCH 9/9] Pinned multimethod on workflow/speaker_diarization
 example

---
 examples/workflow/speaker_diarization/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/workflow/speaker_diarization/pyproject.toml b/examples/workflow/speaker_diarization/pyproject.toml
index b363c20f0..72d5a3a85 100644
--- a/examples/workflow/speaker_diarization/pyproject.toml
+++ b/examples/workflow/speaker_diarization/pyproject.toml
@@ -14,7 +14,7 @@ pyannote-core = "^5.0.0"
 scipy = "^1.11.3"
 jiwer = "^3.0.3"
 numpy = "^1.19"
-pandera = ">=0.22.1,<1"
+multimethod = "^1.10,<2"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^2.17"