From 5fc2ec9141c5b59d5fcb99668061a225940012ec Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Thu, 2 Jan 2025 18:54:54 +0530 Subject: [PATCH 1/3] Allow hf dataset id to be loaded by training_data_path Signed-off-by: Dushyant Behl --- tests/test_sft_trainer.py | 34 ++++++++++++++++- tuning/data/data_handlers.py | 2 +- tuning/data/data_processors.py | 69 ++++++++++++++++++++-------------- 3 files changed, 73 insertions(+), 32 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index 529f21b66..0bca40afb 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -25,7 +25,7 @@ import tempfile # Third Party -from datasets.exceptions import DatasetGenerationError +from datasets.exceptions import DatasetGenerationError, DatasetNotFoundError from transformers.trainer_callback import TrainerCallback import pytest import torch @@ -326,7 +326,7 @@ def test_run_train_fails_training_data_path_not_exist(): """Check fails when data path not found.""" updated_data_path_args = copy.deepcopy(DATA_ARGS) updated_data_path_args.training_data_path = "fake/path" - with pytest.raises(ValueError): + with pytest.raises(DatasetNotFoundError): sft_trainer.train(MODEL_ARGS, updated_data_path_args, TRAIN_ARGS, None) @@ -998,6 +998,36 @@ def test_run_chat_style_ft_using_dataconfig(datafiles, dataconfigfile): assert 'Provide two rhyming words for the word "love"' in output_inference +@pytest.mark.parametrize( + "data_args", + [ + ( + # sample hugging face dataset id + configs.DataArguments( + training_data_path="lhoestq/demo1", + data_formatter_template="### Text:{{review}} \n\n### Stars: {{star}}", + response_template="\n### Stars:", + ) + ) + ], +) +def test_run_e2e_with_hf_dataset_id(data_args): + """ + Check if we can run an e2e test with a hf dataset id as training_data_path. + """ + with tempfile.TemporaryDirectory() as tempdir: + train_args = copy.deepcopy(TRAIN_ARGS) + train_args.output_dir = tempdir + + sft_trainer.train(MODEL_ARGS, data_args, train_args) + + # validate ft tuning configs + _validate_training(tempdir) + + # validate inference + _test_run_inference(checkpoint_path=_get_checkpoint_path(tempdir)) + + ############################# Helper functions ############################# def _test_run_causallm_ft(training_args, model_args, data_args, tempdir): train_args = copy.deepcopy(training_args) diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py index d666a6e76..5b80dc4bb 100644 --- a/tuning/data/data_handlers.py +++ b/tuning/data/data_handlers.py @@ -130,7 +130,7 @@ def replace_text(match_obj): if index_object not in element: raise KeyError("Requested template string is not a valid key in dict") - return element[index_object] + return str(element[index_object]) return { dataset_text_field: re.sub(r"{{([\s0-9a-zA-Z_\-\.]+)}}", replace_text, template) diff --git a/tuning/data/data_processors.py b/tuning/data/data_processors.py index 170bc2a81..bf69a3f3d 100644 --- a/tuning/data/data_processors.py +++ b/tuning/data/data_processors.py @@ -130,42 +130,53 @@ def _load_dataset(data_path=None, builder=None, data_files=None, data_dir=None): f"Failed to generate the dataset from the provided {context}." 
) from e - if datafile: - loader = get_loader_for_filepath(file_path=datafile) - if loader in (None, ""): - raise ValueError(f"data path is invalid [{datafile}]") - return _load_dataset(builder=loader, data_files=[datafile]) - - data_paths = datasetconfig.data_paths - builder = datasetconfig.builder - all_datasets = [] + def _try_load_dataset(dataset_path, dataset_builder): + """ + Helper funciton to call load dataset on case by case basis to ensure we handle + directories and files (with or without builders) and hf datasets. - for data_path in data_paths: + Args: + data_path: The path argument for load_dataset (directory, file, pattern, dataset_id) + builder: Optional builder to use if provided. + Returns: dataset + """ # CASE 1: User passes directory - if os.path.isdir(data_path): # Checks if path exists and isdirectory + if os.path.isdir(dataset_path): # Checks if path exists and it is a dir # Directory case - if builder: + if dataset_builder: # Load using a builder with a data_dir - dataset = _load_dataset(builder=builder, data_dir=data_path) - else: - # Load directly from the directory - dataset = _load_dataset(data_path=data_path) - else: - # Non-directory (file, pattern, HF dataset name) - # If no builder provided, attempt to infer one - effective_builder = ( - builder if builder else get_loader_for_filepath(data_path) + return _load_dataset(builder=dataset_builder, data_dir=dataset_path) + + # If no builder then load directly from the directory + return _load_dataset(data_path=dataset_path) + + # Non-directory (file, pattern, HF dataset name) + # If no builder provided, attempt to infer one + effective_builder = ( + dataset_builder + if dataset_builder + else get_loader_for_filepath(dataset_path) + ) + + if effective_builder: + # CASE 2: Files passed with builder. Load using the builder and specific files + return _load_dataset( + builder=effective_builder, data_files=[dataset_path] ) - if effective_builder: - # CASE 2: Files passed with builder. Load using the builder and specific files - dataset = _load_dataset( - builder=effective_builder, data_files=[data_path] - ) - else: - # CASE 3: User passes files/folder/pattern/HF_dataset which has no builder - dataset = _load_dataset(data_path=data_path) + # CASE 3: User passes files/folder/pattern/HF_dataset which has no builder + # Still no builder, try if this is a dataset id + return _load_dataset(data_path=dataset_path) + if datafile: + return _try_load_dataset(datafile, None) + + data_paths = datasetconfig.data_paths + builder = datasetconfig.builder + all_datasets = [] + + for data_path in data_paths: + dataset = _try_load_dataset(data_path, builder) all_datasets.append(dataset) # Logs warning if datasets have different columns From ef5021a6ddf5bc3418c0389d4b0b7e6063117247 Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Thu, 2 Jan 2025 19:27:23 +0530 Subject: [PATCH 2/3] update README Signed-off-by: Dushyant Behl --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 47e84a848..cbacc6b58 100644 --- a/README.md +++ b/README.md @@ -62,13 +62,13 @@ pip install fms-hf-tuning[aim] For more details on how to enable and use the trackers, Please see, [the experiment tracking section below](#experiment-tracking). 
 ## Data Support
-Users can pass training data in a single file using the `--training_data_path` argument along with other arguments required for various [use cases](#use-cases-supported-with-training_data_path-argument) (see details below) and the file can be in any of the [supported formats](#supported-data-formats). Alternatively, you can use our powerful [data preprocessing backend](./docs/advanced-data-preprocessing.md) to preprocess datasets on the fly.
+Users can pass training data either a single file or a hugging face dataset id, using the `--training_data_path` argument along with other arguments required for various [use cases](#use-cases-supported-with-training_data_path-argument) (see details below) and if user choose to pass the file, it can be in any of the [supported formats](#supported-data-formats). Alternatively, you can use our powerful [data preprocessing backend](./docs/advanced-data-preprocessing.md) to preprocess datasets on the fly.
 
 Below, we mention the list of supported data usecases via `--training_data_path` argument.
 For details of our advanced data preprocessing see more details in [Advanced Data Preprocessing](./docs/advanced-data-preprocessing.md).
 
 ## Supported Data Formats
-We support the following data formats via `--training_data_path` argument
+We support the following file formats via `--training_data_path` argument
 
 Data Format | Tested Support
 ------------|---------------
@@ -77,6 +77,8 @@ JSONL | ✅
 PARQUET | ✅
 ARROW | ✅
 
+As mentioned above, we also support passing a Hugging Face dataset ID directly via the `--training_data_path` argument.
+
 ## Use cases supported with `training_data_path` argument
 
 ### 1. Data formats with a single sequence and a specified response_template to use for masking on completion.

From b4ed260b9c90c66fcc1bbf575a2ec6a86b71694f Mon Sep 17 00:00:00 2001
From: Abhishek
Date: Thu, 2 Jan 2025 10:31:52 -0500
Subject: [PATCH 3/3] minor changes

Signed-off-by: Abhishek
---
 README.md | 2 +-
 tuning/data/data_processors.py | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index cbacc6b58..c3198083a 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ pip install fms-hf-tuning[aim]
 For more details on how to enable and use the trackers, Please see, [the experiment tracking section below](#experiment-tracking).
 
 ## Data Support
-Users can pass training data either a single file or a hugging face dataset id, using the `--training_data_path` argument along with other arguments required for various [use cases](#use-cases-supported-with-training_data_path-argument) (see details below) and if user choose to pass the file, it can be in any of the [supported formats](#supported-data-formats). Alternatively, you can use our powerful [data preprocessing backend](./docs/advanced-data-preprocessing.md) to preprocess datasets on the fly.
+Users can pass training data as either a single file or a Hugging Face dataset ID using the `--training_data_path` argument along with other arguments required for various [use cases](#use-cases-supported-with-training_data_path-argument) (see details below). If the user chooses to pass a file, it can be in any of the [supported formats](#supported-data-formats). Alternatively, you can use our powerful [data preprocessing backend](./docs/advanced-data-preprocessing.md) to preprocess datasets on the fly.
 
 Below, we mention the list of supported data usecases via `--training_data_path` argument.
For details of our advanced data preprocessing see more details in [Advanced Data Preprocessing](./docs/advanced-data-preprocessing.md). diff --git a/tuning/data/data_processors.py b/tuning/data/data_processors.py index bf69a3f3d..bdac6947b 100644 --- a/tuning/data/data_processors.py +++ b/tuning/data/data_processors.py @@ -132,14 +132,17 @@ def _load_dataset(data_path=None, builder=None, data_files=None, data_dir=None): def _try_load_dataset(dataset_path, dataset_builder): """ - Helper funciton to call load dataset on case by case basis to ensure we handle - directories and files (with or without builders) and hf datasets. + Helper function to call load dataset on case by case basis to ensure we handle + directories and files (with or without builders) and hf datasets. Args: - data_path: The path argument for load_dataset (directory, file, pattern, dataset_id) - builder: Optional builder to use if provided. + dataset_path: Path of directory/file, pattern, or hf dataset id. + dataset_builder: Optional builder to use if provided. Returns: dataset """ + if not dataset_path: + raise ValueError("Invalid dataset path") + # CASE 1: User passes directory if os.path.isdir(dataset_path): # Checks if path exists and it is a dir # Directory case
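Usage note (not part of the patches above): with this change, `training_data_path` can point at a Hugging Face dataset id instead of a local file. The sketch below mirrors the new e2e test added in PATCH 1/3; the `tuning.config.configs` import path, the placeholder model name, output directory, and epoch count are illustrative assumptions, not prescribed by the patches.

# Minimal sketch: fine-tune using a HF dataset id as training_data_path.
# Assumes fms-hf-tuning with this patch series is installed; model name and
# output_dir are placeholders, adjust them to your environment.
from tuning import sft_trainer
from tuning.config import configs

model_args = configs.ModelArguments(model_name_or_path="facebook/opt-125m")  # placeholder model
data_args = configs.DataArguments(
    training_data_path="lhoestq/demo1",  # HF dataset id, no local file needed
    data_formatter_template="### Text:{{review}} \n\n### Stars: {{star}}",
    response_template="\n### Stars:",
)
train_args = configs.TrainingArguments(output_dir="/tmp/hf-dataset-id-run", num_train_epochs=1)

# Same call signature exercised by the new test; the optional peft config is omitted here.
sft_trainer.train(model_args, data_args, train_args)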