Optionally include the pipeline script in the hub when pushing your distiset (#762)

* Add option to include the pipeline script as another artifact when pushing a distiset to the hub

* Add documentation for the pipeline script uploaded

* Inform of the new pipeline script uploaded to the repository in the README

* Add docs explaining how to run a pipeline using the CLI

* Run python file with distilabel pipeline from CLI

* Update docs with new running method

* Run script by importing the pipeline from the remote module

* Update src/distilabel/cli/pipeline/app.py

Co-authored-by: Gabriel Martín Blázquez <[email protected]>

* Update src/distilabel/cli/pipeline/utils.py

Co-authored-by: Gabriel Martín Blázquez <[email protected]>

* Update docs/sections/how_to_guides/advanced/cli/index.md

Co-authored-by: Gabriel Martín Blázquez <[email protected]>

* Update to ImportError as per code review

* Add missing import

---------

Co-authored-by: Gabriel Martín Blázquez <[email protected]>
plaguss and gabrielmbmb authored Jul 4, 2024
1 parent 647d040 commit cc36fa5
Showing 6 changed files with 238 additions and 42 deletions.
56 changes: 35 additions & 21 deletions docs/sections/how_to_guides/advanced/cli/index.md
@@ -55,7 +55,7 @@

### `distilabel pipeline run`

We can also run a `Pipeline` from the CLI by just pointing to the same `pipeline.yaml` file or a URL pointing to it and calling `distilabel pipeline run`:
We can also run a `Pipeline` from the CLI by just pointing to the same `pipeline.yaml` file or a URL pointing to it and calling `distilabel pipeline run`. Alternatively, a URL pointing to a Python script containing a distilabel pipeline can be used:

```bash
$ distilabel pipeline run --help

Run a Distilabel pipeline.

╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
* --config TEXT Path or URL to the Distilabel pipeline configuration file. │
│ [default: None] │
│ [required] │
│ --param PARSE_RUNTIME_PARAM [default: (dynamic)] │
│ --ignore-cache --no-ignore-cache Whether to ignore the cache and re-run the pipeline from │
│ scratch. │
│ [default: no-ignore-cache] │
│ --repo-id TEXT The Hugging Face Hub repository ID to push the resulting │
│ dataset to. │
│ [default: None] │
│ --commit-message TEXT The commit message to use when pushing the dataset. │
│ [default: None] │
│ --private --no-private Whether to make the resulting dataset private on the Hub. │
│ [default: no-private] │
│ --token TEXT The Hugging Face Hub API token to use when pushing the │
│ dataset. │
│ [default: None] │
│ --help Show this message and exit. │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --param PARSE_RUNTIME_PARAM [default: (dynamic)] │
│ --config TEXT Path or URL to the Distilabel pipeline configuration file. │
│ [default: None] │
│ --script TEXT URL pointing to a python script containing a distilabel │
│ pipeline. │
│ [default: None] │
│ --pipeline-variable-name TEXT Name of the pipeline in a script. I.e. the 'pipeline'
│ variable in `with Pipeline(...) as pipeline:...`. │
│ [default: pipeline] │
│ --ignore-cache --no-ignore-cache Whether to ignore the cache and re-run the pipeline from │
│ scratch. │
│ [default: no-ignore-cache] │
│ --repo-id TEXT The Hugging Face Hub repository ID to push the resulting │
│ dataset to. │
│ [default: None] │
│ --commit-message TEXT The commit message to use when pushing the dataset. │
│ [default: None] │
│ --private --no-private Whether to make the resulting dataset private on the Hub. │
│ [default: no-private] │
│ --token TEXT The Hugging Face Hub API token to use when pushing the │
│ dataset. │
│ [default: None] │
│ --help Show this message and exit. │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```

Using the `--config` option, we must pass a path to a `pipeline.yaml` file.
To specify the runtime parameters of the steps, we need to use the `--param` option and pass the value of the parameter in the following format:

```bash
distilabel pipeline run --config "https://huggingface.co/datasets/distilabel-int…" \
--param to_argilla.dataset_workspace=admin
```

Or, using `--script`, we can directly pass a remote Python script (keep in mind `--config` and `--script` are mutually exclusive):

```bash
distilabel pipeline run --script "https://huggingface.co/datasets/distilabel-internal-testing/pipe_nothing_test/raw/main/pipe_nothing.py"
```

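For reference, such a script only needs to expose the `Pipeline` object as a module-level variable (named `pipeline` by default, configurable via `--pipeline-variable-name`). A minimal, hypothetical sketch of a script the CLI could run (not the contents of the linked `pipe_nothing.py` file) might look like:

```python
# Hypothetical example of a script runnable via `distilabel pipeline run --script ...`.
# The module-level `pipeline` variable is what the CLI looks up by default.
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts

with Pipeline(name="remote-pipeline-example") as pipeline:
    LoadDataFromDicts(data=[{"instruction": "Say hello."}])

if __name__ == "__main__":
    # The same file can still be run directly with `python <script>.py`.
    distiset = pipeline.run()
```
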
You can also pass runtime parameters to the Python script, just as we saw with the `--config` option.

Again, this helps with the reproducibility of the results and simplifies sharing not only the final dataset but also the process used to generate it.
17 changes: 17 additions & 0 deletions docs/sections/how_to_guides/advanced/distiset.md
@@ -67,9 +67,26 @@
``` py
distiset.push_to_hub(
    commit_message="Initial commit",
    private=False,
    token=os.getenv("HF_TOKEN"),
    generate_card=True,
    include_script=False
)
```

!!! info "New since version 1.3.0"
    Since version `1.3.0` you can automatically push the script that created your pipeline to the same repository. For example, assuming you have a file like the following:

    ``` py title="sample_pipe.py"
    with Pipeline() as pipe:
        ...

    distiset = pipe.run()
    distiset.push_to_hub(
        "my-org/my-dataset",
        include_script=True
    )
    ```

    After running the pipeline, you can visit the repository and find the file `sample_pipe.py` stored there, which simplifies sharing your pipeline with the community.
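
    As a quick check, the pushed repository can then be loaded back like any other Hub dataset. A hypothetical sketch (the configuration name depends on the leaf steps of your pipeline, `"default"` being common for a single leaf):

    ``` py
    from datasets import load_dataset

    # Assumes the repository pushed above; adjust the configuration and split
    # names to the ones shown on the dataset page.
    ds = load_dataset("my-org/my-dataset", "default", split="train")
    ```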

### Save and load from disk

Take into account that these methods work as `datasets.load_from_disk` and `datasets.Dataset.save_to_disk`, so the arguments are directly passed to those methods. This means you can also make use of the `storage_options` argument to save your [`Distiset`][distilabel.distiset.Distiset] in your cloud provider, including the distilabel artifacts (`pipeline.yaml`, `pipeline.log` and the `README.md` with the dataset card). You can read more in the `datasets` documentation [here](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets).
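
For example, a minimal sketch of a local round trip plus a cloud save; the `Distiset` built by hand here is purely for illustration (normally it is the object returned by `pipeline.run()`), and the bucket path and `storage_options` keys are placeholders that depend on your `fsspec`-compatible provider:

``` py
from datasets import Dataset
from distilabel.distiset import Distiset

# Hypothetical Distiset built by hand purely for illustration; normally this
# is the object returned by `pipeline.run()`.
distiset = Distiset({"default": Dataset.from_dict({"instruction": ["Say hello."]})})

# Local round trip: the distilabel artifacts are saved next to the data.
distiset.save_to_disk("my-distiset")
distiset = Distiset.load_from_disk("my-distiset")

# Hypothetical cloud save, forwarding `storage_options` to `datasets`.
distiset.save_to_disk(
    "s3://my-bucket/my-distiset",
    storage_options={"key": "...", "secret": "..."},
)
```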
35 changes: 32 additions & 3 deletions src/distilabel/cli/pipeline/app.py
@@ -13,7 +13,7 @@
# limitations under the License.

import re
from typing import Any, List, Tuple
from typing import Any, List, Optional, Tuple

import typer
from typing_extensions import Annotated
@@ -39,12 +39,22 @@ def parse_runtime_param(value: str) -> Tuple[List[str], str]:

@app.command(name="run", help="Run a Distilabel pipeline.")
def run(
config: ConfigOption,
# `param` is `List[Tuple[Tuple[str, ...], str]]` after parsing
param: Annotated[
List[Any],
typer.Option(help="", parser=parse_runtime_param, default_factory=list),
],
config: Optional[str] = typer.Option(
None, help="Path or URL to the distilabel pipeline configuration file."
),
script: Optional[str] = typer.Option(
None,
help="URL pointing to a python script containing a distilabel pipeline.",
),
pipeline_variable_name: str = typer.Option(
default="pipeline",
help="Name of the pipeline in a script. I.e. the 'pipeline' variable in `with Pipeline(...) as pipeline:...`.",
),
ignore_cache: bool = typer.Option(
False, help="Whether to ignore the cache and re-run the pipeline from scratch."
),
@@ -64,8 +74,27 @@
) -> None:
from distilabel.cli.pipeline.utils import get_pipeline, parse_runtime_parameters

    if script:
        if config:
            typer.secho(
                "Only one of `--config` or `--script` can be provided.",
                fg=typer.colors.RED,
                bold=True,
            )
            raise typer.Exit(code=1)
        do_run = typer.prompt("This will run a remote script, are you sure? (y/n)")
        if do_run.lower() != "y":
            raise typer.Exit(code=0)
    if not config and not script:
        typer.secho(
            "Either `--config` or `--script` must be provided.",
            fg=typer.colors.RED,
            bold=True,
        )
        raise typer.Exit(code=1)

try:
pipeline = get_pipeline(config)
pipeline = get_pipeline(config or script, pipeline_name=pipeline_variable_name)
except Exception as e:
typer.secho(str(e), fg=typer.colors.RED, bold=True)
raise typer.Exit(code=1) from e
100 changes: 87 additions & 13 deletions src/distilabel/cli/pipeline/utils.py
@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple

@@ -76,6 +78,24 @@ def valid_http_url(url: str) -> bool:
return True


def _download_remote_file(url: str) -> "requests.Response":
    """Downloads a file from a remote URL, authenticating against the Hugging Face
    Hub with the `HF_TOKEN` environment variable when available.

    Args:
        url: URL of the file to download.

    Returns:
        The `requests.Response` of the request.
    """
    if "huggingface.co" in url and "HF_TOKEN" in os.environ:
        headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
    else:
        headers = None
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response


def get_config_from_url(url: str) -> Dict[str, Any]:
"""Loads the pipeline configuration from a URL pointing to a JSON or YAML file.
@@ -92,12 +112,7 @@
raise ValueError(
f"Unsupported file format for '{url}'. Only JSON and YAML are supported"
)
if "huggingface.co" in url and "HF_TOKEN" in os.environ:
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
else:
headers = None
response = requests.get(url, headers=headers)
response.raise_for_status()
response = _download_remote_file(url)

if url.endswith((".yaml", ".yml")):
content = response.content.decode("utf-8")
@@ -106,11 +121,53 @@
return response.json()


def get_pipeline(config: str) -> "BasePipeline":
"""Get a pipeline from a configuration file.
def get_pipeline_from_url(url: str, pipeline_name: str = "pipeline") -> "BasePipeline":
    """Downloads the file to the current working directory and loads the pipeline object
    from a python script.

    Args:
        url: The URL pointing to the python script with the pipeline definition.
        pipeline_name: The name of the pipeline in the script.
            I.e: `with Pipeline(...) as pipeline:...`.

    Returns:
        The pipeline instantiated.

    Raises:
        ValueError: If the file format is not supported.
        ImportError: If the pipeline object cannot be imported from the script.
    """
    if not url.endswith(".py"):
        raise ValueError(
            f"Unsupported file format for '{url}'. It must be a python file."
        )
    response = _download_remote_file(url)

    content = response.content.decode("utf-8")
    script_local = Path.cwd() / Path(url).name
    script_local.write_text(content)

    # Add the current working directory to sys.path so the downloaded script can be imported
    sys.path.insert(0, os.getcwd())
    module = importlib.import_module(str(Path(url).stem))
    pipeline = getattr(module, pipeline_name, None)
    if not pipeline:
        raise ImportError(
            f"The script must contain an object with the pipeline named: '{pipeline_name}' that can be imported"
        )

    return pipeline


def get_pipeline(
config_or_script: str, pipeline_name: str = "pipeline"
) -> "BasePipeline":
"""Get a pipeline from a configuration file or a remote python script.
Args:
config: The path or URL to the pipeline configuration file.
config_or_script: The path or URL to the pipeline configuration file
or URL to a python script.
pipeline_name: The name of the pipeline in the script.
I.e: `with Pipeline(...) as pipeline:...`.
Returns:
The pipeline.
@@ -119,14 +176,31 @@
ValueError: If the file format is not supported.
FileNotFoundError: If the configuration file does not exist.
"""
if valid_http_url(config):
data = get_config_from_url(config)
return Pipeline.from_dict(data)
config = script = None
if config_or_script.endswith((".json", ".yaml", ".yml")):
config = config_or_script
elif config_or_script.endswith(".py"):
script = config_or_script
else:
raise ValueError(
"The file must be a valid config file or python script with a pipeline."
)

if valid_http_url(config_or_script):
if config:
data = get_config_from_url(config)
return Pipeline.from_dict(data)
return get_pipeline_from_url(script, pipeline_name=pipeline_name)

if not config:
raise ValueError(
f"To run a pipeline from a python script, run it as `python {script}`"
)

if Path(config).is_file():
return Pipeline.from_file(config)

raise FileNotFoundError(f"Config file '{config}' does not exist.")
raise FileNotFoundError(f"File '{config_or_script}' does not exist.")


def display_pipeline_information(pipeline: "BasePipeline") -> None: