Add colab demo (#290)

* add demo.ipynb * add avatar * new readme * new readme * new link * Revisions to the colab demo (#293) * Update instructions for retrieval (#295) * add instruction to create .env * fix event loop already running in asyncio * Add process dataset list (#298) * add process_generated_and_retrieved_datasets * change parameters * change notebook * add variable names * lint --------- Co-authored-by: zhaochen20 <[email protected]> * Fix some wording and asyncio (#297) * Fix some wording and asyncio * Update notebook_demo.ipynb Co-authored-by: Eren Chenyang Zhao <[email protected]> --------- Co-authored-by: Eren Chenyang Zhao <[email protected]> * Add wrap input (#300) * add wrap_input * add wrap_input * Update notebook_demo.ipynb Co-authored-by: Graham Neubig <[email protected]> --------- Co-authored-by: zhaochen20 <[email protected]> Co-authored-by: Graham Neubig <[email protected]> * Wording modifications to notebook demo (#299) * Made some modifications to wording * Revert batch size * Small modifications * Rename demo files to prompt2model_demo (#307) * Make a directory for the dataset retriever * Squash a few bugs * Mention A100 GPUs * Increase executor batch size * Fix typo * Fix bug in try it out * Update prompt2model_demo.ipynb * Update tests/dataset_processor_test.py Co-authored-by: Vijay Viswanathan <[email protected]> --------- Co-authored-by: zhaochen20 <[email protected]> Co-authored-by: Graham Neubig <[email protected]> Co-authored-by: Vijay Viswanathan <[email protected]>
neulab · Aug 29, 2023 · 3b420bf · 3b420bf
1 parent dcac753
commit 3b420bf
Show file tree

Hide file tree

Showing 11 changed files with 914 additions and 56 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@ __pycache__
 build
 dist
 prompt2model.egg-info
+.env
 .vscode
 .mypy_cache
 .pytest_cache
@@ -17,6 +18,11 @@ tests/wandb
 cached_generated_dataset/
 generated_dataset/
 huggingface_data/huggingface_datasets/dataset_index.json
+huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index
 huggingface_data/huggingface_models/
 retrieved_dataset_dict/
 status.yaml
+
+# Outputs generated by the colab demo
+trained_model/
+trained_tokenizer/
diff --git a/README.md b/README.md
@@ -1,9 +1,10 @@
-# prompt2model - Generate Deployable Models from Instructions
+# Prompt2Model - Generate Deployable Models from Instructions
 
 [![PyPI version](https://badge.fury.io/py/prompt2model.svg)](https://badge.fury.io/py/prompt2model)
 ![Github Actions CI tests](https://github.com/neulab/prompt2model/actions/workflows/ci.yml/badge.svg)
 [![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)
 [![Discord](https://img.shields.io/discord/1144245269001678959)](https://discord.gg/UCy9csEmFc)
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neulab/prompt2model/blob/main/prompt2model_demo.ipynb)
 
 `Prompt2Model` is a system that takes a natural
 language task description (like the prompts used for
@@ -14,11 +15,22 @@ special-purpose model that is conducive for deployment.
 
 ## Quick Start
 
+### Notebook
+
+You can run our demo of `Prompt2Model` through a notebook:
+
+- [Open Locally](./prompt2model_demo.ipynb)
+- [Open in Colab](https://colab.research.google.com/github/neulab/prompt2model/blob/main/prompt2model_demo.ipynb)
+
+### Command Line
+
+You can also run the demo from the command line.
+
 ```bash
 pip install prompt2model
 ```
 
-Our current `prompt2model` implementation uses
+Our current `Prompt2Model` implementation uses
 the OpenAI API. Accordingly, you need to:
 
 - Sign up on the OpenAI website and obtain an
@@ -36,11 +48,10 @@ export OPENAI_API_KEY=<your key>
 You can then run
 
 ```bash
-python cli_demo.py
+python prompt2model_demo.py
 ```
 
-to
-create a small model from a prompt, as shown in
+to create a small model from a prompt, as shown in
 the demo video below. This script must be run on a
 device with an internet connection to access the OpenAI
 API. For best results, run

diff --git a/prompt2model/dataset_generator/openai_gpt.py b/prompt2model/dataset_generator/openai_gpt.py
@@ -11,6 +11,7 @@
 from dataclasses import dataclass
 from pathlib import Path
 
+import nest_asyncio
 import openai
 from datasets import Dataset
 from tqdm import tqdm
@@ -26,6 +27,7 @@
     handle_openai_error,
 )
 
+nest_asyncio.apply()
 logger = get_formatted_logger("DatasetGenerator")
 
 

diff --git a/prompt2model/dataset_processor/base.py b/prompt2model/dataset_processor/base.py
@@ -26,7 +26,7 @@ def __init__(self, has_encoder: bool, eos_token: str | None = None) -> None:
 
     @staticmethod
     @abstractmethod
-    def post_process_example(
+    def _post_process_example(
         example: dict,
         instruction: str,
         task_id: int,
@@ -83,13 +83,13 @@ def filter_empty_strings(example: dict) -> bool:
                 "input_col" in example and "output_col" in example
             ), "Example dictionary must have 'input_col' and 'output_col' keys."
             # Check if 'input_col' and 'output_col' are both non-empty strings
-            return bool(example["input_col"]) and bool(example["output_col"])
+            return bool(str(example["input_col"])) and bool(str(example["output_col"]))
 
         for task_id, dataset_dict in enumerate(dataset_dicts):
             modified_dataset_dict = {}
             for dataset_split in list(dataset_dict.keys()):
                 mapping_function = partial(
-                    self.post_process_example,
+                    self._post_process_example,
                     instruction=instruction,
                     task_id=task_id,
                     has_encoder=self.has_encoder,
@@ -104,3 +104,110 @@ def filter_empty_strings(example: dict) -> bool:
             modified_dataset_dict = datasets.DatasetDict(modified_dataset_dict)
             modified_dataset_dicts.append(modified_dataset_dict)
         return modified_dataset_dicts
+
+    @staticmethod
+    def _split_dataset_into_dataset_dict(
+        dataset,
+        train_proportion: float = 0.8,
+        val_proportion: float = 0.1,
+        maximum_example_num: int | None = None,
+    ) -> datasets.DatasetDict:
+        """Split a given dataset into `train`, `val`, and `test` splits.
+
+        This function takes a dataset and splits it based on specified
+        proportions for train, val and test. It respects a maximum
+        number of examples to be included in each set, if specified.
+
+        Args:
+            dataset: The original dataset to be split.
+            train_proportion: Proportion of examples for the `train` set.
+            val_proportion: Proportion of examples for the `val` set.
+            maximum_example_num: Maximum number of examples
+                to include in each set.
+
+        Returns:
+            datasets.DatasetDict: A dictionary containing the `train`,
+                `val`, and `test` datasets.
+        """
+        num_of_examples = len(dataset)
+        train_num = int(train_proportion * num_of_examples)
+        val_num = int(val_proportion * num_of_examples)
+        test_num = num_of_examples - train_num - val_num
+
+        if maximum_example_num is not None:
+            train_num = min(train_num, maximum_example_num)
+            val_num = min(val_num, maximum_example_num)
+            test_num = min(test_num, maximum_example_num)
+
+        train_dataset = datasets.Dataset.from_dict(dataset[:train_num])
+        val_dataset = datasets.Dataset.from_dict(
+            dataset[train_num : train_num + val_num]
+        )
+        test_dataset = datasets.Dataset.from_dict(
+            dataset[train_num + val_num : train_num + val_num + test_num]
+        )
+
+        dataset_dict = datasets.DatasetDict(
+            {"train": train_dataset, "val": val_dataset, "test": test_dataset}
+        )
+        return dataset_dict
+
+    @staticmethod
+    def wrap_single_input(instruction: str, input: str):
+        """Wrap an input string in text2text format to serve as model input.
+
+        Args:
+            instruction: The instruction used as a prefix to explain the task.
+            input: An input string to be wrapped.
+
+        Returns:
+            A wrapped input string.
+        """
+        return f"<task 0>{instruction}\nExample:\n{input}\nLabel:\n"
+
+    def process_dataset_lists(
+        self,
+        instruction: str,
+        dataset_list: list[datasets.Dataset],
+        train_proportion: float = 0.8,
+        val_proportion: float = 0.1,
+        maximum_example_num: int | None = None,
+    ) -> list[datasets.DatasetDict]:
+        """Post-processes both the generated and retrieved datasets.
+
+        This function takes in datasets generated by `DatasetGenerator`
+        and retrieved by `DatasetRetriever`. It modifies these datasets
+        based on a given instruction, converting all examples into a
+        text-to-text format.
+
+        Args:
+            instruction: The instruction used as a prefix to explain the task.
+            dataset_list: A list of datasets. It can be either generated by
+                the DatasetGenerator or retrieved by the DatasetRetriever.
+            train_proportion: The proportion of examples used for `train`.
+            val_proportion: The proportion of examples used for `val`.
+            maximum_example_num: The maximum number of examples to
+                be used for `train`, `val` and `test`.
+
+        Returns:
+            list[datasets.DatasetDict]: A list of DatasetDicts in which all
+                examples are converted into text2text format.
+
+        Note:
+            The DatasetRetriever returns a DatasetDict with multiple splits.
+                Any of these splits can be passed into this function.
+            The remaining proportion after allocating to `train` and
+                `val` will be used for the `test` set.
+        """
+        if train_proportion + val_proportion >= 1:
+            raise ValueError(
+                f"train_proportion {train_proportion} + val_proportion {val_proportion} must be less than 1."  # noqa E501
+            )
+
+        dataset_dicts = [
+            self._split_dataset_into_dataset_dict(
+                each, train_proportion, val_proportion, maximum_example_num
+            )
+            for each in dataset_list
+        ]
+        return self.process_dataset_dict(instruction, dataset_dicts)
diff --git a/prompt2model/dataset_processor/mock.py b/prompt2model/dataset_processor/mock.py
@@ -24,7 +24,7 @@ def process_dataset_dict(
         return dataset_dicts
 
     @staticmethod
-    def post_process_example(
+    def _post_process_example(
         example: dict,
         instruction: str,
         task_id: int,

diff --git a/prompt2model/dataset_processor/textualize.py b/prompt2model/dataset_processor/textualize.py
@@ -41,7 +41,7 @@ def __init__(self, has_encoder: bool, eos_token: str | None = None) -> None:
             )
 
     @staticmethod
-    def post_process_example(
+    def _post_process_example(
         example: dict,
         instruction: str,
         task_id: int,

diff --git a/prompt2model/demo_creator/create.py b/prompt2model/demo_creator/create.py
@@ -3,6 +3,7 @@
 import gradio as gr
 import mdtex2html
 
+from prompt2model.dataset_processor import TextualizeProcessor
 from prompt2model.model_executor import GenerationModelExecutor
 from prompt2model.prompt_parser import OpenAIInstructionParser
 
@@ -35,7 +36,12 @@ def postprocess(self, y):
 
     gr.Chatbot.postprocess = postprocess
 
-    def response(message):
+    def response(message: str):
+        if not message.startswith("<task 0>"):
+            dataset_processor = TextualizeProcessor(has_encoder=True)
+            message = dataset_processor.wrap_single_input(
+                prompt_parser.instruction, message
+            )
         response = model_executor.make_single_prediction(message)
         prediction = response.prediction
         return prediction