Skip to content

Commit

Permalink
Add colab demo (#290)
Browse files Browse the repository at this point in the history
* add demo.ipynb

* add avatar

* new readme

* new readme

* new link

* Revisions to the colab demo (#293)

* Update instructions for retrieval (#295)

* add instruction to create .env

* fix event loop already running in asyncio

* Add process dataset list (#298)

* add process_generated_and_retrieved_datasets

* change parameters

* change notebook

* add variable names

* lint

---------

Co-authored-by: zhaochen20 <[email protected]>

* Fix some wording and asyncio (#297)

* Fix some wording and asyncio

* Update notebook_demo.ipynb

Co-authored-by: Eren Chenyang Zhao <[email protected]>

---------

Co-authored-by: Eren Chenyang Zhao <[email protected]>

* Add wrap input (#300)

* add wrap_input

* add wrap_input

* Update notebook_demo.ipynb

Co-authored-by: Graham Neubig <[email protected]>

---------

Co-authored-by: zhaochen20 <[email protected]>
Co-authored-by: Graham Neubig <[email protected]>

* Wording modifications to notebook demo (#299)

* Made some modifications to wording

* Revert batch size

* Small modifications

* Rename demo files to prompt2model_demo (#307)

* Make a directory for the dataset retriever

* Squash a few bugs

* Mention A100 GPUs

* Increase executor batch size

* Fix typo

* Fix bug in try it out

* Update prompt2model_demo.ipynb

* Update tests/dataset_processor_test.py

Co-authored-by: Vijay Viswanathan <[email protected]>

---------

Co-authored-by: zhaochen20 <[email protected]>
Co-authored-by: Graham Neubig <[email protected]>
Co-authored-by: Vijay Viswanathan <[email protected]>
  • Loading branch information
4 people authored Aug 29, 2023
1 parent dcac753 commit 3b420bf
Show file tree
Hide file tree
Showing 11 changed files with 914 additions and 56 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ __pycache__
build
dist
prompt2model.egg-info
.env
.vscode
.mypy_cache
.pytest_cache
Expand All @@ -17,6 +18,11 @@ tests/wandb
cached_generated_dataset/
generated_dataset/
huggingface_data/huggingface_datasets/dataset_index.json
huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index
huggingface_data/huggingface_models/
retrieved_dataset_dict/
status.yaml

# Outputs generated by the colab demo
trained_model/
trained_tokenizer/
21 changes: 16 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# prompt2model - Generate Deployable Models from Instructions
# Prompt2Model - Generate Deployable Models from Instructions

[![PyPI version](https://badge.fury.io/py/prompt2model.svg)](https://badge.fury.io/py/prompt2model)
![Github Actions CI tests](https://github.com/neulab/prompt2model/actions/workflows/ci.yml/badge.svg)
[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)
[![Discord](https://img.shields.io/discord/1144245269001678959)](https://discord.gg/UCy9csEmFc)
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neulab/prompt2model/blob/main/prompt2model_demo.ipynb)

`Prompt2Model` is a system that takes a natural
language task description (like the prompts used for
Expand All @@ -14,11 +15,22 @@ special-purpose model that is conducive for deployment.

## Quick Start

### Notebook

You can run our demo of `Prompt2Model` through a notebook:

- [Open Locally](./prompt2model_demo.ipynb)
- [Open in Colab](https://colab.research.google.com/github/neulab/prompt2model/blob/main/prompt2model_demo.ipynb)

### Command Line

You can also run the demo from the command line.

```bash
pip install prompt2model
```

Our current `prompt2model` implementation uses
Our current `Prompt2Model` implementation uses
the OpenAI API. Accordingly, you need to:

- Sign up on the OpenAI website and obtain an
Expand All @@ -36,11 +48,10 @@ export OPENAI_API_KEY=<your key>
You can then run

```bash
python cli_demo.py
python prompt2model_demo.py
```

to
create a small model from a prompt, as shown in
to create a small model from a prompt, as shown in
the demo video below. This script must be run on a
device with an internet connection to access the OpenAI
API. For best results, run
Expand Down
2 changes: 2 additions & 0 deletions prompt2model/dataset_generator/openai_gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from dataclasses import dataclass
from pathlib import Path

import nest_asyncio
import openai
from datasets import Dataset
from tqdm import tqdm
Expand All @@ -26,6 +27,7 @@
handle_openai_error,
)

nest_asyncio.apply()
logger = get_formatted_logger("DatasetGenerator")


Expand Down
113 changes: 110 additions & 3 deletions prompt2model/dataset_processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self, has_encoder: bool, eos_token: str | None = None) -> None:

@staticmethod
@abstractmethod
def post_process_example(
def _post_process_example(
example: dict,
instruction: str,
task_id: int,
Expand Down Expand Up @@ -83,13 +83,13 @@ def filter_empty_strings(example: dict) -> bool:
"input_col" in example and "output_col" in example
), "Example dictionary must have 'input_col' and 'output_col' keys."
# Check if 'input_col' and 'output_col' are both non-empty strings
return bool(example["input_col"]) and bool(example["output_col"])
return bool(str(example["input_col"])) and bool(str(example["output_col"]))

for task_id, dataset_dict in enumerate(dataset_dicts):
modified_dataset_dict = {}
for dataset_split in list(dataset_dict.keys()):
mapping_function = partial(
self.post_process_example,
self._post_process_example,
instruction=instruction,
task_id=task_id,
has_encoder=self.has_encoder,
Expand All @@ -104,3 +104,110 @@ def filter_empty_strings(example: dict) -> bool:
modified_dataset_dict = datasets.DatasetDict(modified_dataset_dict)
modified_dataset_dicts.append(modified_dataset_dict)
return modified_dataset_dicts

@staticmethod
def _split_dataset_into_dataset_dict(
    dataset: datasets.Dataset,
    train_proportion: float = 0.8,
    val_proportion: float = 0.1,
    maximum_example_num: int | None = None,
) -> datasets.DatasetDict:
    """Split a given dataset into `train`, `val`, and `test` splits.

    The splits are taken as consecutive slices from the start of the
    dataset: first `train`, then `val`, then `test`. The `test` split
    receives whatever remains after `train` and `val` are allocated.
    If `maximum_example_num` is given, each split is capped at that
    many examples; examples beyond the capped slices are dropped.

    Args:
        dataset: The original dataset to be split.
        train_proportion: Proportion of examples for the `train` set.
        val_proportion: Proportion of examples for the `val` set.
        maximum_example_num: Maximum number of examples
            to include in each set.

    Returns:
        datasets.DatasetDict: A dictionary containing the `train`,
            `val`, and `test` datasets.
    """
    num_of_examples = len(dataset)
    train_num = int(train_proportion * num_of_examples)
    val_num = int(val_proportion * num_of_examples)
    # The test split takes the remainder, so truncation from int() above
    # never loses examples when no cap is applied.
    test_num = num_of_examples - train_num - val_num

    if maximum_example_num is not None:
        train_num = min(train_num, maximum_example_num)
        val_num = min(val_num, maximum_example_num)
        test_num = min(test_num, maximum_example_num)

    # Slice boundaries are computed from the (possibly capped) sizes so the
    # three splits stay contiguous and never overlap.
    val_end = train_num + val_num
    test_end = val_end + test_num

    # Each slice is handed to `Dataset.from_dict` to rebuild a Dataset.
    train_dataset = datasets.Dataset.from_dict(dataset[:train_num])
    val_dataset = datasets.Dataset.from_dict(dataset[train_num:val_end])
    test_dataset = datasets.Dataset.from_dict(dataset[val_end:test_end])

    return datasets.DatasetDict(
        {"train": train_dataset, "val": val_dataset, "test": test_dataset}
    )

@staticmethod
def wrap_single_input(instruction: str, input: str) -> str:
    """Wrap an input string into text2text fashion to be the input of model.

    Args:
        instruction: The instruction used as a prefix to explain the task.
        input: An input string to be wrapped.

    Returns:
        A wrapped input string: a `<task 0>` tag immediately followed by
        the instruction, then the input under an `Example:` header, and
        finally an empty `Label:` header.
    """
    return f"<task 0>{instruction}\nExample:\n{input}\nLabel:\n"

def process_dataset_lists(
    self,
    instruction: str,
    dataset_list: list[datasets.Dataset],
    train_proportion: float = 0.8,
    val_proportion: float = 0.1,
    maximum_example_num: int | None = None,
) -> list[datasets.DatasetDict]:
    """Post-processes both the generated and retrieved datasets.

    This function takes in datasets generated by `DatasetGenerator`
    and retrieved by `DatasetRetriever`. It modifies these datasets
    based on a given instruction, converting all examples into a
    text-to-text format.

    Args:
        instruction: The instruction used as a prefix to explain the task.
        dataset_list: A list of datasets. It can be either generated by
            the DatasetGenerator or retrieved by the DatasetRetriever.
        train_proportion: The proportion of examples used for `train`.
        val_proportion: The proportion of examples used for `val`.
        maximum_example_num: The maximum number of examples to
            be used for `train`, `val` and `test`.

    Returns:
        list[datasets.DatasetDict]: A list of DatasetDicts, all examples
            are converted into text2text fashion.

    Raises:
        ValueError: If `train_proportion + val_proportion` is greater
            than or equal to 1.

    Note:
        The DatasetRetriever returns a DatasetDict with multiple splits.
        Any of these splits can be passed into this function.
        The remaining proportion after allocating to `train` and
        `val` will be used for the `test` set.
    """
    # Validate before touching any dataset so bad arguments fail fast.
    if train_proportion + val_proportion >= 1:
        raise ValueError(
            f"train_proportion {train_proportion} + val_proportion {val_proportion} must be less than 1."  # noqa E501
        )

    # Split every dataset into train/val/test first, then run the usual
    # DatasetDict post-processing over the resulting list.
    dataset_dicts = [
        self._split_dataset_into_dataset_dict(
            each, train_proportion, val_proportion, maximum_example_num
        )
        for each in dataset_list
    ]
    return self.process_dataset_dict(instruction, dataset_dicts)
2 changes: 1 addition & 1 deletion prompt2model/dataset_processor/mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def process_dataset_dict(
return dataset_dicts

@staticmethod
def post_process_example(
def _post_process_example(
example: dict,
instruction: str,
task_id: int,
Expand Down
2 changes: 1 addition & 1 deletion prompt2model/dataset_processor/textualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, has_encoder: bool, eos_token: str | None = None) -> None:
)

@staticmethod
def post_process_example(
def _post_process_example(
example: dict,
instruction: str,
task_id: int,
Expand Down
8 changes: 7 additions & 1 deletion prompt2model/demo_creator/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import gradio as gr
import mdtex2html

from prompt2model.dataset_processor import TextualizeProcessor
from prompt2model.model_executor import GenerationModelExecutor
from prompt2model.prompt_parser import OpenAIInstructionParser

Expand Down Expand Up @@ -35,7 +36,12 @@ def postprocess(self, y):

gr.Chatbot.postprocess = postprocess

def response(message):
def response(message: str):
    """Return the model's prediction for a single chat message.

    Messages that do not already start with the `<task 0>` tag are first
    wrapped together with the parsed instruction; messages that already
    carry the tag are passed to the model unchanged.

    Args:
        message: The raw or already-wrapped user input.

    Returns:
        The `prediction` attribute of the executor's single-prediction
        output.
    """
    if not message.startswith("<task 0>"):
        # `wrap_single_input` is a static method, so call it on the class
        # instead of constructing a processor for every message.
        message = TextualizeProcessor.wrap_single_input(
            prompt_parser.instruction, message
        )
    # Use a distinct local name so the function's own name is not shadowed.
    model_output = model_executor.make_single_prediction(message)
    return model_output.prediction
Expand Down
Loading

0 comments on commit 3b420bf

Please sign in to comment.