diff --git a/README.md b/README.md index 7dff701eb..013203067 100644 --- a/README.md +++ b/README.md @@ -94,16 +94,16 @@ In addition, the following extras are available: ### Example -To run the following example you must install `distilabel` with both `openai` extra: +To run the following example you must install `distilabel` with the `hf-inference-endpoints` extra: ```sh -pip install "distilabel[openai]" --upgrade +pip install "distilabel[hf-inference-endpoints]" --upgrade ``` Then run: ```python -from distilabel.llms import OpenAILLM +from distilabel.llms import InferenceEndpointsLLM from distilabel.pipeline import Pipeline from distilabel.steps import LoadDataFromHub from distilabel.steps.tasks import TextGeneration @@ -114,9 +114,14 @@ with Pipeline( ) as pipeline: load_dataset = LoadDataFromHub(output_mappings={"prompt": "instruction"}) - generate_with_openai = TextGeneration(llm=OpenAILLM(model="gpt-3.5-turbo")) + text_generation = TextGeneration( + llm=InferenceEndpointsLLM( + model_id="meta-llama/Meta-Llama-3.1-8B-Instruct", + tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct", + ), + ) - load_dataset >> generate_with_openai + load_dataset >> text_generation if __name__ == "__main__": distiset = pipeline.run( @@ -125,7 +130,7 @@ if __name__ == "__main__": "repo_id": "distilabel-internal-testing/instruction-dataset-mini", "split": "test", }, - generate_with_openai.name: { + text_generation.name: { "llm": { "generation_kwargs": { "temperature": 0.7, @@ -135,6 +140,7 @@ if __name__ == "__main__": }, }, ) + distiset.push_to_hub(repo_id="distilabel-example") ``` ## Badges diff --git a/docs/api/embedding/embedding_gallery.md b/docs/api/embedding/embedding_gallery.md new file mode 100644 index 000000000..3eed3ab50 --- /dev/null +++ b/docs/api/embedding/embedding_gallery.md @@ -0,0 +1,8 @@ +# Embedding Gallery +
+This section contains the existing [`Embeddings`][distilabel.embeddings] subclasses implemented in `distilabel`. +
+::: distilabel.embeddings + options: + filters: + - "!^Embeddings$" \ No newline at end of file diff --git a/docs/api/embedding/index.md b/docs/api/embedding/index.md new file mode 100644 index 000000000..675593e18 --- /dev/null +++ b/docs/api/embedding/index.md @@ -0,0 +1,7 @@ +# Embedding +
+This section contains the API reference for the `distilabel` embeddings. +
+For more information on how the [`Embeddings`][distilabel.embeddings] subclasses work and to see some examples, check the [Embedding Gallery](embedding_gallery.md).
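As a quick reference, here is a minimal, hedged sketch of how an `Embeddings` subclass is used. It relies on the `SentenceTransformerEmbeddings` class touched elsewhere in this diff and assumes its `model` argument and `load`/`encode` API; the model name is only a placeholder:

```python
from distilabel.embeddings import SentenceTransformerEmbeddings

# Placeholder model name; any sentence-transformers checkpoint should work
embeddings = SentenceTransformerEmbeddings(model="mixedbread-ai/mxbai-embed-large-v1")

# Instantiate the underlying sentence-transformers model
embeddings.load()

# `encode` maps each input string to one embedding vector (a list of floats)
results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])
```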
+ +::: distilabel.embeddings.base \ No newline at end of file diff --git a/docs/api/llm/anthropic.md b/docs/api/llm/anthropic.md deleted file mode 100644 index 400571c6e..000000000 --- a/docs/api/llm/anthropic.md +++ /dev/null @@ -1,3 +0,0 @@ -# AnthropicLLM - -::: distilabel.llms.anthropic diff --git a/docs/api/llm/anyscale.md b/docs/api/llm/anyscale.md deleted file mode 100644 index 90aa0cd6e..000000000 --- a/docs/api/llm/anyscale.md +++ /dev/null @@ -1,3 +0,0 @@ -# AnyscaleLLM - -::: distilabel.llms.anyscale diff --git a/docs/api/llm/azure.md b/docs/api/llm/azure.md deleted file mode 100644 index faa127d5b..000000000 --- a/docs/api/llm/azure.md +++ /dev/null @@ -1,3 +0,0 @@ -# AzureOpenAILLM - -::: distilabel.llms.azure diff --git a/docs/api/llm/cohere.md b/docs/api/llm/cohere.md deleted file mode 100644 index c7064b7a7..000000000 --- a/docs/api/llm/cohere.md +++ /dev/null @@ -1,3 +0,0 @@ -# CohereLLM - -::: distilabel.llms.cohere diff --git a/docs/api/llm/groq.md b/docs/api/llm/groq.md deleted file mode 100644 index 0a5264a77..000000000 --- a/docs/api/llm/groq.md +++ /dev/null @@ -1,3 +0,0 @@ -# GroqLLM - -::: distilabel.llms.groq diff --git a/docs/api/llm/huggingface.md b/docs/api/llm/huggingface.md deleted file mode 100644 index 30920255f..000000000 --- a/docs/api/llm/huggingface.md +++ /dev/null @@ -1,6 +0,0 @@ -# Hugging Face - -This section contains the reference for Hugging Face integrations: - -::: distilabel.llms.huggingface.inference_endpoints -::: distilabel.llms.huggingface.transformers diff --git a/docs/api/llm/litellm.md b/docs/api/llm/litellm.md deleted file mode 100644 index 90a4d2d63..000000000 --- a/docs/api/llm/litellm.md +++ /dev/null @@ -1,3 +0,0 @@ -# LiteLLM - -::: distilabel.llms.litellm diff --git a/docs/api/llm/llamacpp.md b/docs/api/llm/llamacpp.md deleted file mode 100644 index 02598c1a6..000000000 --- a/docs/api/llm/llamacpp.md +++ /dev/null @@ -1,3 +0,0 @@ -# LlamaCppLLM - -::: distilabel.llms.llamacpp diff --git a/docs/api/llm/llm_gallery.md b/docs/api/llm/llm_gallery.md new file mode 100644 index 000000000..ad0b1b75f --- /dev/null +++ b/docs/api/llm/llm_gallery.md @@ -0,0 +1,10 @@ +# LLM Gallery + +This section contains the existing [`LLM`][distilabel.llms] subclasses implemented in `distilabel`. 
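The gallery subclasses listed below share the same basic usage pattern: instantiate, `load()`, then `generate()` over chat-formatted inputs. A minimal sketch, reusing the `InferenceEndpointsLLM` arguments from the README changes above (availability of this model on the serverless Inference API is an assumption):

```python
from distilabel.llms import InferenceEndpointsLLM

llm = InferenceEndpointsLLM(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
)

llm.load()

# Call the model with a batch of one conversation in OpenAI-style chat format
output = llm.generate(inputs=[[{"role": "user", "content": "Hello world!"}]])
```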
+ +::: distilabel.llms + options: + filters: + - "!^LLM$" + - "!^AsyncLLM$" + - "!typing" \ No newline at end of file diff --git a/docs/api/llm/mistral.md b/docs/api/llm/mistral.md deleted file mode 100644 index 069488ead..000000000 --- a/docs/api/llm/mistral.md +++ /dev/null @@ -1,3 +0,0 @@ -# MistralLLM - -::: distilabel.llms.mistral diff --git a/docs/api/llm/ollama.md b/docs/api/llm/ollama.md deleted file mode 100644 index 25e4b662a..000000000 --- a/docs/api/llm/ollama.md +++ /dev/null @@ -1,3 +0,0 @@ -# OllamaLLM - -::: distilabel.llms.ollama diff --git a/docs/api/llm/openai.md b/docs/api/llm/openai.md deleted file mode 100644 index 381306ad5..000000000 --- a/docs/api/llm/openai.md +++ /dev/null @@ -1,3 +0,0 @@ -# OpenAILLM - -::: distilabel.llms.openai diff --git a/docs/api/llm/together.md b/docs/api/llm/together.md deleted file mode 100644 index 653016520..000000000 --- a/docs/api/llm/together.md +++ /dev/null @@ -1,3 +0,0 @@ -# TogetherLLM - -::: distilabel.llms.together diff --git a/docs/api/llm/vertexai.md b/docs/api/llm/vertexai.md deleted file mode 100644 index f8990605d..000000000 --- a/docs/api/llm/vertexai.md +++ /dev/null @@ -1,3 +0,0 @@ -# VertexAILLM - -::: distilabel.llms.vertexai diff --git a/docs/api/llm/vllm.md b/docs/api/llm/vllm.md deleted file mode 100644 index 053b8535b..000000000 --- a/docs/api/llm/vllm.md +++ /dev/null @@ -1,3 +0,0 @@ -# vLLM - -::: distilabel.llms.vllm diff --git a/docs/api/step/typing.md b/docs/api/step/typing.md new file mode 100644 index 000000000..1a86e7dac --- /dev/null +++ b/docs/api/step/typing.md @@ -0,0 +1,3 @@ +# Step Typing + +::: distilabel.steps.typing \ No newline at end of file diff --git a/docs/api/step_gallery/extra.md b/docs/api/step_gallery/extra.md index e310e45d4..3d3e6f9c5 100644 --- a/docs/api/step_gallery/extra.md +++ b/docs/api/step_gallery/extra.md @@ -1,6 +1,11 @@ # Extra -::: distilabel.steps.generators.data -::: distilabel.steps.deita -::: distilabel.steps.formatting -::: distilabel.steps.typing +::: distilabel.steps + options: + filters: + - "!Argilla" + - "!Columns" + - "!From(Disk|FileSystem)" + - "!Hub" + - "![Ss]tep" + - "!typing" diff --git a/docs/api/step_gallery/hugging_face.md b/docs/api/step_gallery/hugging_face.md index 42fb85e79..c801aca86 100644 --- a/docs/api/step_gallery/hugging_face.md +++ b/docs/api/step_gallery/hugging_face.md @@ -5,3 +5,4 @@ This section contains the existing steps integrated with `Hugging Face` so as to ::: distilabel.steps.LoadDataFromDisk ::: distilabel.steps.LoadDataFromFileSystem ::: distilabel.steps.LoadDataFromHub +::: distilabel.steps.PushToHub \ No newline at end of file diff --git a/docs/api/task_gallery/index.md b/docs/api/task/task_gallery.md similarity index 100% rename from docs/api/task_gallery/index.md rename to docs/api/task/task_gallery.md diff --git a/docs/assets/images/sections/community/compare-pull-request.PNG b/docs/assets/images/sections/community/compare-pull-request.PNG new file mode 100644 index 000000000..ace5c010b Binary files /dev/null and b/docs/assets/images/sections/community/compare-pull-request.PNG differ diff --git a/docs/assets/images/sections/community/create-branch.PNG b/docs/assets/images/sections/community/create-branch.PNG new file mode 100644 index 000000000..24dfc1975 Binary files /dev/null and b/docs/assets/images/sections/community/create-branch.PNG differ diff --git a/docs/assets/images/sections/community/edit-file.PNG b/docs/assets/images/sections/community/edit-file.PNG new file mode 100644 index 000000000..c76e535d7 Binary files 
/dev/null and b/docs/assets/images/sections/community/edit-file.PNG differ diff --git a/docs/index.md b/docs/index.md index 37cf6f9fd..ce76c9695 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,21 +38,39 @@ hide: Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers. -If you just want to get started, we recommend you check the [documentation](http://distilabel.argilla.io/). Curious, and want to know more? Keep reading! +
### Improve your AI output quality through data quality
Compute is expensive and output quality is important. We help you **focus on data quality**, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time **achieving and keeping high-quality standards for your synthetic data**. -### Take control of your data and models +### Take control of your data and models
**Ownership of data for fine-tuning your own LLMs** is not easy, but distilabel can help you get started. We integrate **AI feedback from any LLM provider out there** using one unified API. -### Improve efficiency by quickly iterating on the right research and LLMs +### Improve efficiency by quickly iterating on the right data and models
Synthesize and judge data with the **latest research papers** while ensuring **flexibility, scalability and fault tolerance**. So you can focus on improving your data and training your models. diff --git a/docs/sections/community/contributor.md b/docs/sections/community/contributor.md new file mode 100644 index 000000000..bfc5f287c --- /dev/null +++ b/docs/sections/community/contributor.md @@ -0,0 +1,159 @@ +--- +description: This is a step-by-step guide to help you contribute to the distilabel project. We are excited to have you on board! 🚀 +hide: + - footer +--- + +Thank you for investing your time in contributing to the project! Any contribution you make will be reflected in the most recent version of distilabel 🤩. + +??? Question "New to contributing in general?" + If you're a new contributor, read the [README](https://github.com/argilla-io/distilabel/blob/develop/README.md) to get an overview of the project. In addition, here are some resources to help you get started with open-source contributions: + + * **Discord**: You are welcome to join the [distilabel Discord community](http://hf.co/join/discord), where you can keep in touch with other users, contributors and the distilabel team. In the following [section](#first-contact-in-discord), you can find more information on how to get started in Discord. + * **Git**: This is a very useful tool to keep track of the changes in your files. Using the command-line interface (CLI), you can make your contributions easily. For that, you need to have it [installed and updated](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) on your computer. + * **GitHub**: It is a platform and cloud-based service that uses git and allows developers to collaborate on projects. To contribute to distilabel, you'll need to create an account. Check the [Contributor Workflow with Git and GitHub](#contributor-workflow-with-git-and-github) for more info. + * **Developer Documentation**: To collaborate, you'll need to set up an efficient environment. Check the [Installation](../getting_started/installation.md) guide to know how to do it. + +## First Contact in Discord + +Discord is a handy tool for more casual conversations and to answer day-to-day questions. As part of Hugging Face, we have set up some distilabel channels on the server. Click [here](http://hf.co/join/discord) to join the Hugging Face Discord community effortlessly. + +When part of the Hugging Face Discord, you can select "Channels & roles" and select "Argilla" along with any of the other groups that are interesting to you. "Argilla" will cover anything about argilla and distilabel. You can join the following channels: + +* **#argilla-distilabel-announcements**: 📣 Stay up-to-date. +* **#argilla-distilabel-general**: 💬 For general discussions. +* **#argilla-distilabel-help**: 🙋‍♀️ Need assistance? We're always here to help. Select the appropriate label (argilla or distilabel) for your issue and post it. + +So now there is only one thing left to do: introduce yourself and talk to the community. You'll always be welcome! 🤗👋 + + +## Contributor Workflow with Git and GitHub + +If you're working with distilabel and suddenly a new idea comes to your mind or you find an issue that can be improved, it's time to actively participate and contribute to the project! + +### Report an issue + +If you spot a problem, [search if an issue already exists](https://github.com/argilla-io/distilabel/issues?q=is%3Aissue); you can use the `Label` filter.
If that is the case, participate in the conversation. If it does not exist, create an issue by clicking on `New Issue`. This will show various templates; choose the one that best suits your issue. Once you choose one, you will need to fill it in following the guidelines. Try to be as clear as possible. In addition, you can assign yourself to the issue and add or choose the right labels. Finally, click on `Submit new issue`. + + +### Work with a fork + +#### Fork the distilabel repository + +After having reported the issue, you can start working on it. For that, you will need to create a fork of the project. To do that, click on the `Fork` button. Now, fill in the information. Remember to uncheck the `Copy develop branch only` option if you are going to work in or from another branch (for instance, to fix documentation, the `main` branch is used). Then, click on `Create fork`. + +You will be redirected to your fork. You can see that you are in your fork because the name of the repository will be your `username/distilabel`, and it will indicate `forked from argilla-io/distilabel`. + + +#### Clone your forked repository + +In order to make the required adjustments, clone the forked repository to your local machine. Choose the destination folder and run the following command: + +```sh +git clone https://github.com/[your-github-username]/distilabel.git +cd distilabel +``` + +To keep your fork's `main`/`develop` branch up to date with our repo, add it as an upstream remote. + +```sh +git remote add upstream https://github.com/argilla-io/distilabel.git +``` + + +### Create a new branch + +For each issue you're addressing, it's advisable to create a new branch. GitHub offers a straightforward method to streamline this process. + +> ⚠️ Never work directly on the `main` or `develop` branch. Always create a new branch for your changes. + +Navigate to your issue, and on the right column, select `Create a branch`. + +![Create a branch](../../assets/images/sections/community/create-branch.PNG) + +After the new window pops up, the branch will be named after the issue and include a prefix such as `feature/`, `bug/`, or `docs/` to facilitate quick recognition of the issue type. In the `Repository destination`, pick your fork (`[your-github-username]/distilabel`), and then select `Change branch source` to specify the source branch for creating the new one. Complete the process by clicking `Create branch`. + +> 🤔 Remember that the `main` branch is only used to work with the documentation. For any other changes, use the `develop` branch. + +Now, locally, change to the new branch you just created. + +```sh +git fetch origin +git checkout [branch-name] +``` + +### Make changes and push them + +Make the changes you want in your local repository, and test that everything works and you are following the guidelines. + +Once you have finished, you can check the status of your repository and synchronize with the upstream repo with the following commands: + +```sh +# Check the status of your repository +git status + +# Synchronize with the upstream repo +git checkout [branch-name] +git rebase [default-branch] +``` + +If everything is right, we need to commit and push the changes to your fork.
For that, run the following commands: + +```sh +# Add the changes to the staging area +git add filename + +# Commit the changes by writing a proper message +git commit -m "commit-message" + +# Push the changes to your fork +git push origin [branch-name] +``` + +When pushing, you will be asked to enter your GitHub login credentials. Once the push is complete, all local commits will be on your GitHub repository. + + +### Create a pull request + +Come back to GitHub, navigate to the original repository where you created your fork, and click on `Compare & pull request`. + +![compare-and-pr](../../assets/images/sections/community/compare-pull-request.PNG) + +First, click on `compare across forks` and select the right repositories and branches. + +> In the base repository, keep in mind that you should select either `main` or `develop` based on the modifications made. In the head repository, indicate your forked repository and the branch corresponding to the issue. + +Then, fill in the pull request template. You should add a prefix to the PR name, as we did with the branch above. If you are working on a new feature, you can name your PR as `feat: TITLE`. If your PR consists of a solution for a bug, you can name your PR as `bug: TITLE`. And, if your work is for improving the documentation, you can name your PR as `docs: TITLE`. + +In addition, on the right side, you can select a reviewer (for instance, if you discussed the issue with a member of the team) and assign the pull request to yourself. It is highly advisable to add labels to the PR as well. You can do this in the `Labels` section on the right side of the screen. For instance, if you are addressing a bug, add the `bug` label, or if the PR is related to the documentation, add the `documentation` label. This way, PRs can be easily filtered. + +Finally, fill in the template carefully and follow the guidelines. Remember to link the original issue and enable the checkbox to allow maintainer edits so the branch can be updated for a merge. Then, click on `Create pull request`. + + +### Review your pull request + +Once you submit your PR, a team member will review your proposal. We may ask questions, request additional information, or ask for changes to be made before a PR can be merged, either using [suggested changes](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/incorporating-feedback-in-your-pull-request) or pull request comments. + +You can apply the changes directly through the UI (check the files changed and click on the right-corner three dots; see image below) or from your fork, and then commit them to your branch. The PR will be updated automatically, and the suggestions will appear as `outdated`. + +![edit-file-from-UI](../../assets/images/sections/community/edit-file.PNG) + +> If you run into any merge issues, check out this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. + + +### Your PR is merged! + +Congratulations 🎉🎊 We thank you 🤩 + +Once your PR is merged, your contributions will be publicly visible on the [distilabel GitHub](https://github.com/argilla-io/distilabel#contributors). + +Additionally, we will include your changes in the next release based on our [development branch](https://github.com/argilla-io/argilla/tree/develop). + +## Additional resources + +Here are some helpful resources for your reference.
+ +* [Configuring Discord](https://support.discord.com/hc/en-us/categories/115000217151), a guide to learning how to get started with Discord. +* [Pro Git](https://git-scm.com/book/en/v2), a book to learn Git. +* [Git in VSCode](https://code.visualstudio.com/docs/sourcecontrol/overview), a guide to learning how to easily use Git in VSCode. +* [GitHub Skills](https://skills.github.com/), an interactive course for learning GitHub. \ No newline at end of file diff --git a/docs/sections/getting_started/faq.md b/docs/sections/getting_started/faq.md index 27768a3c6..16ad84075 100644 --- a/docs/sections/getting_started/faq.md +++ b/docs/sections/getting_started/faq.md @@ -7,20 +7,20 @@ hide: # Frequent Asked Questions (FAQ) ??? faq "How can I rename the columns in a batch?" - Every [`Step`][distilabel.steps.base.Step] has both `input_mappings` and `output_mappings` attributes, that can be used to rename the columns in each batch. + Every [`Step`][distilabel.steps.base.Step] has both `input_mappings` and `output_mappings` attributes that can be used to rename the columns in each batch. - But `input_mappings` will only map, meaning that if you have a batch with the column `A` and you want to rename to `B`, you should use `input_mappings={"A": "B"}`, but that will only be applied to that specific [`Step`][distilabel.steps.base.Step] meaning that the next step in the pipeline will still have the column `A` instead of `B`. + But `input_mappings` will only map, meaning that if you have a batch with the column `A` and you want to rename it to `B`, you should use `input_mappings={"A": "B"}`, but that will only be applied to that specific [`Step`][distilabel.steps.base.Step] meaning that the next step in the pipeline will still have the column `A` instead of `B`. While `output_mappings` will indeed apply the rename, meaning that if the [`Step`][distilabel.steps.base.Step] produces the column `A` and you want to rename to `B`, you should use `output_mappings={"A": "B"}`, and that will be applied to the next [`Step`][distilabel.steps.base.Step] in the pipeline. ??? faq "Will the API Keys be exposed when sharing the pipeline?" No, those will be masked out using `pydantic.SecretStr`, meaning that those won't be exposed when sharing the pipeline. - This also means that if you want to re-run your own pipeline and the API keys have not been provided via environment variable but either via attribute or runtime parameter, you will need to provide them again. + This also means that if you want to re-run your own pipeline and the API keys have not been provided via environment variable but either via an attribute or runtime parameter, you will need to provide them again. ??? faq "Does it work for Windows?" - Yes, but you may need to set the `multiprocessing` context in advance, to ensure that the `spawn` method is used, since the default method `fork` is not available on Windows. + Yes, but you may need to set the `multiprocessing` context in advance to ensure that the `spawn` method is used since the default method `fork` is not available on Windows. ```python import multiprocessing as mp @@ -29,16 +29,16 @@ hide: ``` ??? faq "Will the custom Steps / Tasks / LLMs be serialized too?" - No, at the moment only the references to the classes within the `distilabel` library will be serialized, meaning that if you define a custom class used within the pipeline, the serialization won't break, but the deserialize will fail since the class won't be available, unless used from the same file. 
+ No, at the moment, only the references to the classes within the `distilabel` library will be serialized, meaning that if you define a custom class used within the pipeline, the serialization won't break, but the deserialization will fail since the class won't be available unless used from the same file. ??? faq "What happens if `Pipeline.run` fails? Do I lose all the data?" - No, indeed we're using a cache mechanism to store all the intermediate results in disk, so that if a [`Step`][distilabel.steps.base.Step] fails, the pipeline can be re-run from that point without losing the data, only if nothing is changed in the `Pipeline`. + No, indeed, we're using a cache mechanism to store all the intermediate results on disk, so if a [`Step`][distilabel.steps.base.Step] fails, the pipeline can be re-run from that point without losing the data, but only if nothing is changed in the `Pipeline`. All the data will be stored in `.cache/distilabel`, but the only data that will persist at the end of the `Pipeline.run` execution is the one from the leaf step/s, so bear that in mind. For more information on the caching mechanism in `distilabel`, you can check the [Learn - Advanced - Caching](../how_to_guides/advanced/caching.md) section. - Also note that when running a [`Step`][distilabel.steps.base.Step] or a [`Task`][distilabel.steps.tasks.Task] standalone, the cache mechanism won't be used, so if you want to use that, you should use the `Pipeline` context manager. + Also, note that when running a [`Step`][distilabel.steps.base.Step] or a [`Task`][distilabel.steps.tasks.Task] standalone, the cache mechanism won't be used, so if you want to use that, you should use the `Pipeline` context manager. ??? faq "How can I use the same `LLM` across several tasks without having to load it several times?" You can serve the LLM using a solution like TGI or vLLM, and then connect to it using an `AsyncLLM` client like `InferenceEndpointsLLM` or `OpenAILLM`. Please refer to [Serving LLMs guide](../how_to_guides/advanced/serving_an_llm_for_reuse.md) for more information. diff --git a/docs/sections/getting_started/installation.md b/docs/sections/getting_started/installation.md index 804aa8de7..a16924172 100644 --- a/docs/sections/getting_started/installation.md +++ b/docs/sections/getting_started/installation.md @@ -6,9 +6,6 @@ hide: # Installation -!!! NOTE - Since `distilabel` v1.0.0 was recently released, we refactored most of the stuff, so the installation below only applies to `distilabel` v1.0.0 and above. - You will need to have at least Python 3.9 or higher, up to Python 3.12, since support for the latter is still a work in progress. To install the latest release of the package from PyPI you can use the following command: @@ -42,6 +39,8 @@ Additionally, as part of `distilabel` some extra dependencies are available, mai - `hf-transformers`: for using models available in [transformers](https://github.com/huggingface/transformers) package via the `TransformersLLM` integration. +- `instructor`: for using structured generation of LLMs with [Instructor](https://github.com/jxnl/instructor/). + - `litellm`: for using [`LiteLLM`](https://github.com/BerriAI/litellm) to call any LLM using OpenAI format via the `LiteLLM` integration. - `llama-cpp`: for using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) Python bindings for `llama.cpp` via the `LlamaCppLLM` integration.
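Tying back to the FAQ above about reusing a single served `LLM` across tasks: a minimal sketch of pointing an OpenAI-compatible client at a locally served model. The endpoint URL, model name, and `api_key` value are placeholders, and it assumes a server such as vLLM is already running and exposing the OpenAI API:

```python
from distilabel.llms import OpenAILLM

# Placeholder endpoint; assumes an OpenAI-compatible server (e.g. vLLM)
# is already serving the model at this URL.
llm = OpenAILLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",  # placeholder; many local servers accept any key
)

llm.load()

output = llm.generate(inputs=[[{"role": "user", "content": "Hello world!"}]])
```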
@@ -52,15 +51,23 @@ Additionally, as part of `distilabel` some extra dependencies are available, mai - `openai`: for using [OpenAI API](https://openai.com/blog/openai-api) models via the `OpenAILLM` integration, or the rest of the integrations based on OpenAI and relying on its client as `AnyscaleLLM`, `AzureOpenAILLM`, and `TogetherLLM`. +- `outlines`: for using structured generation of LLMs with [outlines](https://github.com/outlines-dev/outlines). + +- `ray`: for scaling and distributing a pipeline with [Ray](https://github.com/ray-project/ray). + - `vertexai`: for using [Google Vertex AI](https://cloud.google.com/vertex-ai) proprietary models via the `VertexAILLM` integration. - `vllm`: for using [vllm](https://github.com/vllm-project/vllm) serving engine via the `vLLM` integration. +- `sentence-transformers`: for generating sentence embeddings using [sentence-transformers](https://github.com/UKPLab/sentence-transformers). + +- `faiss-cpu` and `faiss-gpu`: for nearest neighbour search over sentence embeddings using [faiss](https://github.com/facebookresearch/faiss). + ## Recommendations / Notes The [`mistralai`](https://github.com/mistralai/client-python) dependency requires Python 3.9 or higher, so if you're willing to use the `distilabel.llms.MistralLLM` implementation, you will need to have Python 3.9 or higher. -In some cases like [`transformers`](https://github.com/huggingface/transformers) and [`vllm`](https://github.com/vllm-project/vllm) the installation of [`flash-attn`](https://github.com/Dao-AILab/flash-attention) is recommended if you are using a GPU accelerator, since it will speed up the inference process, but the installation needs to be done separately, as it's not included in the `distilabel` dependencies. +In some cases like [`transformers`](https://github.com/huggingface/transformers) and [`vllm`](https://github.com/vllm-project/vllm), the installation of [`flash-attn`](https://github.com/Dao-AILab/flash-attention) is recommended if you are using a GPU accelerator since it will speed up the inference process, but the installation needs to be done separately, as it's not included in the `distilabel` dependencies. ```sh pip install flash-attn --no-build-isolation diff --git a/docs/sections/getting_started/quickstart.md b/docs/sections/getting_started/quickstart.md index 4fd7de607..bf3780739 100644 --- a/docs/sections/getting_started/quickstart.md +++ b/docs/sections/getting_started/quickstart.md @@ -4,14 +4,20 @@ hide: - toc --- + + + + # Quickstart To start off, `distilabel` is a framework for building pipelines for generating synthetic data using LLMs, that defines a [`Pipeline`][distilabel.pipeline.Pipeline] which orchestrates the execution of the [`Step`][distilabel.steps.base.Step] subclasses, and those will be connected as nodes in a Direct Acyclic Graph (DAG). -That being said, in this guide we will walk you through the process of creating a simple pipeline that uses the [`OpenAILLM`][distilabel.llms.OpenAILLM] class to generate text. The [`Pipeline`][distilabel.pipeline.Pipeline] will load a dataset that contains a column named `prompt` from the Hugging Face Hub via the step [`LoadDataFromHub`][distilabel.steps.LoadDataFromHub] and then use the [`OpenAILLM`][distilabel.llms.OpenAILLM] class to generate text based on the dataset using the [`TextGeneration`][distilabel.steps.tasks.TextGeneration] task.
+In this guide we will walk you through the process of creating a simple pipeline that uses the [`InferenceEndpointsLLM`][distilabel.llms.InferenceEndpointsLLM] class to generate text. The [`Pipeline`][distilabel.pipeline.Pipeline] will load a dataset that contains a column named `prompt` from the Hugging Face Hub via the step [`LoadDataFromHub`][distilabel.steps.LoadDataFromHub] and then use the [`InferenceEndpointsLLM`][distilabel.llms.InferenceEndpointsLLM] class to generate text based on the dataset using the [`TextGeneration`][distilabel.steps.tasks.TextGeneration] task. + +> You can check the available models in the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) and filter by `Inference status`. ```python -from distilabel.llms import OpenAILLM +from distilabel.llms import InferenceEndpointsLLM from distilabel.pipeline import Pipeline from distilabel.steps import LoadDataFromHub from distilabel.steps.tasks import TextGeneration @@ -21,13 +27,14 @@ with Pipeline( # (1) description="A simple text generation pipeline", ) as pipeline: # (2) load_dataset = LoadDataFromHub( # (3) - name="load_dataset", output_mappings={"prompt": "instruction"}, ) text_generation = TextGeneration( # (4) - name="text_generation", - llm=OpenAILLM(model="gpt-3.5-turbo"), # (5) + llm=InferenceEndpointsLLM( + model_id="meta-llama/Meta-Llama-3.1-8B-Instruct", + tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct", + ), # (5) ) load_dataset >> text_generation # (6) @@ -56,11 +63,11 @@ if __name__ == "__main__": 2. We are using the [`Pipeline`][distilabel.pipeline.Pipeline] context manager, meaning that every [`Step`][distilabel.steps.base.Step] subclass that is defined within the context manager will be added to the pipeline automatically. -3. We define a [`LoadDataFromHub`][distilabel.steps.LoadDataFromHub] step named `load_dataset` that will load a dataset from the Hugging Face Hub, as provided via runtime parameters in the `pipeline.run` method below, but it can also be defined within the class instance via the arg `repo_id=...`. This step will basically produce output batches with the rows from the dataset, and the column `prompt` will be mapped to the `instruction` field. +3. We define a [`LoadDataFromHub`][distilabel.steps.LoadDataFromHub] step named `load_dataset` that will load a dataset from the Hugging Face Hub, as provided via runtime parameters in the `pipeline.run` method below, but it can also be defined within the class instance via the arg `repo_id=...`. This step will produce output batches with the rows from the dataset, and the column `prompt` will be mapped to the `instruction` field. -4. We define a [`TextGeneration`][distilabel.steps.tasks.TextGeneration] task named `text_generation` that will generate text based on the `instruction` field from the dataset. This task will use the [`OpenAILLM`][distilabel.llms.OpenAILLM] class with the model `gpt-3.5-turbo`. +4. We define a [`TextGeneration`][distilabel.steps.tasks.TextGeneration] task named `text_generation` that will generate text based on the `instruction` field from the dataset. This task will use the [`InferenceEndpointsLLM`][distilabel.llms.InferenceEndpointsLLM] class with the model `Meta-Llama-3.1-8B-Instruct`. -5. We define the [`OpenAILLM`][distilabel.llms.OpenAILLM] class with the model `gpt-3.5-turbo` that will be used by the [`TextGeneration`][distilabel.steps.tasks.TextGeneration] task. 
In this case, since the [`OpenAILLM`][distilabel.llms.OpenAILLM] is used, we assume that the `OPENAI_API_KEY` environment variable is set, and the OpenAI API will be used to generate the text. +5. We define the [`InferenceEndpointsLLM`][distilabel.llms.InferenceEndpointsLLM] class with the model `Meta-Llama-3.1-8B-Instruct` that will be used by the [`TextGeneration`][distilabel.steps.tasks.TextGeneration] task. In this case, since the [`InferenceEndpointsLLM`][distilabel.llms.InferenceEndpointsLLM] is used, we assume that the `HF_TOKEN` environment variable is set. 6. We connect the `load_dataset` step to the `text_generation` task using the `rshift` operator, meaning that the output from the `load_dataset` step will be used as input for the `text_generation` task. diff --git a/docs/sections/pipeline_samples/examples/benchmarking_with_distilabel.md b/docs/sections/pipeline_samples/examples/benchmarking_with_distilabel.md index f1f18b415..6d5e594aa 100644 --- a/docs/sections/pipeline_samples/examples/benchmarking_with_distilabel.md +++ b/docs/sections/pipeline_samples/examples/benchmarking_with_distilabel.md @@ -1,7 +1,7 @@ --- hide: toc --- -# [Benchmarking with `distilabel`: Arena Hard](#benchmarking-with-distilabel-arena-hard) +# Benchmarking with `distilabel`: Arena Hard Benchmark LLMs with `distilabel`: reproducing the Arena Hard benchmark. diff --git a/docs/sections/pipeline_samples/examples/llama_cpp_with_outlines.md b/docs/sections/pipeline_samples/examples/llama_cpp_with_outlines.md index 38ac6bb6f..bbd9c97ef 100644 --- a/docs/sections/pipeline_samples/examples/llama_cpp_with_outlines.md +++ b/docs/sections/pipeline_samples/examples/llama_cpp_with_outlines.md @@ -1,7 +1,7 @@ --- hide: toc --- -# [llama.cpp with `outlines`](#llamacpp-with-outlines) +# llama.cpp with `outlines` Generate RPG characters following a `pydantic.BaseModel` with `outlines` in `distilabel`. diff --git a/docs/sections/pipeline_samples/examples/mistralai_with_instructor.md b/docs/sections/pipeline_samples/examples/mistralai_with_instructor.md index 3b39d51e3..9d862a005 100644 --- a/docs/sections/pipeline_samples/examples/mistralai_with_instructor.md +++ b/docs/sections/pipeline_samples/examples/mistralai_with_instructor.md @@ -1,7 +1,7 @@ --- hide: toc --- -# [MistralAI with `instructor`](#mistralai-with-instructor) +# MistralAI with `instructor` Answer instructions with knowledge graphs defined as `pydantic.BaseModel` objects using `instructor` in `distilabel`. diff --git a/docs/sections/pipeline_samples/index.md b/docs/sections/pipeline_samples/index.md index 800d4342b..d983f1682 100644 --- a/docs/sections/pipeline_samples/index.md +++ b/docs/sections/pipeline_samples/index.md @@ -95,7 +95,7 @@ hide: toc Learn about reproducing the Arena Hard benchmark with disitlabel. - [:octicons-arrow-right-24: Example](benchmarking_with_distilabel.md) + [:octicons-arrow-right-24: Example](examples/benchmarking_with_distilabel.md) - __llama.cpp with outlines__ @@ -103,7 +103,7 @@ hide: toc Learn about generating RPG characters following a pydantic.BaseModel with outlines in distilabel. - [:octicons-arrow-right-24: Example](llama_cpp_with_outlines.md) + [:octicons-arrow-right-24: Example](examples/llama_cpp_with_outlines.md) - __MistralAI with instructor__ @@ -111,7 +111,7 @@ hide: toc Learn about answering instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel. 
- [:octicons-arrow-right-24: Example](papers/prometheus.md) + [:octicons-arrow-right-24: Example](examples/mistralai_with_instructor.md) diff --git a/mkdocs.yml b/mkdocs.yml index 41df47a0d..705f27996 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -136,19 +136,29 @@ plugins: - mkdocstrings: handlers: python: - selection: - inherited_members: true # Allow looking up inherited methods + setup_commands: + - import sys; sys.path.insert(0, 'src') # API references are built from source options: - show_protected_members: true - show_private_members: true - rendering: - show_root_heading: true # actually display anything at all... - # show_root_full_path: true # display "diffrax.asdf" not just "asdf" - show_if_no_docstring: true - show_signature_annotations: true - show_source: false # don't include source code + show_inheritance_diagram: false + show_source: true # include source code + # Headings + heading_level: 3 + show_root_heading: true # show the python path of the class + show_root_toc_entry: true # show the toc entry for the root class + show_root_full_path: false # display "distilabel.asdf" not just "asdf" + show_object_full_path: false # display "distilabel.asdf" not just "asdf" + show_symbol_type_heading: true + show_symbol_type_toc: true + # Members + inherited_members: false # don't look up inherited methods + members_order: source # order methods according to their order of definition in the source code, not alphabetical order - heading_level: 4 + show_labels: true + # Docstring + docstring_style: google # more info: https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html + show_if_no_docstring: false + # Signature + separate_signature: false + show_signature_annotations: false - social - mknotebooks - distilabel/components-gallery: @@ -212,28 +222,18 @@ nav: - Hugging Face: "api/step_gallery/hugging_face.md" - Columns: "api/step_gallery/columns.md" - Extra: "api/step_gallery/extra.md" + - Typing: "api/step/typing.md" - Task: - "api/task/index.md" - GeneratorTask: "api/task/generator_task.md" - - Task Gallery: "api/task_gallery/index.md" + - Task Gallery: "api/task/task_gallery.md" - Typing: "api/task/typing.md" - LLM: - "api/llm/index.md" - - LLM Gallery: - - Anthropic: "api/llm/anthropic.md" - - Anyscale: "api/llm/anyscale.md" - - Azure (via OpenAI): "api/llm/azure.md" - - Cohere: "api/llm/cohere.md" - - Groq: "api/llm/groq.md" - - Hugging Face: "api/llm/huggingface.md" - - LiteLLM: "api/llm/litellm.md" - - llama.cpp: "api/llm/llamacpp.md" - - Mistral: "api/llm/mistral.md" - - Ollama: "api/llm/ollama.md" - - OpenAI: "api/llm/openai.md" - - Together AI: "api/llm/together.md" - - Google Vertex AI: "api/llm/vertexai.md" - - vLLM: "api/llm/vllm.md" + - LLM Gallery: "api/llm/llm_gallery.md" + - Embedding: + - "api/embedding/index.md" + - Embedding Gallery: "api/embedding/embedding_gallery.md" - Pipeline: - "api/pipeline/index.md" - Routing Batch Function: "api/pipeline/routing_batch_function.md" @@ -247,4 +247,5 @@ nav: - CLI: "api/cli.md" - Community: - sections/community/index.md + - How to contribute?: sections/community/contributor.md - Issue dashboard: sections/community/popular_issues.md diff --git a/pyproject.toml b/pyproject.toml index 820620375..f910ba778 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ vllm = [ ] sentence-transformers = ["sentence-transformers >= 3.0.0"] faiss-cpu = ["faiss-cpu >= 1.8.0"] -faiss-gpu = ["faiss-cpu >= 1.7.2"] +faiss-gpu = ["faiss-gpu >= 1.7.2"] [project.urls] Documentation = "https://distilabel.argilla.io/" diff
--git a/src/distilabel/distiset.py b/src/distilabel/distiset.py index 2263bc155..b6b31f7a5 100644 --- a/src/distilabel/distiset.py +++ b/src/distilabel/distiset.py @@ -379,17 +379,17 @@ def save_to_disk( Examples: ```python # Save your distiset in a local folder: - >>> distiset.save_to_disk(distiset_path="my-distiset") + distiset.save_to_disk(distiset_path="my-distiset") # Save your distiset in a remote storage: - >>> storage_options = { - ... "key": os.environ["S3_ACCESS_KEY"], - ... "secret": os.environ["S3_SECRET_KEY"], - ... "client_kwargs": { - ... "endpoint_url": os.environ["S3_ENDPOINT_URL"], - ... "region_name": os.environ["S3_REGION"], - ... }, - ... } - >>> distiset.save_to_disk(distiset_path="my-distiset", storage_options=storage_options) + storage_options = { + "key": os.environ["S3_ACCESS_KEY"], + "secret": os.environ["S3_SECRET_KEY"], + "client_kwargs": { + "endpoint_url": os.environ["S3_ENDPOINT_URL"], + "region_name": os.environ["S3_REGION"], + }, + } + distiset.save_to_disk(distiset_path="my-distiset", storage_options=storage_options) ``` """ distiset_path = str(distiset_path) @@ -606,10 +606,9 @@ def create_distiset( # noqa: C901 correspond to different configurations of the dataset. Examples: - ```python - >>> from pathlib import Path - >>> distiset = create_distiset(Path.home() / ".cache/distilabel/pipelines/path-to-pipe-hashname") + from pathlib import Path + distiset = create_distiset(Path.home() / ".cache/distilabel/pipelines/path-to-pipe-hashname") ``` """ from distilabel.constants import DISTILABEL_METADATA_KEY diff --git a/src/distilabel/embeddings/sentence_transformers.py b/src/distilabel/embeddings/sentence_transformers.py index 08b3465ad..85baea3de 100644 --- a/src/distilabel/embeddings/sentence_transformers.py +++ b/src/distilabel/embeddings/sentence_transformers.py @@ -55,7 +55,6 @@ class SentenceTransformerEmbeddings(Embeddings, CudaDevicePlacementMixin): of 1. Defaults to `None`. Examples: - Generating sentence embeddings: ```python diff --git a/src/distilabel/embeddings/vllm.py b/src/distilabel/embeddings/vllm.py index 8e0c7caed..cbbadd69a 100644 --- a/src/distilabel/embeddings/vllm.py +++ b/src/distilabel/embeddings/vllm.py @@ -46,7 +46,6 @@ class vLLMEmbeddings(Embeddings, CudaDevicePlacementMixin): - [Offline inference embeddings](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference_embedding.html) Examples: - Generating sentence embeddings: ```python diff --git a/src/distilabel/errors.py b/src/distilabel/errors.py index 4a1bad0cf..41660a637 100644 --- a/src/distilabel/errors.py +++ b/src/distilabel/errors.py @@ -28,7 +28,6 @@ class DistilabelError: page: An optional error code from PydanticErrorCodes enum. Examples: - ```python raise DistilabelUserError("This is an error message.") This is an error message. diff --git a/src/distilabel/llms/anthropic.py b/src/distilabel/llms/anthropic.py index 843b14b21..05e01c233 100644 --- a/src/distilabel/llms/anthropic.py +++ b/src/distilabel/llms/anthropic.py @@ -75,7 +75,6 @@ class AnthropicLLM(AsyncLLM): Defaults to `6`. Examples: - Generate text: ```python diff --git a/src/distilabel/llms/anyscale.py b/src/distilabel/llms/anyscale.py index 54b777b8a..d7db41060 100644 --- a/src/distilabel/llms/anyscale.py +++ b/src/distilabel/llms/anyscale.py @@ -40,7 +40,6 @@ class AnyscaleLLM(OpenAILLM): It is meant to be used internally. 
Examples: - Generate text: ```python diff --git a/src/distilabel/llms/azure.py b/src/distilabel/llms/azure.py index 80c080757..8d83a8251 100644 --- a/src/distilabel/llms/azure.py +++ b/src/distilabel/llms/azure.py @@ -48,7 +48,6 @@ class AzureOpenAILLM(OpenAILLM): `:material-microsoft-azure:` Examples: - Generate text: ```python diff --git a/src/distilabel/llms/cohere.py b/src/distilabel/llms/cohere.py index a1295a9ba..e28f62fed 100644 --- a/src/distilabel/llms/cohere.py +++ b/src/distilabel/llms/cohere.py @@ -70,7 +70,6 @@ class CohereLLM(AsyncLLM): `"distilabel"`. Examples: - Generate text: ```python diff --git a/src/distilabel/llms/groq.py b/src/distilabel/llms/groq.py index 3a362951e..009f0a07e 100644 --- a/src/distilabel/llms/groq.py +++ b/src/distilabel/llms/groq.py @@ -63,7 +63,6 @@ class GroqLLM(AsyncLLM): to `120`. Examples: - Generate text: ```python diff --git a/src/distilabel/llms/huggingface/inference_endpoints.py b/src/distilabel/llms/huggingface/inference_endpoints.py index edfc50824..5e8294784 100644 --- a/src/distilabel/llms/huggingface/inference_endpoints.py +++ b/src/distilabel/llms/huggingface/inference_endpoints.py @@ -74,7 +74,6 @@ class InferenceEndpointsLLM(AsyncLLM, MagpieChatTemplateMixin): `:hugging:` Examples: - Free serverless Inference API: ```python diff --git a/src/distilabel/llms/huggingface/transformers.py b/src/distilabel/llms/huggingface/transformers.py index d3d16f6ac..e939ddfca 100644 --- a/src/distilabel/llms/huggingface/transformers.py +++ b/src/distilabel/llms/huggingface/transformers.py @@ -76,7 +76,6 @@ class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin): `:hugging:` Examples: - Generate text: ```python diff --git a/src/distilabel/llms/litellm.py b/src/distilabel/llms/litellm.py index 71a73365b..43c5975e9 100644 --- a/src/distilabel/llms/litellm.py +++ b/src/distilabel/llms/litellm.py @@ -41,7 +41,6 @@ class LiteLLM(AsyncLLM): - `verbose`: whether to log the LiteLLM client's logs. Defaults to `False`. Examples: - Generate text: ```python diff --git a/src/distilabel/llms/llamacpp.py b/src/distilabel/llms/llamacpp.py index f66eb214b..5b310d2b6 100644 --- a/src/distilabel/llms/llamacpp.py +++ b/src/distilabel/llms/llamacpp.py @@ -59,7 +59,6 @@ class LlamaCppLLM(LLM): - [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) Examples: - Generate text: ```python diff --git a/src/distilabel/llms/mistral.py b/src/distilabel/llms/mistral.py index 73b5fc13a..d1457209e 100644 --- a/src/distilabel/llms/mistral.py +++ b/src/distilabel/llms/mistral.py @@ -62,7 +62,6 @@ class MistralLLM(AsyncLLM): Defaults to `64`. Examples: - Generate text: ```python diff --git a/src/distilabel/llms/moa.py b/src/distilabel/llms/moa.py index d139da87e..be32199ad 100644 --- a/src/distilabel/llms/moa.py +++ b/src/distilabel/llms/moa.py @@ -61,7 +61,6 @@ class MixtureOfAgentsLLM(AsyncLLM): - [Mixture-of-Agents Enhances Large Language Model Capabilities](https://arxiv.org/abs/2406.04692) Examples: - Generate text: ```python diff --git a/src/distilabel/llms/ollama.py b/src/distilabel/llms/ollama.py index bd664b30d..fc3abd605 100644 --- a/src/distilabel/llms/ollama.py +++ b/src/distilabel/llms/ollama.py @@ -79,6 +79,20 @@ class OllamaLLM(AsyncLLM): Runtime parameters: - `host`: the Ollama server host. - `timeout`: the client timeout for the Ollama API. Defaults to `120`. 
+ + Examples: + Generate text: + + ```python + from distilabel.llms import OllamaLLM + + llm = OllamaLLM(model="llama3") + + llm.load() + + # Call the model + output = llm.generate(inputs=[[{"role": "user", "content": "Hello world!"}]]) + ``` """ model: str diff --git a/src/distilabel/llms/openai.py b/src/distilabel/llms/openai.py index 39644e281..91a3c165c 100644 --- a/src/distilabel/llms/openai.py +++ b/src/distilabel/llms/openai.py @@ -62,7 +62,6 @@ class OpenAILLM(AsyncLLM): `:simple-openai:` Examples: - Generate text: ```python diff --git a/src/distilabel/llms/together.py b/src/distilabel/llms/together.py index aa63ae1ad..d4ba0eb47 100644 --- a/src/distilabel/llms/together.py +++ b/src/distilabel/llms/together.py @@ -39,7 +39,6 @@ class TogetherLLM(OpenAILLM): is meant to be used internally. Examples: - Generate text: ```python diff --git a/src/distilabel/llms/vertexai.py b/src/distilabel/llms/vertexai.py index f89a7b091..0c49fa393 100644 --- a/src/distilabel/llms/vertexai.py +++ b/src/distilabel/llms/vertexai.py @@ -43,6 +43,20 @@ class VertexAILLM(AsyncLLM): Icon: `:simple-googlecloud:` + + Examples: + Generate text: + + ```python + from distilabel.llms import VertexAILLM + + llm = VertexAILLM(model="gemini-1.5-pro") + + llm.load() + + # Call the model + output = llm.generate(inputs=[[{"role": "user", "content": "Hello world!"}]]) + ``` """ model: str diff --git a/src/distilabel/llms/vllm.py b/src/distilabel/llms/vllm.py index b45e72b93..1ba6ab370 100644 --- a/src/distilabel/llms/vllm.py +++ b/src/distilabel/llms/vllm.py @@ -92,7 +92,6 @@ class vLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin): the `LLM` class of `vllm` library. Examples: - Generate text: ```python @@ -415,7 +414,6 @@ class ClientvLLM(OpenAILLM, MagpieChatTemplateMixin): created to comunicate with the `vLLM` server. Defaults to `None`. Examples: - Generate text: ```python diff --git a/src/distilabel/steps/argilla/preference.py b/src/distilabel/steps/argilla/preference.py index 0843f2f55..210cca208 100644 --- a/src/distilabel/steps/argilla/preference.py +++ b/src/distilabel/steps/argilla/preference.py @@ -71,7 +71,6 @@ class PreferenceToArgilla(ArgillaBase): generated rationales won't be pushed to Argilla. Examples: - Push a preference dataset to an Argilla instance: ```python diff --git a/src/distilabel/steps/argilla/text_generation.py b/src/distilabel/steps/argilla/text_generation.py index 529841284..ad5323b0b 100644 --- a/src/distilabel/steps/argilla/text_generation.py +++ b/src/distilabel/steps/argilla/text_generation.py @@ -61,7 +61,6 @@ class TextGenerationToArgilla(ArgillaBase): - generation (`str` or `List[str]`): The completions that were generated based on the input instruction. Examples: - Push a text generation dataset to an Argilla instance: ```python diff --git a/src/distilabel/steps/columns/expand.py b/src/distilabel/steps/columns/expand.py index 84cec7712..bb7d2fe9f 100644 --- a/src/distilabel/steps/columns/expand.py +++ b/src/distilabel/steps/columns/expand.py @@ -43,7 +43,6 @@ class ExpandColumns(Step): - dynamic (determined by `columns` attribute): The expanded columns. Examples: - Expand the selected columns into multiple rows: ```python diff --git a/src/distilabel/steps/columns/group.py b/src/distilabel/steps/columns/group.py index 87afe174d..852b1c520 100644 --- a/src/distilabel/steps/columns/group.py +++ b/src/distilabel/steps/columns/group.py @@ -44,7 +44,6 @@ class GroupColumns(Step): that were grouped. 
Examples: - Combine columns of a dataset: ```python @@ -125,6 +124,8 @@ def process(self, *inputs: StepInput) -> "StepOutput": class CombineColumns(GroupColumns): + """`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.""" + def __init__(self, **data: Any) -> None: warnings.warn( "`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.", diff --git a/src/distilabel/steps/columns/keep.py b/src/distilabel/steps/columns/keep.py index 84f889072..88ae7d540 100644 --- a/src/distilabel/steps/columns/keep.py +++ b/src/distilabel/steps/columns/keep.py @@ -45,7 +45,6 @@ class KeepColumns(Step): - dynamic (determined by `columns` attribute): The columns that were kept. Examples: - Select the columns to keep: ```python diff --git a/src/distilabel/steps/columns/merge.py b/src/distilabel/steps/columns/merge.py index 3b9f295c9..802b17a7d 100644 --- a/src/distilabel/steps/columns/merge.py +++ b/src/distilabel/steps/columns/merge.py @@ -48,7 +48,6 @@ class MergeColumns(Step): that were merged. Examples: - Combine columns in rows of a dataset: ```python diff --git a/src/distilabel/steps/deita.py b/src/distilabel/steps/deita.py index 5d98355c5..f817a4c26 100644 --- a/src/distilabel/steps/deita.py +++ b/src/distilabel/steps/deita.py @@ -66,7 +66,6 @@ class DeitaFiltering(GlobalStep): - [`What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685) Examples: - Filter the dataset based on the DEITA score and the cosine distance between the embeddings: ```python @@ -102,7 +101,6 @@ class DeitaFiltering(GlobalStep): ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/embeddings/embedding_generation.py b/src/distilabel/steps/embeddings/embedding_generation.py index d97e1da03..30dff63ef 100644 --- a/src/distilabel/steps/embeddings/embedding_generation.py +++ b/src/distilabel/steps/embeddings/embedding_generation.py @@ -37,7 +37,6 @@ class EmbeddingGeneration(Step): - embedding (`List[Union[float, int]]`): the generated sentence embedding. 
Examples: - Generate sentence embeddings with Sentence Transformers: ```python diff --git a/src/distilabel/steps/embeddings/nearest_neighbour.py b/src/distilabel/steps/embeddings/nearest_neighbour.py index 6d548bc94..adf3fe6c0 100644 --- a/src/distilabel/steps/embeddings/nearest_neighbour.py +++ b/src/distilabel/steps/embeddings/nearest_neighbour.py @@ -77,7 +77,6 @@ class FaissNearestNeighbour(GlobalStep): - [`The Faiss library`](https://arxiv.org/abs/2401.08281) Examples: - Generating embeddings and getting the nearest neighbours: ```python @@ -111,7 +110,6 @@ class FaissNearestNeighbour(GlobalStep): ``` Citations: - ``` @misc{douze2024faisslibrary, title={The Faiss library}, diff --git a/src/distilabel/steps/formatting/conversation.py b/src/distilabel/steps/formatting/conversation.py index 95c2369ad..29381521b 100644 --- a/src/distilabel/steps/formatting/conversation.py +++ b/src/distilabel/steps/formatting/conversation.py @@ -36,7 +36,6 @@ class ConversationTemplate(Step): - template Examples: - Create a conversation from an instruction and a response: ```python diff --git a/src/distilabel/steps/formatting/dpo.py b/src/distilabel/steps/formatting/dpo.py index 3c0e7355d..72253eb19 100644 --- a/src/distilabel/steps/formatting/dpo.py +++ b/src/distilabel/steps/formatting/dpo.py @@ -65,7 +65,6 @@ class FormatTextGenerationDPO(Step): - generations Examples: - Format your dataset for DPO fine tuning: ```python @@ -197,12 +196,11 @@ def process(self, *inputs: StepInput) -> "StepOutput": # type: ignore class FormatChatGenerationDPO(Step): - """Format the output of a combination of a `ChatGeneration` + a preference task such as - `UltraFeedback`, for Direct Preference Optimization (DPO) following the standard formatting - from frameworks such as `axolotl` or `alignment-handbook`. + """Format the output of a combination of a `ChatGeneration` + a preference task for Direct Preference Optimization (DPO). `FormatChatGenerationDPO` is a `Step` that formats the output of the combination of a `ChatGeneration` - task with a preference `Task` i.e. a task generating `ratings`, so that those are used to rank the + task with a preference `Task`, i.e. a task generating `ratings` such as `UltraFeedback`, following the standard + formatting from frameworks such as `axolotl` or `alignment-handbook`, so that those are used to rank the existing generations and provide the `chosen` and `rejected` generations based on the `ratings`. Note: @@ -239,7 +237,6 @@ class FormatChatGenerationDPO(Step): - generations Examples: - Format your dataset for DPO fine tuning: ```python diff --git a/src/distilabel/steps/formatting/sft.py b/src/distilabel/steps/formatting/sft.py index 838512f85..2793b212e 100644 --- a/src/distilabel/steps/formatting/sft.py +++ b/src/distilabel/steps/formatting/sft.py @@ -50,7 +50,6 @@ class FormatTextGenerationSFT(Step): - generation Examples: - Format your dataset for SFT fine tuning: ```python @@ -143,8 +142,7 @@ def process(self, *inputs: StepInput) -> "StepOutput": # type: ignore class FormatChatGenerationSFT(Step): - """Format the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT) following the - standard formatting from frameworks such as `axolotl` or `alignment-handbook`. + """Format the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT).
`FormatChatGenerationSFT` is a `Step` that formats the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl` @@ -172,8 +170,7 @@ class FormatChatGenerationSFT(Step): - generation Examples: - - Format your dataset for Supervised Fine Tuning (SFT): + Format your dataset for SFT: ```python from distilabel.steps import FormatChatGenerationSFT diff --git a/src/distilabel/steps/generators/data.py b/src/distilabel/steps/generators/data.py index fbf29ec7f..1d26ed885 100644 --- a/src/distilabel/steps/generators/data.py +++ b/src/distilabel/steps/generators/data.py @@ -42,7 +42,6 @@ class LoadDataFromDicts(GeneratorStep): - load Examples: - Load data from a list of dictionaries: ```python diff --git a/src/distilabel/steps/generators/huggingface.py b/src/distilabel/steps/generators/huggingface.py index 6701eafb5..5912c774e 100644 --- a/src/distilabel/steps/generators/huggingface.py +++ b/src/distilabel/steps/generators/huggingface.py @@ -89,7 +89,6 @@ class LoadDataFromHub(GeneratorStep): - load Examples: - Load data from a dataset in Hugging Face Hub: ```python @@ -289,7 +288,6 @@ class LoadDataFromFileSystem(LoadDataFromHub): - load Examples: - Load data from a Hugging Face dataset in your file system: ```python @@ -484,7 +482,6 @@ class LoadDataFromDisk(LoadDataFromHub): - load Examples: - Load data from a Hugging Face Dataset: ```python diff --git a/src/distilabel/steps/globals/huggingface.py b/src/distilabel/steps/globals/huggingface.py index 28ef3932b..82e7f35ab 100644 --- a/src/distilabel/steps/globals/huggingface.py +++ b/src/distilabel/steps/globals/huggingface.py @@ -58,7 +58,6 @@ class PushToHub(GlobalStep): - huggingface Examples: - Push batches of your dataset to the Hugging Face Hub repository: ```python diff --git a/src/distilabel/steps/reward_model.py b/src/distilabel/steps/reward_model.py index 9a88d8d65..49ddc065d 100644 --- a/src/distilabel/steps/reward_model.py +++ b/src/distilabel/steps/reward_model.py @@ -68,7 +68,6 @@ class RewardModelScore(Step, CudaDevicePlacementMixin): - scorer Examples: - Assigning an score for an instruction-response pair: ```python diff --git a/src/distilabel/steps/tasks/complexity_scorer.py b/src/distilabel/steps/tasks/complexity_scorer.py index d7ddc2362..5f972eb67 100644 --- a/src/distilabel/steps/tasks/complexity_scorer.py +++ b/src/distilabel/steps/tasks/complexity_scorer.py @@ -63,7 +63,6 @@ class ComplexityScorer(Task): - [`What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685) Examples: - Evaluate the complexity of your instructions: ```python @@ -110,11 +109,10 @@ class ComplexityScorer(Task): ) ) # result - # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \n "scores": [\n 1, \n 2\n ]\n}'}}] + # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n "scores": [\\n 1, \\n 2\\n ]\\n}'}}] ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/tasks/evol_instruct/base.py b/src/distilabel/steps/tasks/evol_instruct/base.py index 71da27155..ea73f2503 100644 --- a/src/distilabel/steps/tasks/evol_instruct/base.py +++ b/src/distilabel/steps/tasks/evol_instruct/base.py @@ -71,7 +71,6 @@ class EvolInstruct(Task): - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm) Examples: - Evolve an instruction using an LLM: ```python @@ -151,7 +150,6 @@ class EvolInstruct(Task): ``` Citations: - ``` @misc{xu2023wizardlmempoweringlargelanguage, title={WizardLM: Empowering Large Language Models to Follow Complex Instructions}, diff --git a/src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py b/src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py index 1619db422..a7e46b154 100644 --- a/src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py +++ b/src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py @@ -63,7 +63,6 @@ class EvolComplexity(EvolInstruct): - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244) Examples: - Evolve an instruction using an LLM: ```python @@ -86,7 +85,6 @@ class EvolComplexity(EvolInstruct): ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py b/src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py index 8749fcd8c..f1965d9e8 100644 --- a/src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py +++ b/src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py @@ -61,7 +61,6 @@ class EvolComplexityGenerator(EvolInstructGenerator): - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244) Examples: - Generate evolved instructions without initial instructions: ```python @@ -84,7 +83,6 @@ class EvolComplexityGenerator(EvolInstructGenerator): ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/tasks/evol_instruct/generator.py b/src/distilabel/steps/tasks/evol_instruct/generator.py index 1eea138a6..bc15655ba 100644 --- a/src/distilabel/steps/tasks/evol_instruct/generator.py +++ b/src/distilabel/steps/tasks/evol_instruct/generator.py @@ -77,7 +77,6 @@ class EvolInstructGenerator(GeneratorTask): - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm) Examples: - Generate evolved instructions without initial instructions: ```python @@ -100,7 +99,6 @@ class EvolInstructGenerator(GeneratorTask): ``` Citations: - ``` @misc{xu2023wizardlmempoweringlargelanguage, title={WizardLM: Empowering Large Language Models to Follow Complex Instructions}, diff --git a/src/distilabel/steps/tasks/evol_quality/base.py b/src/distilabel/steps/tasks/evol_quality/base.py index 1c0d6c4d5..743deeb4f 100644 --- a/src/distilabel/steps/tasks/evol_quality/base.py +++ b/src/distilabel/steps/tasks/evol_quality/base.py @@ -67,7 +67,6 @@ class EvolQuality(Task): - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685) Examples: - Evolve the quality of the responses given a prompt: ```python @@ -103,7 +102,6 @@ class EvolQuality(Task): ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/tasks/generate_embeddings.py b/src/distilabel/steps/tasks/generate_embeddings.py index 297f3c3be..85db623d9 100644 --- a/src/distilabel/steps/tasks/generate_embeddings.py +++ b/src/distilabel/steps/tasks/generate_embeddings.py @@ -50,7 +50,6 @@ class GenerateEmbeddings(Step): - [What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685) Examples: - Rank LLM candidates: ```python @@ -77,7 +76,6 @@ class GenerateEmbeddings(Step): ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/tasks/genstruct.py b/src/distilabel/steps/tasks/genstruct.py index 27fd813cf..02a065733 100644 --- a/src/distilabel/steps/tasks/genstruct.py +++ b/src/distilabel/steps/tasks/genstruct.py @@ -69,7 +69,6 @@ class Genstruct(Task): - [Ada-Instruct: Adapting Instruction Generators for Complex Reasoning](https://arxiv.org/abs/2310.04484) Examples: - Generate instructions from raw documents using the title and content: ```python @@ -105,7 +104,6 @@ class Genstruct(Task): ``` Citations: - ``` @misc{cui2023adainstructadaptinginstructiongenerators, title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning}, diff --git a/src/distilabel/steps/tasks/improving_text_embeddings.py b/src/distilabel/steps/tasks/improving_text_embeddings.py index 77ec51d22..1908a9685 100644 --- a/src/distilabel/steps/tasks/improving_text_embeddings.py +++ b/src/distilabel/steps/tasks/improving_text_embeddings.py @@ -265,7 +265,6 @@ class EmbeddingTaskGenerator(GeneratorTask): - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368) Examples: - Generate embedding tasks for text retrieval: ```python @@ -285,7 +284,6 @@ class EmbeddingTaskGenerator(GeneratorTask): ``` Citations: - ``` @misc{wang2024improvingtextembeddingslarge, title={Improving Text Embeddings with Large Language Models}, @@ -445,7 +443,6 @@ class GenerateTextRetrievalData(_EmbeddingDataGeneration): - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368) Examples: - Generate synthetic text retrieval data for training embedding models: ```python @@ -568,7 +565,6 @@ class GenerateShortTextMatchingData(_EmbeddingDataGeneration): - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368) Examples: - Generate synthetic short text matching data for training embedding models: ```python @@ -658,7 +654,6 @@ class GenerateLongTextMatchingData(_EmbeddingDataGeneration): - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368) Examples: - Generate synthetic long text matching data for training embedding models: ```python @@ -752,7 +747,6 @@ class GenerateTextClassificationData(_EmbeddingDataGeneration): - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368) Examples: - Generate synthetic text classification data for training embedding models: 
```python @@ -851,7 +845,6 @@ class MonolingualTripletGenerator(_EmbeddingDataGenerator): - model_name (`str`): the name of the model used to generate the monolingual triplets. Examples: - Generate monolingual triplets for training embedding models: ```python @@ -943,7 +936,6 @@ class BitextRetrievalGenerator(_EmbeddingDataGenerator): data. Examples: - Generate bitext retrieval data for training embedding models: ```python diff --git a/src/distilabel/steps/tasks/instruction_backtranslation.py b/src/distilabel/steps/tasks/instruction_backtranslation.py index 383333319..9322aa4f2 100644 --- a/src/distilabel/steps/tasks/instruction_backtranslation.py +++ b/src/distilabel/steps/tasks/instruction_backtranslation.py @@ -50,8 +50,43 @@ class InstructionBacktranslation(Task): References: - [`Self-Alignment with Instruction Backtranslation`](https://arxiv.org/abs/2308.06259) - Citations: + Examples: + Generate a score and reason for a given instruction and generation: + + ```python + from distilabel.steps.tasks import InstructionBacktranslation + + instruction_backtranslation = InstructionBacktranslation( + name="instruction_backtranslation", + llm=llm, + input_batch_size=10, + output_mappings={"model_name": "scoring_model"}, + ) + instruction_backtranslation.load() + + result = next( + instruction_backtranslation.process( + [ + { + "instruction": "How much is 2+2?", + "generation": "4", + } + ] + ) + ) + # result + # [ + # { + # "instruction": "How much is 2+2?", + # "generation": "4", + # "score": 3, + # "reason": "Reason for the generation.", + # "model_name": "meta-llama/Meta-Llama-3.1-8B-Instruct", + # } + # ] + ``` + Citations: ``` @misc{li2024selfalignmentinstructionbacktranslation, title={Self-Alignment with Instruction Backtranslation}, diff --git a/src/distilabel/steps/tasks/magpie/base.py b/src/distilabel/steps/tasks/magpie/base.py index dfd798377..29d04c29a 100644 --- a/src/distilabel/steps/tasks/magpie/base.py +++ b/src/distilabel/steps/tasks/magpie/base.py @@ -46,7 +46,6 @@ class MagpieBase(RuntimeParametersMixin): - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464) Citations: - ``` @misc{xu2024magpiealignmentdatasynthesis, title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing}, @@ -328,7 +327,6 @@ class Magpie(Task, MagpieBase): - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464) Examples: - Generating instructions with Llama 3 8B Instruct and TransformersLLM: ```python diff --git a/src/distilabel/steps/tasks/magpie/generator.py b/src/distilabel/steps/tasks/magpie/generator.py index e7cf77c60..dc4fe9938 100644 --- a/src/distilabel/steps/tasks/magpie/generator.py +++ b/src/distilabel/steps/tasks/magpie/generator.py @@ -91,7 +91,6 @@ class MagpieGenerator(GeneratorTask, MagpieBase): - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464) Examples: - Generating instructions with Llama 3 8B Instruct and TransformersLLM: ```python @@ -205,7 +204,6 @@ class MagpieGenerator(GeneratorTask, MagpieBase): ``` Citations: - ``` @misc{xu2024magpiealignmentdatasynthesis, title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing}, diff --git a/src/distilabel/steps/tasks/pair_rm.py b/src/distilabel/steps/tasks/pair_rm.py index c9ec217fd..23262a533 100644 --- a/src/distilabel/steps/tasks/pair_rm.py +++ 
b/src/distilabel/steps/tasks/pair_rm.py @@ -51,7 +51,6 @@ class PairRM(Step): currently, and we will use a specific `LLM`. Examples: - Rank LLM candidates: ```python @@ -82,7 +81,6 @@ class PairRM(Step): ``` Citations: - ``` @misc{jiang2023llmblenderensemblinglargelanguage, title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion}, diff --git a/src/distilabel/steps/tasks/prometheus_eval.py b/src/distilabel/steps/tasks/prometheus_eval.py index e902756e8..27cd9622e 100644 --- a/src/distilabel/steps/tasks/prometheus_eval.py +++ b/src/distilabel/steps/tasks/prometheus_eval.py @@ -134,7 +134,6 @@ class PrometheusEval(Task): - [prometheus-eval: Evaluate your LLM's response with Prometheus π―](https://github.com/prometheus-eval/prometheus-eval) Examples: - Critique and evaluate LLM generation quality using Prometheus 2_0: ```python @@ -145,7 +144,7 @@ class PrometheusEval(Task): prometheus = PrometheusEval( llm=vLLM( model="prometheus-eval/prometheus-7b-v2.0", - chat_template="[INST] {{ messages[0]\"content\" }}\n{{ messages[1]\"content\" }}[/INST]", + chat_template="[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]", ), mode="absolute", rubric="factual-validity" @@ -182,7 +181,7 @@ class PrometheusEval(Task): prometheus = PrometheusEval( llm=vLLM( model="prometheus-eval/prometheus-7b-v2.0", - chat_template="[INST] {{ messages[0]\"content\" }}\n{{ messages[1]\"content\" }}[/INST]", + chat_template="[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]", ), mode="relative", rubric="honesty" @@ -219,12 +218,12 @@ class PrometheusEval(Task): prometheus = PrometheusEval( llm=vLLM( model="prometheus-eval/prometheus-7b-v2.0", - chat_template="[INST] {{ messages[0]\"content\" }}\n{{ messages[1]\"content\" }}[/INST]", + chat_template="[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]", ), mode="absolute", rubric="custom", rubrics={ - "custom": "[A]\nScore 1: A\nScore 2: B\nScore 3: C\nScore 4: D\nScore 5: E" + "custom": "[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E" } ) @@ -259,7 +258,7 @@ class PrometheusEval(Task): prometheus = PrometheusEval( llm=vLLM( model="prometheus-eval/prometheus-7b-v2.0", - chat_template="[INST] {{ messages[0]\"content\" }}\n{{ messages[1]\"content\" }}[/INST]", + chat_template="[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]", ), mode="absolute", rubric="helpfulness", @@ -293,7 +292,6 @@ class PrometheusEval(Task): ``` Citations: - ``` @misc{kim2024prometheus2opensource, title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models}, diff --git a/src/distilabel/steps/tasks/quality_scorer.py b/src/distilabel/steps/tasks/quality_scorer.py index 3e8857c1b..ff3f199db 100644 --- a/src/distilabel/steps/tasks/quality_scorer.py +++ b/src/distilabel/steps/tasks/quality_scorer.py @@ -63,7 +63,6 @@ class QualityScorer(Task): - [`What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685) Examples: - Evaluate the quality of your instructions: ```python @@ -134,7 +133,6 @@ class QualityScorer(Task): ``` Citations: - ``` @misc{liu2024makesgooddataalignment, title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning}, diff --git a/src/distilabel/steps/tasks/self_instruct.py b/src/distilabel/steps/tasks/self_instruct.py index 27bcecc0b..e59cad4b6 100644 --- a/src/distilabel/steps/tasks/self_instruct.py +++ b/src/distilabel/steps/tasks/self_instruct.py @@ -62,7 +62,6 @@ class SelfInstruct(Task): - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560) Examples: - Generate instructions based on a given input: ```python @@ -90,7 +89,6 @@ class SelfInstruct(Task): ``` Citations: - ``` @misc{wang2023selfinstructaligninglanguagemodels, title={Self-Instruct: Aligning Language Models with Self-Generated Instructions}, diff --git a/src/distilabel/steps/tasks/sentence_transformers.py b/src/distilabel/steps/tasks/sentence_transformers.py index 59c504200..82f7a2a94 100644 --- a/src/distilabel/steps/tasks/sentence_transformers.py +++ b/src/distilabel/steps/tasks/sentence_transformers.py @@ -104,7 +104,6 @@ class GenerateSentencePair(Task): - embedding Examples: - Paraphrasing: ```python @@ -358,9 +357,11 @@ def format_output( if self.triplet: return { "positive": groups[0].strip(), - "negative": groups[1].strip() - if len(groups) > 1 and groups[1] is not None - else None, + "negative": ( + groups[1].strip() + if len(groups) > 1 and groups[1] is not None + else None + ), } return {"positive": groups[0].strip()} diff --git a/src/distilabel/steps/tasks/structured_generation.py b/src/distilabel/steps/tasks/structured_generation.py index cbea279eb..81ee74bd8 100644 --- a/src/distilabel/steps/tasks/structured_generation.py +++ b/src/distilabel/steps/tasks/structured_generation.py @@ -48,7 +48,6 @@ class StructuredGeneration(Task): - structured-generation Examples: - Generate structured output from a JSON schema: ```python diff --git a/src/distilabel/steps/tasks/text_generation.py b/src/distilabel/steps/tasks/text_generation.py index 6f4d5d758..4f6b681d1 100644 --- a/src/distilabel/steps/tasks/text_generation.py +++ b/src/distilabel/steps/tasks/text_generation.py @@ -47,7 +47,6 @@ class TextGeneration(Task): - text-generation Examples: - Generate text from an instruction: ```python @@ -152,7 +151,6 @@ class ChatGeneration(Task): `:material-chat:` Examples: - Generate text from a conversation in OpenAI chat format: ```python diff --git a/src/distilabel/steps/tasks/ultrafeedback.py b/src/distilabel/steps/tasks/ultrafeedback.py index dae68bb48..091d99599 100644 --- a/src/distilabel/steps/tasks/ultrafeedback.py +++ b/src/distilabel/steps/tasks/ultrafeedback.py @@ -59,7 +59,6 @@ class UltraFeedback(Task): - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback) Examples: - Rate generations from different LLMs based on the selected aspect: ```python @@ -130,7 +129,7 @@ class UltraFeedback(Task): # 'ratings': [5, 1], # 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.', # "The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. 
The model shows no uncertainty or indication that it does not know the answer."], - # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{"ratings": [\n 5,\n 1\n] \n\n,"rationales": [\n "The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.",\n "The response is confidently incorrect, as it provides unrelated information (\'a car\') and does not address the question. The model shows no uncertainty or indication that it does not know the answer."\n] }'}, + # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{"ratings": [\\n 5,\\n 1\\n] \\n\\n,"rationales": [\\n "The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.",\\n "The response is confidently incorrect, as it provides unrelated information (\'a car\') and does not address the question. The model shows no uncertainty or indication that it does not know the answer."\\n] }'}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ``` @@ -170,12 +169,11 @@ class UltraFeedback(Task): # 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.', # 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'], # 'types': [1, 3, 1], - # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.",\n "Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question."\n ]\n ,\n "rationales_for_rating": [\n "Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.",\n "Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question."\n ]\n ,\n "types": [\n 1, 3,\n 1\n ]\n }'}, + # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n "ratings": [\\n 1,\\n 5\\n ]\\n ,\\n "rationales": [\\n "Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.",\\n "Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question."\\n ]\\n ,\\n "rationales_for_rating": [\\n "Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.",\\n "Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question."\\n ]\\n ,\\n "types": [\\n 1, 3,\\n 1\\n ]\\n }'}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ``` Citations: - ``` @misc{cui2024ultrafeedbackboostinglanguagemodels, title={UltraFeedback: Boosting Language Models with Scaled AI Feedback}, @@ -313,9 +311,11 @@ def _format_ratings_rationales_output( formatted_outputs.append( { - "ratings": int(re.findall(r"\b\d+\b", matches.group(1))[0]) - if matches.group(1) not in ["None", "N/A"] - else None, + "ratings": ( + int(re.findall(r"\b\d+\b", matches.group(1))[0]) + if matches.group(1) not in ["None", "N/A"] + else None + ), "rationales": matches.group(2), } ) @@ -358,13 +358,17 @@ def _format_types_ratings_rationales_output( formatted_outputs.append( { - "types": int(re.findall(r"\b\d+\b", matches.group(1))[0]) - if matches.group(1) not in ["None", "N/A"] - else None, + "types": ( + int(re.findall(r"\b\d+\b", matches.group(1))[0]) + if matches.group(1) not in ["None", "N/A"] + else None + ), "rationales": matches.group(2), - "ratings": int(re.findall(r"\b\d+\b", matches.group(3))[0]) - if matches.group(3) not in ["None", "N/A"] - else None, + "ratings": ( + int(re.findall(r"\b\d+\b", matches.group(3))[0]) + if matches.group(3) not in ["None", "N/A"] + else None + ), "rationales-for-ratings": matches.group(4), } ) diff --git a/src/distilabel/steps/tasks/urial.py b/src/distilabel/steps/tasks/urial.py index ed0e72d96..705b9c488 100644 --- a/src/distilabel/steps/tasks/urial.py +++ b/src/distilabel/steps/tasks/urial.py @@ -47,7 +47,6 @@ class URIAL(Task): - [The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning](https://arxiv.org/abs/2312.01552) Examples: - Generate text from an instruction: ```python diff --git a/src/distilabel/steps/truncate.py b/src/distilabel/steps/truncate.py index bb1785b8a..6e68af663 100644 --- a/src/distilabel/steps/truncate.py +++ b/src/distilabel/steps/truncate.py @@ -51,7 +51,6 @@ class TruncateTextColumn(Step): - text-manipulation Examples: - Truncating a row to a given number of tokens: ```python diff --git a/src/distilabel/utils/docstring.py b/src/distilabel/utils/docstring.py index 899425e26..913b94700 100644 --- a/src/distilabel/utils/docstring.py +++ b/src/distilabel/utils/docstring.py @@ -165,7 +165,7 @@ def parse_google_docstring(func: Callable) -> Docstring: # noqa: C901 elif section_name == "examples": # Parse examples into a dictionary example_items = re.findall( - r"(\w[\w\s]*?):\s*\n\s*```python\n(.*?)\n\s*```", + r"(\w[\w\s]*?):\s*\n?\s*```python\n(.*?)\n\s*```", section_content, re.DOTALL, ) @@ -217,7 +217,6 @@ def get_bibtex(ref: str) -> str: The bibtex style citation. Examples: - ```python cite = get_bibtex(r"https://arxiv.org/abs/2406.18518") @misc{other,
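The `parse_google_docstring` change above is what makes the `Examples:` blank-line removals throughout this diff safe: the extraction pattern gains a `\n?`, so the newline between an example's title line and its opening ```python fence becomes optional instead of required. A minimal sketch of the difference, under stated assumptions (the `OLD`/`NEW` names and the sample `section` string are illustrative, not code from the repository):

```python
import re

# Pattern before this diff: a newline between the example title and the
# opening fence is mandatory.
OLD = re.compile(r"(\w[\w\s]*?):\s*\n\s*```python\n(.*?)\n\s*```", re.DOTALL)
# Pattern after this diff: the `\n?` makes that newline optional.
NEW = re.compile(r"(\w[\w\s]*?):\s*\n?\s*```python\n(.*?)\n\s*```", re.DOTALL)

# A hypothetical docstring fragment where the title and the fence share a line.
section = 'Generate text from an instruction: ```python\ntask = TextGeneration(llm=llm)\n```'

print(OLD.findall(section))  # [] -- the example is silently dropped
print(NEW.findall(section))  # [('Generate text from an instruction', 'task = TextGeneration(llm=llm)')]
```

With the relaxed pattern, example titles are still paired with their code blocks even when no newline separates the title from the fence, which is presumably why the blank line after every `Examples:` header could be deleted across the codebase without breaking the docs pages generated from these docstrings.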