[WIP] Introduce Multi-lingual Support #1

Draft: wants to merge 227 commits into base: main
Commits (227)
77cf624
Wiki-language statistics + example
vsabolcec Feb 12, 2024
4134451
Initial multilingual Gopher quality filter + example
vsabolcec Feb 12, 2024
2d0958e
HuggingFace token via env
vsabolcec Feb 13, 2024
63d2a7a
Rework process_fineweb.py example
vsabolcec Feb 13, 2024
6d0cd05
Datatrove language stats
vsabolcec Feb 13, 2024
23ebd80
Rework examples to new pipeline blocks
vsabolcec Feb 13, 2024
17e3b90
Initial README for multilingual preprocessing
vsabolcec Feb 13, 2024
90f8ad8
Slurm example update
vsabolcec Feb 13, 2024
2a71ad1
README update
vsabolcec Feb 13, 2024
fae2b05
Update examples and lang_stats.json
vsabolcec Feb 14, 2024
5f590d2
Add exclusion_writer to example
vsabolcec Feb 14, 2024
e2d8dbf
Update README
vsabolcec Feb 14, 2024
3fd2e9d
Update README
vsabolcec Feb 14, 2024
ead90bc
Unify tokenization
vsabolcec Feb 15, 2024
59edccf
README
vsabolcec Feb 15, 2024
c35edea
Word tokenizer support
vsabolcec Feb 20, 2024
8fb2133
Formatting
vsabolcec Feb 20, 2024
beeca91
Add stanza as dependency
vsabolcec Feb 20, 2024
ca4dd1d
Additional language statistics
vsabolcec Feb 20, 2024
9746b4a
Formatting
vsabolcec Feb 20, 2024
bbaa097
Lazy load stanza tokenizer
vsabolcec Feb 21, 2024
5279b2d
Word length distribution stat
vsabolcec Feb 21, 2024
9f2e8f9
Move q() in scope
vsabolcec Feb 21, 2024
ccc9347
Word statistics
vsabolcec Feb 22, 2024
1ccfd8e
Typo
vsabolcec Feb 22, 2024
d549b51
Ignore word case in counter
vsabolcec Feb 22, 2024
17b2c6c
Update lang_stats and README
vsabolcec Feb 22, 2024
572a19e
Word counter: optional prune
vsabolcec Feb 26, 2024
74cdba5
Top 10000 words
vsabolcec Feb 26, 2024
695ce52
Additional language stats
vsabolcec Feb 26, 2024
0c7e8d5
Remove util
vsabolcec Feb 26, 2024
2391880
Format
vsabolcec Feb 26, 2024
8e03924
Stats
vsabolcec Feb 26, 2024
2686a2d
cc_news stats
vsabolcec Feb 26, 2024
2d5b7a6
Stats
vsabolcec Feb 26, 2024
53824b6
Word stats to file
vsabolcec Feb 26, 2024
d10e1d7
Updated stats
vsabolcec Feb 26, 2024
c2e4ae5
ShuffledHFDataset, wiki sanity check
vsabolcec Feb 26, 2024
33e225f
Named parameter
vsabolcec Feb 26, 2024
61a957b
Division by zero
vsabolcec Feb 26, 2024
e60c2f0
Division by zero: different
vsabolcec Feb 26, 2024
b07fe8b
Updated stats
vsabolcec Feb 26, 2024
100ad99
Change thresholds
vsabolcec Feb 27, 2024
afdcce6
Update stats
vsabolcec Feb 27, 2024
bcab477
Sanity check and updated stats
vsabolcec Feb 27, 2024
ba8ac7c
Sanity check update
vsabolcec Feb 28, 2024
3d2906f
Stats: count UTF-8 bytes
vsabolcec Feb 29, 2024
a7dc867
Commoncrawl stats, format
vsabolcec Feb 29, 2024
67c49b4
Fix typo
vsabolcec Feb 29, 2024
451470a
Faster doc stats
vsabolcec Feb 29, 2024
8080148
Autoformat
vsabolcec Mar 1, 2024
6282cee
MultilingualGopherFilter test
vsabolcec Mar 1, 2024
b5c208b
Merge remote-tracking branch 'upstream/main' into multilingual
vsabolcec Mar 1, 2024
858585b
adjust multi-lingual
Mar 1, 2024
ff23b94
bugfix mailuser
guipenedo Mar 4, 2024
6baacca
Add `jobs_status` command. (#113)
lvwerra Mar 5, 2024
366c4cc
bugfix recursive
guipenedo Mar 5, 2024
df0e324
nit
guipenedo Mar 5, 2024
7dbf8ec
bugfix tokenization unshuffled cleanup
guipenedo Mar 5, 2024
6a15cab
Re-enable `datasets` test (#114)
mariosasko Mar 5, 2024
486acf0
nltk stats
Mar 5, 2024
fe391a7
Merge branch 'multilingual' into bm/run-pipelines
Mar 5, 2024
e1f9524
Update warc.py (#115)
jordane95 Mar 6, 2024
97090c5
Merge remote-tracking branch 'upstream/main' into multilingual
vsabolcec Mar 7, 2024
a98aafd
bugfix doc_len and doc_len_tokens means and std_dev
guipenedo Mar 8, 2024
ba34a3b
Add std to language stats
vsabolcec Mar 11, 2024
8027b16
Update language stats
vsabolcec Mar 11, 2024
eee0935
Clean-up sanity_check_wiki.py
vsabolcec Mar 11, 2024
9086eff
max_non_alpha_words_ratio per language filter
vsabolcec Mar 11, 2024
4966a6a
Fix typo
vsabolcec Mar 11, 2024
6809caa
Fix no DocumentStats
vsabolcec Mar 11, 2024
c66052d
Sanity check: min. 1 word
vsabolcec Mar 11, 2024
a7ab410
Add new languages
vsabolcec Mar 12, 2024
72c0cb8
Add word tokenizers
vsabolcec Mar 12, 2024
6539ca6
Top 50 language stats
vsabolcec Mar 13, 2024
03272f1
Add more word tokenizers
vsabolcec Mar 13, 2024
9d3e505
Change LanguageStatsReducer behaviour
vsabolcec Mar 13, 2024
f059380
Add LanguageStatsReducer default reduce
vsabolcec Mar 13, 2024
20b4b44
max_non_alpha_words_ratio in language statistics
vsabolcec Mar 13, 2024
f7ffa27
Update language stats
vsabolcec Mar 13, 2024
6ded809
draft
Mar 13, 2024
0cecc70
Merge branch 'multilingual' into bm/run-pipelines
Mar 13, 2024
505dcac
backup
Mar 13, 2024
8b78489
Update language stats
vsabolcec Mar 13, 2024
5ab7f19
add languages
Mar 13, 2024
9bef960
Merge branch 'multilingual' into bm/run-pipelines
Mar 13, 2024
9d0bc00
More tokenizers and SpaCy switch
vsabolcec Mar 14, 2024
38814ae
Add tokenizers, update stat script
vsabolcec Mar 14, 2024
9399c7c
Revert some tokenizers to stanza
vsabolcec Mar 14, 2024
c721385
Tweak SpaCy
vsabolcec Mar 14, 2024
5fa4927
95 tokenizers + test
vsabolcec Mar 14, 2024
47152be
Format code
vsabolcec Mar 14, 2024
18a5441
Update language stats
vsabolcec Mar 14, 2024
37a84f2
Top 100 language stats
vsabolcec Mar 14, 2024
5c9e4d5
draft 2
Mar 14, 2024
7444a56
draft 3
Mar 14, 2024
4a797f1
Spacy tokenizer ignore whitespaces
vsabolcec Mar 15, 2024
23c0e66
Format code
vsabolcec Mar 15, 2024
e2f5096
draft 5
Mar 15, 2024
db248b6
Update language stats
vsabolcec Mar 15, 2024
abdae22
Merge branch 'multilingual' into bm/run-pipelines
Mar 15, 2024
0a54ba6
Update language stats for top 100 languages
vsabolcec Mar 15, 2024
705e412
draft 6
Mar 15, 2024
8e0fa3d
Merge branch 'multilingual' into bm/run-pipelines
Mar 15, 2024
aadb17f
IndicNLP tokenizer fix
vsabolcec Mar 15, 2024
fc229cf
darft 9
Mar 15, 2024
3a00b5a
Merge branch 'multilingual' into bm/run-pipelines
Mar 15, 2024
f6ec925
Word tokenizer test
vsabolcec Mar 15, 2024
895b0e8
run spacy v2 top100
Mar 15, 2024
6d708bd
Bug fix: when file is empty (#126)
jordane95 Mar 15, 2024
f2a0ee0
fix
Mar 16, 2024
ea7c10b
add aggregated stats
Mar 16, 2024
c795863
update stats
Mar 17, 2024
69bbbb8
Korean and Thai faster tokenizers
vsabolcec Mar 18, 2024
21d0607
Split language statistics into multiple files
vsabolcec Mar 18, 2024
b9bd9fb
Split language statistics into multiple files (2)
vsabolcec Mar 19, 2024
f5655cc
Separate filter parameters
vsabolcec Mar 19, 2024
5e673b6
Remove testing language and format code
vsabolcec Mar 19, 2024
b2020a4
Introduce LanguageStats dataclass and clean up
vsabolcec Mar 19, 2024
0bca5a8
Remove alternative lang_stats scripts
vsabolcec Mar 19, 2024
52ee2c5
LanguageStatsReducer output to yml
vsabolcec Mar 19, 2024
b98ef1c
Load tokenizer using `from_file` (#122)
guipenedo Mar 19, 2024
bc57162
add `depends=` to LocalPipelineExecutor (#100)
guipenedo Mar 19, 2024
35fc009
Remove commoncrawl_stats.py
vsabolcec Mar 19, 2024
04fb13f
Update multilingual README.md
vsabolcec Mar 19, 2024
494d0a7
Revert remove most_common(10000)
vsabolcec Mar 19, 2024
0ac5745
Convert .json language stats into .yml
vsabolcec Mar 19, 2024
1d63b82
Merge pull request #5 from beme248/bm/run-pipelines
vsabolcec Mar 19, 2024
2a4aedb
Combine branches and use .yml in process_non_english.py
vsabolcec Mar 19, 2024
644ca12
Wiki language stats RUN_MODE
vsabolcec Mar 20, 2024
e74101a
Remove testing part from Wiki language stats script
vsabolcec Mar 20, 2024
349fdb1
Update README
vsabolcec Mar 20, 2024
ba63075
Clean up
vsabolcec Mar 20, 2024
55c6b1c
Improve C4 filter and dedup (#124)
guipenedo Mar 20, 2024
27e2cea
Adds option to shuffle input files in readers (#128)
guipenedo Mar 20, 2024
b6cd366
update Trafilatura version (#130)
adbar Mar 20, 2024
8355059
Update README
vsabolcec Mar 21, 2024
304d34c
Typo
vsabolcec Mar 21, 2024
6daa5e8
Changes to text normalization + FTFY and lines symbol formatters (#133)
guipenedo Mar 22, 2024
8421fe1
removed debug print
guipenedo Mar 22, 2024
87e30a4
fix symbollinesremover regex hanging
guipenedo Mar 23, 2024
56aa210
Minor Terminology and Documentation Updates for Local Tokenizer Loadi…
justHungryMan Mar 23, 2024
c2d61de
Tweak process_non_english.py
vsabolcec Mar 27, 2024
d44a51c
Merge remote-tracking branch 'upstream/main' into pretokenization
vsabolcec Apr 2, 2024
5c6f1bc
Add tokenizers
vsabolcec Apr 2, 2024
c4d6fce
Type fix
vsabolcec Apr 2, 2024
afadc8f
add requeue and QOS slurm options (#144)
marianna13 Apr 2, 2024
944fb21
Merge branch 'huggingface:main' into pretokenization
vsabolcec Apr 3, 2024
cca0e41
English- and Korean-only tokenization
vsabolcec Apr 3, 2024
5933335
English tokenizer test
vsabolcec Apr 3, 2024
670fc40
Fix substring dedup range (#132)
jordane95 Apr 5, 2024
8c7e052
Line dedup min remove words option (#146)
guipenedo Apr 5, 2024
22cba4c
Add multilang tokenizer to Gopher quality filter
vsabolcec Apr 10, 2024
575d98f
Require language metadata in Gopher quality
vsabolcec Apr 10, 2024
48377af
Merge branch 'huggingface:main' into pretokenization
vsabolcec Apr 10, 2024
1e8c375
Lazy-load and separate dependencies
vsabolcec Apr 10, 2024
5015a4c
fix timeout related issues in extractors
guipenedo Apr 10, 2024
e6b1ccf
Move top-level import to classes
vsabolcec Apr 11, 2024
93d5c2a
Remove print in tests
vsabolcec Apr 11, 2024
005a3ef
pyproject.toml: tokenization -> multilingual
vsabolcec Apr 11, 2024
40a3a1b
New options for FastTextClassifierFilter: apply on sentence or paragr…
guipenedo Apr 11, 2024
209ebec
Url deduplication (#145)
hynky1999 Apr 12, 2024
01175ba
fetch all the labels on fasttextfilter
guipenedo Apr 15, 2024
854389f
add min_num_sentences to line dedup
guipenedo Apr 15, 2024
6614f06
fix tests
guipenedo Apr 16, 2024
7a0f6c4
Fix race conditions during download/extraction (#155)
hynky1999 Apr 16, 2024
9d75443
Adds PII removal (#156)
guipenedo Apr 16, 2024
2aa32d4
fineweb filter
hynky1999 Apr 20, 2024
c4ee193
fw filters
hynky1999 Apr 20, 2024
9a88beb
remove duplicate filter
hynky1999 Apr 20, 2024
05194d3
clean up + fineweb example
guipenedo Apr 20, 2024
447c942
added PII block
guipenedo Apr 20, 2024
50eb055
Pypi Publish Action (#159)
hynky1999 Apr 22, 2024
1724f28
Update pyproject.toml
guipenedo Apr 22, 2024
7042785
Update pypi-release.yml
guipenedo Apr 22, 2024
6d06210
Update pypi-release.yml
guipenedo Apr 22, 2024
4e9235f
Added c4 badwords filter, added batch tokenization to tokenscounter (…
guipenedo Apr 24, 2024
a8d21e2
fix documents with a lot of paragraphs being removed by the repetitio…
guipenedo Apr 26, 2024
d3f8245
FineWeb quality statistics
vsabolcec Apr 26, 2024
1ba51c3
Merge upstream/main
vsabolcec Apr 27, 2024
83077f1
Multilang tokenizer for MinHash
vsabolcec Apr 27, 2024
ae8b4de
Remove print
vsabolcec Apr 27, 2024
0c7e54a
Change Korean Tokenizer with Kiwi
Kesta-bos Apr 28, 2024
cd1587c
fix
Kesta-bos Apr 28, 2024
a687ef0
add kiwipiepy dependency
Kesta-bos Apr 28, 2024
c72b1e4
bugfix pii emails and quality filters default args
guipenedo May 2, 2024
6a4881d
Add a skip parameter to all readers (defaults to zero) (#167)
rantav May 3, 2024
15c8425
Adds n-gram based decontamination (#172)
guipenedo May 4, 2024
d56d3c5
add skip in decont index builder
guipenedo May 4, 2024
b8ce9c1
Add repetition stats and update wiki script
vsabolcec May 6, 2024
22c739e
fix for requeueing code and change minhash default
guipenedo May 6, 2024
750f0fe
Update stats
vsabolcec May 7, 2024
af647a7
Fix division by zero
vsabolcec May 7, 2024
8805e13
Handle non-method cases in to_dict conversion (#139)
justHungryMan May 7, 2024
ff50473
Adds `tasks_per_job` to slurm executor (#153)
guipenedo May 7, 2024
12f79a1
Update statistics
vsabolcec May 7, 2024
b2b96e4
Unsigned int tokenizer and srun args (#154)
marianna13 May 7, 2024
caeca43
Update filters
vsabolcec May 7, 2024
a970249
add linting to the examples folder
guipenedo May 7, 2024
4d83342
Enhance BaseReader to allow custom adapters access to instance variab…
justHungryMan May 7, 2024
b246107
Merge branch 'beme248:multilingual' into multilingual
Kesta-bos May 8, 2024
4218ec8
Merge pull request #6 from Kesta-bos/multilingual
vsabolcec May 8, 2024
9ef64e8
Format
vsabolcec May 8, 2024
a6eca69
Merge branch 'huggingface:main' into multilingual
vsabolcec May 8, 2024
ed5244a
Fix Shuffled HFReader
vsabolcec May 8, 2024
4072131
Update Korean stats and filters
vsabolcec May 8, 2024
abd1534
Better FineWeb
vsabolcec May 12, 2024
331c439
Update statistics
vsabolcec May 12, 2024
220e235
Format
vsabolcec May 12, 2024
87720e5
MultilingualFineWebQualityFilter
vsabolcec May 13, 2024
9e4b222
Merge origin/pretokenization
vsabolcec May 13, 2024
7bd1093
Lazy load and tokenize -> word_tokenize
vsabolcec May 13, 2024
ebb8f0f
sent_tokenize
vsabolcec May 13, 2024
4b97552
Move tokenization dependencies to multilingual
vsabolcec May 13, 2024
a92222a
C4 quality sent_tokenize
vsabolcec May 13, 2024
19e04d8
Langstats tokenizer fix
vsabolcec May 13, 2024
bcac6c5
Langstats tokenizer fix 2
vsabolcec May 13, 2024
545d379
Sent tokenize tweaks
vsabolcec May 13, 2024
7ae2fa7
Whitespace stripping
vsabolcec May 13, 2024
7034978
Fallback tokenizer
vsabolcec May 13, 2024
b2ae282
Khmer punctuation
vsabolcec May 14, 2024
5846fbd
Update Wiki stats script
vsabolcec May 14, 2024
c0505de
Format
vsabolcec May 14, 2024
e3936fb
Update stat and filter #1
vsabolcec May 14, 2024
2c61271
Remove nlpashto
vsabolcec May 14, 2024
4f5d07e
Update stats and filters #2
vsabolcec May 14, 2024
28b0205
Small readme fix
vsabolcec May 28, 2024
61 changes: 61 additions & 0 deletions .github/workflows/pypi-release.yml
@@ -0,0 +1,61 @@
name: PyPI release
on:
  workflow_dispatch:

jobs:
  testing:
    uses: ./.github/workflows/testing.yml
  release:
    needs: testing
    runs-on: ubuntu-latest
    env:
      TWINE_USERNAME: __token__

    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -U twine build

      - name: Build the dist files
        run: python -m build .

      - name: Publish to the test PyPI
        env:
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
        run: twine upload dist/* --repository=testpypi

      - name: Test installing from test PyPI and running tests
        run: |
          pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple datatrove[testing]
          python -m nltk.downloader punkt
          make test

      - name: Get tag name
        id: get_tag_name
        run: |
          echo TAG_NAME=$(grep '^version' pyproject.toml | head -1 | cut -d '"' -f 2) >> $GITHUB_OUTPUT

      - name: Tag the release
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: 'refs/tags/v${{ steps.get_tag_name.outputs.TAG_NAME }}',
              sha: context.sha
            })

      - name: Publish to PyPI
        env:
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: twine upload dist/* --repository=pypi
7 changes: 4 additions & 3 deletions .github/workflows/ci.yml → .github/workflows/testing.yml
@@ -1,4 +1,4 @@
-name: CI
+name: Test & Check Code Quality

 on:
   pull_request:
@@ -7,6 +7,7 @@ on:
   push:
     branches:
       - main
+  workflow_call:

 jobs:
   check_code_quality:
@@ -23,8 +24,8 @@ jobs:
         pip install .[quality]
       - name: Check quality
         run: |
-          ruff check tests src # linter
-          ruff format --check tests src # formatter
+          ruff check tests src examples # linter
+          ruff format --check tests src examples # formatter

   test:
     runs-on: ubuntu-latest
176 changes: 176 additions & 0 deletions examples/fineweb.py
@@ -0,0 +1,176 @@
"""
This file contains the code used to process and create the
FineWeb dataset (https://huggingface.co/datasets/HuggingFaceFW/fineweb)
"""

from datatrove.executor.slurm import SlurmPipelineExecutor
from datatrove.pipeline.dedup import MinhashDedupCluster, MinhashDedupFilter, MinhashDedupSignature
from datatrove.pipeline.dedup.minhash import MinhashConfig, MinhashDedupBuckets
from datatrove.pipeline.extractors import Trafilatura
from datatrove.pipeline.filters import (
C4QualityFilter,
FineWebQualityFilter,
GopherQualityFilter,
GopherRepetitionFilter,
LanguageFilter,
URLFilter,
)
from datatrove.pipeline.formatters import PIIFormatter
from datatrove.pipeline.readers import JsonlReader, WarcReader
from datatrove.pipeline.tokens import TokensCounter
from datatrove.pipeline.writers.jsonl import JsonlWriter


"""
we first ran the following pipeline for each dump
"""
DUMP_TO_PROCESS = "CC-MAIN-2O23-5O" # example

MAIN_OUTPUT_PATH = "s3://some_s3_bucket"
FILTERING_OUTPUT_PATH = f"{MAIN_OUTPUT_PATH}/base_processing"

main_processing_executor = SlurmPipelineExecutor(
job_name=f"cc_{DUMP_TO_PROCESS}",
pipeline=[
WarcReader(
f"s3://commoncrawl/crawl-data/{DUMP_TO_PROCESS}/segments/",
glob_pattern="*/warc/*", # we want the warc files
default_metadata={"dump": DUMP_TO_PROCESS},
),
URLFilter(exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/1_url/{DUMP_TO_PROCESS}")),
Trafilatura(favour_precision=True),
LanguageFilter(
exclusion_writer=JsonlWriter(
f"{FILTERING_OUTPUT_PATH}/2_non_english/",
output_filename="${language}/" + DUMP_TO_PROCESS + "/${rank}.jsonl.gz",
# folder structure: language/dump/file
)
),
GopherRepetitionFilter(
exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/3_gopher_rep/{DUMP_TO_PROCESS}")
),
GopherQualityFilter(
exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/4_gopher_qual/{DUMP_TO_PROCESS}")
),
C4QualityFilter(
filter_no_terminal_punct=False,
exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/5_c4/{DUMP_TO_PROCESS}"),
),
FineWebQualityFilter(
exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/6_fineweb_qual/{DUMP_TO_PROCESS}")
),
JsonlWriter(f"{FILTERING_OUTPUT_PATH}/output/{DUMP_TO_PROCESS}"),
],
tasks=8000,
time="10:00:00",
logging_dir=f"{MAIN_OUTPUT_PATH}/logs/base_processing/{DUMP_TO_PROCESS}",
slurm_logs_folder=f"logs/base_processing/{DUMP_TO_PROCESS}/slurm_logs", # must be local
randomize_start=True, # don't hit the bucket all at once with the list requests
mem_per_cpu_gb=2,
partition="hopper-cpu",
)
main_processing_executor.run()

"""
we then applied minhash deduplication to each individual dump,
"""

# you can also change ngrams or the number of buckets and their size here
minhash_config = MinhashConfig(
use_64bit_hashes=True, # better precision -> fewer false positives (collisions)
num_buckets=14,
hashes_per_bucket=8,
n_grams=5,
)

S3_MINHASH_BASE_PATH = f"{MAIN_OUTPUT_PATH}/minhash"

S3_LOGS_FOLDER = f"{MAIN_OUTPUT_PATH}/logs/minhash"
LOCAL_LOGS_FOLDER = "logs/minhash"

TOTAL_TASKS = 1000

# this is the original data that we want to deduplicate
INPUT_READER = JsonlReader(
f"{FILTERING_OUTPUT_PATH}/output/{DUMP_TO_PROCESS}"
) # this is the output from the first part

# stage 1 computes minhash signatures for each task (each task gets a set of files)
stage1 = SlurmPipelineExecutor(
job_name=f"mh1_{DUMP_TO_PROCESS}",
pipeline=[
INPUT_READER,
MinhashDedupSignature(
output_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/signatures", config=minhash_config
),
],
tasks=TOTAL_TASKS,
time="5:00:00",
partition="hopper-cpu",
logging_dir=f"{S3_LOGS_FOLDER}/signatures",
slurm_logs_folder=f"{LOCAL_LOGS_FOLDER}/signatures/slurm_logs",
randomize_start=True,
depends=main_processing_executor, # only start after the first one completes
)

stage2 = SlurmPipelineExecutor(
job_name=f"mh2_{DUMP_TO_PROCESS}",
pipeline=[
MinhashDedupBuckets(
input_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/signatures",
output_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/buckets",
config=MinhashConfig(use_64bit_hashes=True),
),
],
tasks=minhash_config.num_buckets * 50, # the code supports parallelizing each bucket. here we run 50
# workers per bucket
randomize_start=True,
logging_dir=f"{S3_LOGS_FOLDER}/buckets",
partition="hopper-cpu",
time="02:00:00",
mem_per_cpu_gb=4,
cpus_per_task=3, # you can add run more (smaller) tasks if you do not have a lot of memory
depends=stage1,
)


stage3 = SlurmPipelineExecutor(
job_name=f"mh3_{DUMP_TO_PROCESS}",
pipeline=[
MinhashDedupCluster(
input_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/buckets",
output_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/remove_ids",
config=minhash_config,
),
],
tasks=1, # this step runs on a single task
logging_dir=f"{S3_LOGS_FOLDER}/clustering",
partition="hopper-cpu",
time="30:00:00", # and can also be quite slow. Usually not this slow though
mem_per_cpu_gb=25,
cpus_per_task=8, # if you dedup a full dump, you do need a lot of memory for this one
depends=stage2,
)


stage4 = SlurmPipelineExecutor(
job_name=f"mh4_{DUMP_TO_PROCESS}",
pipeline=[
INPUT_READER,
TokensCounter(), # you can remove this one, it's just a nice way to know how many tokens we have
# before and after dedup
MinhashDedupFilter(input_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/remove_ids"),
# run the PII removal
PIIFormatter(),
JsonlWriter(f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/deduped_output"),
],
tasks=TOTAL_TASKS,
logging_dir=f"{S3_LOGS_FOLDER}/filtering",
partition="hopper-cpu",
time="5:00:00",
mem_per_cpu_gb=4,
depends=stage3,
)

# launch dedup pipelines
stage4.run()
122 changes: 122 additions & 0 deletions examples/multilingual/README.md
@@ -0,0 +1,122 @@
# Multilingual CommonCrawl cleaning pipeline

To extend the [RefinedWeb](https://arxiv.org/pdf/2306.01116.pdf) pipeline to multilingual data, we build on top of the `datatrove` Python library, using per-language word tokenizers and adjusting the Gopher quality filter thresholds for each language. Our implementation and filter thresholds are outlined in the sections below.


## Language-specific word tokenizers

The filters used in the cleaning pipeline are sensitive to word tokenization, which can impact the results. We therefore use several word tokenization libraries to support almost 100 languages. To process large volumes of data efficiently, we rely on fast and reliable libraries: [NLTK](https://www.nltk.org/), [SpaCy](https://spacy.io/), [Stanza](https://stanfordnlp.github.io/stanza/), [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library), [Jieba](https://github.com/fxsjy/jieba), [NLPashto](https://pypi.org/project/nlpashto/), [PyVi](https://pypi.org/project/pyvi/) and [Anbani](https://github.com/Anbani/anbani.py). For languages that aren't officially supported by these libraries, we use the tokenizer of a supported language that is written in the same script and belongs to a closely related language family.

To further analyze the implementation of word tokenizers, inspect the [word tokenizer source code](https://github.com/beme248/datatrove/blob/multilingual/src/datatrove/tools/word_tokenizers.py).
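As a rough, non-authoritative illustration of the dispatch logic, a per-language tokenizer registry with a same-script fallback might look like the sketch below. The names (`WORD_TOKENIZERS`, `register`, `get_word_tokenizer`) and the fallback mapping are assumptions for illustration, not the PR's actual API.

```python
from typing import Callable, Dict, List

# Hypothetical registry mapping a language code to a word-tokenizing callable.
# The actual PR keeps this logic in src/datatrove/tools/word_tokenizers.py.
WORD_TOKENIZERS: Dict[str, Callable[[str], List[str]]] = {}


def register(lang: str):
    def wrapper(fn: Callable[[str], List[str]]):
        WORD_TOKENIZERS[lang] = fn
        return fn

    return wrapper


@register("en")
def english_tokenizer(text: str) -> List[str]:
    from nltk.tokenize import word_tokenize  # lazy import, mirroring the PR's lazy-loading commits

    return word_tokenize(text, language="english")


@register("zh")
def chinese_tokenizer(text: str) -> List[str]:
    import jieba

    return list(jieba.cut(text))


# Languages without an officially supported tokenizer fall back to a related
# language written in the same script (the mapping below is an assumed example).
FALLBACKS = {"sco": "en"}


def get_word_tokenizer(lang: str) -> Callable[[str], List[str]]:
    lang = FALLBACKS.get(lang, lang)
    return WORD_TOKENIZERS.get(lang, WORD_TOKENIZERS["en"])
```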


## Multilingual Gopher quality filter: language-specific adjustments

In our implementation of the multilingual Gopher quality filter, we make language-specific adjustments based on the statistical analysis of the Wikipedia data for the top 100 high-resource languages.

We [extract the statistics](https://github.com/beme248/datatrove/blob/multilingual/examples/multilingual/lang_stats/wiki_lang_stats.py) for each language from its respective [Wikipedia dataset](https://huggingface.co/datasets/wikimedia/wikipedia) and analyze them with the [language statistics visualization tool](https://huggingface.co/spaces/ZR0zNqSGMI/mlo-language-statistics). By comparing the values across languages, we identify the following filter parameters as ones that should be tuned per language: `stop_words`, `min_avg_word_length`, `max_avg_word_length` and `max_non_alpha_words_ratio`.

The following subsections explain how these threshold values were chosen. All other thresholds keep the default values from the original Gopher quality filter.

To further analyze the implementation of the filters, inspect the [Gopher quality filter source code](https://github.com/beme248/datatrove/blob/multilingual/src/datatrove/pipeline/filters/gopher_quality_filter.py) and [multilingual Gopher quality filter source code](https://github.com/beme248/datatrove/blob/multilingual/src/datatrove/pipeline/filters/multilingual_gopher_quality_filter.py).
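As a hedged sketch of how such per-language thresholds could be consumed, the snippet below loads threshold overrides from a YAML file and applies simplified Gopher-style checks. The `load_language_filters` helper, the key names and the YAML layout are illustrative assumptions; the PR's actual implementation lives in `multilingual_gopher_quality_filter.py`.

```python
import yaml  # pyyaml


def load_language_filters(path: str) -> dict:
    """Load per-language threshold overrides, e.g. files generated by wiki_lang_stats.py."""
    with open(path) as f:
        return yaml.safe_load(f)


def passes_multilingual_gopher(words: list[str], p: dict) -> bool:
    """Simplified stand-in for the per-language Gopher quality checks discussed above."""
    if not words:
        return False
    avg_len = sum(len(w) for w in words) / len(words)
    if not (p["min_avg_word_length"] <= avg_len <= p["max_avg_word_length"]):
        return False
    # at least this fraction of words must contain an alphabetic character
    alpha_ratio = sum(any(c.isalpha() for c in w) for w in words) / len(words)
    if alpha_ratio < p["max_non_alpha_words_ratio"]:
        return False
    stop_words = set(p["stop_words"])
    return sum(w.lower() in stop_words for w in words) >= 2  # Gopher-style stop word check
```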

### `stop_words`

To obtain stop words for each language, we count the occurrences of each word in the Wikipedia dataset and take the highest-frequency words as stop word candidates. To account for differences among languages (e.g., English uses "the", while German uses "der", "die" and "das"), we select words whose frequency exceeds 0.8% of the total word count rather than taking a fixed number of the most frequent words. We also remove whitespace and symbols (e.g. "«" and "»") from the stop words.

To reduce the risk of overfiltering the data, if fewer than 8 stop words remain in the cleaned list, we instead select words that appear more frequently than 0.3% of the total word count, and remove whitespace and symbols from them as well.

To further analyze word frequencies, use the [language statistics visualization tool](https://huggingface.co/spaces/ZR0zNqSGMI/mlo-language-statistics) (tab *Word frequency*).
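A compact sketch of this selection rule is shown below, assuming a `Counter` of lowercased word frequencies gathered from the Wikipedia dump. The cleaning step is reduced here to an `isalpha()` check, which is cruder than the actual whitespace and symbol removal.

```python
from collections import Counter


def select_stop_words(word_counts: Counter, primary=0.008, fallback=0.003, min_count=8) -> list[str]:
    """Stop words = words above 0.8% of the total count; fall back to 0.3% if fewer than 8 survive."""
    total = sum(word_counts.values())

    def pick(threshold: float) -> list[str]:
        return [
            w
            for w, c in word_counts.most_common()
            if c / total > threshold and w.strip() and w.isalpha()  # drop whitespace/symbols
        ]

    stop_words = pick(primary)
    if len(stop_words) < min_count:
        stop_words = pick(fallback)
    return stop_words
```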


### `min_avg_word_length` and `max_avg_word_length`

We calculate the language-specific thresholds for `min_avg_word_length` and `max_avg_word_length` as the mean word length minus one standard deviation (for the minimum) and plus one standard deviation (for the maximum), rounded to the nearest integer. When computed for the English language, these values are similar to the original Gopher quality filter thresholds: 2 (for the minimum) and 8 (for the maximum).
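In code, the rule amounts to the following; the numbers in the example are hypothetical, chosen only so the output matches the English values quoted above.

```python
def word_length_thresholds(mean_word_length: float, std_word_length: float) -> tuple[int, int]:
    """min/max average word length = mean -/+ one standard deviation, rounded to the nearest integer."""
    return round(mean_word_length - std_word_length), round(mean_word_length + std_word_length)


# Hypothetical illustration: a mean word length of 5.0 with std 3.0 gives (2, 8).
print(word_length_thresholds(5.0, 3.0))  # (2, 8)
```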


### `max_non_alpha_words_ratio`

We calculate the `max_non_alpha_words_ratio` threshold for each language as the mean `alpha_ratio` minus three standard deviations, rounded to one decimal place. When computed for the English language, this value equals the default Gopher quality filter threshold of 0.8.
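The corresponding rule for this threshold is sketched below, again with hypothetical numbers that happen to reproduce the 0.8 value.

```python
def non_alpha_threshold(mean_alpha_ratio: float, std_alpha_ratio: float) -> float:
    """max_non_alpha_words_ratio = mean alpha_ratio minus three standard deviations, one decimal place."""
    return round(mean_alpha_ratio - 3 * std_alpha_ratio, 1)


# Hypothetical illustration: a mean alpha_ratio of 0.95 with std 0.05 gives 0.8.
print(non_alpha_threshold(0.95, 0.05))  # 0.8
```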

# Running the pipeline

## Install conda

Follow the [Quick command line install](https://docs.anaconda.com/free/miniconda/#quick-command-line-install) tutorial for Linux to set up `conda`.

Restart your shell after running `~/miniconda3/bin/conda init bash` to be able to use `conda`.

## Clone the repository

```bash
git clone -b multilingual https://github.com/beme248/datatrove
cd datatrove
```

## Set up conda environment

```bash
conda create -n datatrove python=3.11
conda activate datatrove
pip install -e ".[all]" # Install dependencies
```

## Run the pipeline

To generate language-specific filter thresholds (optional; filter thresholds are already provided in the `filters` folder), run
```bash
python wiki_lang_stats.py filters
```

To start the CommonCrawl cleaning pipeline, run
```bash
python process_non_english.py DUMP_NAME
```


<!-- ## Running on the CSCS Slurm cluster

### Set up access to CSCS Clariden cluster

Follow the [tutorial](https://github.com/swiss-ai/documentation/blob/main/getting_started_with_clariden/setup_clariden.md) to set up the access to the Clariden cluster.

By the end of the tutorial, you should be able to `ssh` into your account on the cluster.
```bash
ssh clariden
```

### Install conda

Follow [Quick command line install](https://docs.anaconda.com/free/miniconda/#quick-command-line-install) tutorial for Linux to set up `conda` under your user on the cluster.

Restart your shell after running `~/miniconda3/bin/conda init bash` to be able to use `conda`.

### Clone the repository

```bash
git clone -b multilingual https://github.com/beme248/datatrove
cd datatrove
```

### Set up conda environment

```bash
conda create -n datatrove python=3.11
conda activate datatrove
pip install -e ".[all]" # Install dependencies
```

### Run the pipeline


```bash
cd examples/multilingual
```

To generate language statistics (optional, language statistics are already provided), run
```bash
export HF_DATASETS_CACHE="$SCRATCH/hf_datasets"
python wiki_lang_stats.py
```

Note that we change the HuggingFace datasets library cache to the `$SCRATCH` directory because the datasets will not fit in the `$HOME` directory. -->