From 9ccda5eec2a66c2dbd87db612e0bc712893c8e9f Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 1 Nov 2024 21:19:50 +0100 Subject: [PATCH] improve docs for MinHashDedup --- src/distilabel/steps/filtering/_datasketch.py | 8 ++++---- src/distilabel/steps/filtering/minhash.py | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/distilabel/steps/filtering/_datasketch.py b/src/distilabel/steps/filtering/_datasketch.py index 623cd45f0..5e2194049 100644 --- a/src/distilabel/steps/filtering/_datasketch.py +++ b/src/distilabel/steps/filtering/_datasketch.py @@ -13,8 +13,8 @@ # limitations under the License. """ -`dataskech` (https://github.com/ekzhu/datasketch) doesn't offer a way to store the hash tables in disk. This -is a custom implementation that uses `shelve` to store the hash tables in disk. +`datasketch` (https://github.com/ekzhu/datasketch) doesn't offer a way to store the hash tables on disk. This +is a custom implementation that uses `diskcache` to store the hash tables on disk. Note: This implementation is not optimized for performance, but could be worth creating a PR to `datasketch`. 
""" @@ -98,7 +98,7 @@ def insert(self, key, *vals, **kwargs): def ordered_storage(config, name=None): - """Copy of `datasketch.storage.ordered_storage` with the addition of `ShelveListStorage`.""" + """Copy of `datasketch.storage.ordered_storage` with the addition of `DiskCacheListStorage`.""" tp = config["type"] if tp == "disk": return DiskCacheListStorage(config, name=name) @@ -106,7 +106,7 @@ def ordered_storage(config, name=None): def unordered_storage(config, name=None): - """Copy of `datasketch.storage.ordered_storage` with the addition of `ShelveSetStorage`.""" + """Copy of `datasketch.storage.unordered_storage` with the addition of `DiskCacheSetStorage`.""" tp = config["type"] if tp == "disk": return DiskCacheSetStorage(config, name=name) diff --git a/src/distilabel/steps/filtering/minhash.py b/src/distilabel/steps/filtering/minhash.py index 5b779168a..e6bb8038a 100644 --- a/src/distilabel/steps/filtering/minhash.py +++ b/src/distilabel/steps/filtering/minhash.py @@ -92,12 +92,11 @@ class MinHashDedup(Step): Attributes: num_perm: the number of permutations to use. Defaults to `128`. - seed: the seed to use for the MinHash. This seed must be the same - used for `MinHash`, keep in mind when both steps are created. Defaults to `1`. + seed: the seed to use for the MinHash. Defaults to `1`. tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`. - If `words` is selected, it tokenize the text into words using nltk's + If `words` is selected, it tokenizes the text into words using nltk's word tokenizer. `ngram` estimates the ngrams (together with the size - `n`) using. Defaults to `words`. + `n`). Defaults to `words`. n: the size of the ngrams to use. Only relevant if `tokenizer="ngrams"`. Defaults to `5`. threshold: the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to `0.9`. @@ -106,8 +105,6 @@ class MinHashDedup(Step): not defined in `datasketch`, that is based on DiskCache's `Index` class. 
It should work as a `dict`, but backed by disk, but depending on the system it can be slower. Defaults to `dict`. - which uses a custom `shelve` backend. Note the `disk` - is an experimetal feature that may cause issues. Defaults to `dict`. Input columns: - text (`str`): the texts to be filtered.