From 9ccda5eec2a66c2dbd87db612e0bc712893c8e9f Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 1 Nov 2024 21:19:50 +0100
Subject: [PATCH] improve docs for MinHashDedup

---
 src/distilabel/steps/filtering/_datasketch.py | 8 ++++----
 src/distilabel/steps/filtering/minhash.py     | 9 +++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/distilabel/steps/filtering/_datasketch.py b/src/distilabel/steps/filtering/_datasketch.py
index 623cd45f0..5e2194049 100644
--- a/src/distilabel/steps/filtering/_datasketch.py
+++ b/src/distilabel/steps/filtering/_datasketch.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 """
-`dataskech` (https://github.com/ekzhu/datasketch) doesn't offer a way to store the hash tables in disk. This
-is a custom implementation that uses `shelve` to store the hash tables in disk.
+`datasketch` (https://github.com/ekzhu/datasketch) doesn't offer a way to store the hash tables on disk. This
+is a custom implementation that uses `diskcache` to store the hash tables on disk.
 Note: This implementation is not optimized for performance, but could be worth
 creating a PR to `datasketch`.
 """
@@ -98,7 +98,7 @@ def insert(self, key, *vals, **kwargs):
 
 
 def ordered_storage(config, name=None):
-    """Copy of `datasketch.storage.ordered_storage` with the addition of `ShelveListStorage`."""
+    """Copy of `datasketch.storage.ordered_storage` with the addition of `DiskCacheListStorage`."""
     tp = config["type"]
     if tp == "disk":
         return DiskCacheListStorage(config, name=name)
@@ -106,7 +106,7 @@ def ordered_storage(config, name=None):
 
 
 def unordered_storage(config, name=None):
-    """Copy of `datasketch.storage.ordered_storage` with the addition of `ShelveSetStorage`."""
+    """Copy of `datasketch.storage.unordered_storage` with the addition of `DiskCacheSetStorage`."""
     tp = config["type"]
     if tp == "disk":
         return DiskCacheSetStorage(config, name=name)
diff --git a/src/distilabel/steps/filtering/minhash.py b/src/distilabel/steps/filtering/minhash.py
index 5b779168a..e6bb8038a 100644
--- a/src/distilabel/steps/filtering/minhash.py
+++ b/src/distilabel/steps/filtering/minhash.py
@@ -92,12 +92,11 @@ class MinHashDedup(Step):
 
     Attributes:
         num_perm: the number of permutations to use. Defaults to `128`.
-        seed: the seed to use for the MinHash. This seed must be the same
-            used for `MinHash`, keep in mind when both steps are created. Defaults to `1`.
+        seed: the seed to use for the MinHash. Defaults to `1`.
         tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`.
-            If `words` is selected, it tokenize the text into words using nltk's
+            If `words` is selected, it tokenizes the text into words using nltk's
             word tokenizer. `ngram` estimates the ngrams (together with the size
-            `n`) using. Defaults to `words`.
+            `n`). Defaults to `words`.
         n: the size of the ngrams to use. Only relevant if `tokenizer="ngrams"`. Defaults to `5`.
         threshold: the threshold to consider two MinHashes as duplicates.
             Values closer to 0 detect more duplicates. Defaults to `0.9`.
@@ -106,8 +105,6 @@ class MinHashDedup(Step):
             not defined in `datasketch`, that is based on DiskCache's `Index` class.
             It should work as a `dict`, but backed by disk, but depending on the system
             it can be slower. Defaults to `dict`.
-            which uses a custom `shelve` backend. Note the `disk`
-            is an experimetal feature that may cause issues. Defaults to `dict`.
 
     Input columns:
         - text (`str`): the texts to be filtered.