Improve READMEs for from_pretrained
n1t0 committed Aug 31, 2021
1 parent a4d0f3d commit ad7090a
Showing 4 changed files with 43 additions and 0 deletions.
8 changes: 8 additions & 0 deletions bindings/python/README.md
@@ -68,6 +68,14 @@ pip install setuptools_rust
python setup.py install
```

### Load a pretrained tokenizer from the Hub

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-cased")
```

### Using the provided Tokenizers

We provide some pre-built tokenizers to cover the most common cases. You can easily load one of
14 changes: 14 additions & 0 deletions tokenizers/README.md
@@ -33,6 +33,20 @@ The various steps of the pipeline are:
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
that, for example, a language model would need, such as special tokens.
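The pipeline steps above can be seen end-to-end in a small offline sketch using the project's Python bindings (the component names mirror the Rust traits; the tiny vocabulary below is made up purely for illustration):

```python
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# A toy vocabulary, purely for illustration
vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hey": 3, "there": 4, "!": 5}

tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))  # the Model
tokenizer.normalizer = Lowercase()                          # the Normalizer
tokenizer.pre_tokenizer = Whitespace()                      # the PreTokenizer
tokenizer.post_processor = TemplateProcessing(              # the PostProcessor
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

encoding = tokenizer.encode("Hey there!")
print(encoding.tokens)  # ['[CLS]', 'hey', 'there', '!', '[SEP]']
```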

### Loading a pretrained tokenizer from the Hub
```rust
use tokenizers::tokenizer::{Result, Tokenizer};

fn main() -> Result<()> {
    let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;

    let encoding = tokenizer.encode("Hey there!", false)?;
    println!("{:?}", encoding.get_tokens());

    Ok(())
}
```

### Deserialization and tokenization example

```rust
// …
```
14 changes: 14 additions & 0 deletions tokenizers/src/lib.rs
@@ -21,6 +21,20 @@
//! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
//! that, for example, a language model would need, such as special tokens.
//!
//! ## Loading a pretrained tokenizer from the Hub
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//!     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Deserialization and tokenization example
//!
//! ```no_run
//! // …
//! ```
7 changes: 7 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -397,6 +397,13 @@ impl Tokenizer {
        let content = read_to_string(file)?;
        Ok(serde_json::from_str(&content)?)
    }

    pub fn from_pretrained<S: AsRef<str>>(
        identifier: S,
        params: Option<FromPretrainedParameters>,
    ) -> Result<Self> {
        let tokenizer_file = from_pretrained(identifier, params)?;
        Tokenizer::from_file(tokenizer_file)
    }
}

impl std::str::FromStr for Tokenizer {
Expand Down
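The new `from_pretrained` method resolves the identifier to a locally cached `tokenizer.json` and then delegates to `from_file`. The same two-step flow can be sketched from Python using the `huggingface_hub` package (an assumption of this sketch, not part of this commit; needs network access):

```python
from huggingface_hub import hf_hub_download
from tokenizers import Tokenizer

# Step 1: resolve the identifier to a local tokenizer.json (downloaded and cached)
tokenizer_file = hf_hub_download(repo_id="bert-base-cased", filename="tokenizer.json")

# Step 2: load it from disk, which is what Tokenizer::from_file does in Rust
tokenizer = Tokenizer.from_file(tokenizer_file)
print(tokenizer.get_vocab_size())
```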
