huggingface · n1t0 · Feb 14, 2020 · Feb 13, 2020 · Feb 13, 2020 · Feb 13, 2020
diff --git a/bindings/node/lib/bindings/decoders.d.ts b/bindings/node/lib/bindings/decoders.d.ts
@@ -13,8 +13,10 @@ export function byteLevelDecoder(): Decoder;
 /**
  * Instantiate a new WordPiece Decoder
  * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
+ * @param [cleanup=true] Whether to clean up some tokenization artifacts,
+ * mainly spaces before punctuation and some abbreviated English forms.
  */
-export function wordPieceDecoder(prefix?: string): Decoder;
+export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;
 
 /**
  * Instantiate a new Metaspace

diff --git a/bindings/node/native/src/decoders.rs b/bindings/node/native/src/decoders.rs
@@ -30,19 +30,25 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     Ok(decoder)
 }
 
-/// wordpiece(prefix: String = "##")
+/// wordpiece(prefix: String = "##", cleanup: bool = true)
 fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     let mut prefix = String::from("##");
     if let Some(args) = cx.argument_opt(0) {
         prefix = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
     }
+    let mut cleanup = true;
+    if let Some(args) = cx.argument_opt(1) {
+        cleanup = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
+    }
 
     let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
     let guard = cx.lock();
     decoder
         .borrow_mut(&guard)
         .decoder
-        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix)));
+        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+            prefix, cleanup,
+        )));
     Ok(decoder)
 }
 

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
     #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut prefix = String::from("##");
+        let mut cleanup = true;
 
         if let Some(kwargs) = kwargs {
             if let Some(p) = kwargs.get_item("prefix") {
                 prefix = p.extract()?;
             }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
         }
 
         Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
         }))
     }
 }

diff --git a/bindings/python/tokenizers/decoders/__init__.pyi b/bindings/python/tokenizers/decoders/__init__.pyi
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
     """ WordPiece Decoder """
 
     @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
         """ Instantiate a new WordPiece Decoder
 
         Args:
             prefix: str:
                 The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to clean up some tokenization artifacts: mainly spaces before punctuation
+                and some abbreviated English forms.
         """
         pass
 

diff --git a/tokenizers/src/decoders/wordpiece.rs b/tokenizers/src/decoders/wordpiece.rs
@@ -1,25 +1,47 @@
 use crate::tokenizer::{Decoder, Result};
 
+/// The WordPiece decoder takes care of decoding a list of wordpiece tokens
+/// back into a readable string.
 pub struct WordPiece {
+    /// The prefix to be used for continuing subwords
     prefix: String,
+    /// Whether to clean up some tokenization artifacts (spaces before punctuation, ...)
+    cleanup: bool,
 }
 
 impl WordPiece {
-    pub fn new(prefix: String) -> Self {
-        Self { prefix }
+    pub fn new(prefix: String, cleanup: bool) -> Self {
+        Self { prefix, cleanup }
     }
 }
 
 impl Default for WordPiece {
     fn default() -> Self {
         Self {
             prefix: String::from("##"),
+            cleanup: true,
         }
     }
 }
 
 impl Decoder for WordPiece {
     fn decode(&self, tokens: Vec<String>) -> Result<String> {
-        Ok(tokens.join(" ").replace(&format!(" {}", self.prefix), ""))
+        let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
+        if self.cleanup {
+            output = output
+                .replace(" .", ".")
+                .replace(" ?", "?")
+                .replace(" !", "!")
+                .replace(" ,", ",")
+                .replace(" ' ", "'")
+                .replace(" n't", "n't")
+                .replace(" 'm", "'m")
+                .replace(" do not", " don't")
+                .replace(" 's", "'s")
+                .replace(" 've", "'ve")
+                .replace(" 're", "'re");
+        }
+
+        Ok(output)
     }
 }