diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c240b40dd..d3814115f 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -107,7 +107,7 @@ jobs: working-directory: ./bindings/python run: | source .env/bin/activate - pip install black==20.8b1 + pip install black==20.8b1 click==8.0.4 make check-style - name: Run tests diff --git a/bindings/node/CHANGELOG.md b/bindings/node/CHANGELOG.md index 88d0df3bb..1a4327dbe 100644 --- a/bindings/node/CHANGELOG.md +++ b/bindings/node/CHANGELOG.md @@ -1,3 +1,15 @@ +## [0.12.0] + +Bump minor version because of a breaking change. +Using `0.12` to match other bindings. + +- [#938] **Breaking change**. Decoder trait is modified to be composable. This is only breaking if you are using decoders on their own. tokenizers should be error free. +- [#939] Making the regex in `ByteLevel` pre_tokenizer optional (necessary for BigScience) + +- [#952] Fixed the vocabulary size of UnigramTrainer output (to respect added tokens) +- [#954] Fixed not being able to save vocabularies with holes in vocab (ConvBert). Yell warnings instead, but stop panicking. 
+- [#961] Added link for Ruby port of `tokenizers` + # [0.8.0](https://github.com/huggingface/tokenizers/compare/node-v0.7.0...node-v0.8.0) (2021-09-02) ### BREACKING CHANGES @@ -142,3 +154,10 @@ The files must now be provided first when calling `tokenizer.train(files, trainer)` - Fix default special tokens in `BertWordPieceTokenizer` ([10e2d28](https://github.com/huggingface/tokenizers/commit/10e2d286caf517f0977c04cf8e1924aed90403c9)) - Fix return type of `getSpecialTokensMask` on `Encoding` ([9770be5](https://github.com/huggingface/tokenizers/commit/9770be566175dc9c44dd7dcaa00a57d0e4ca632b)) - Actually add special tokens in tokenizers implementations ([acef252](https://github.com/huggingface/tokenizers/commit/acef252dacc43adc414175cfc325668ad1488753)) + + +[#938]: https://github.com/huggingface/tokenizers/pull/938 +[#939]: https://github.com/huggingface/tokenizers/pull/939 +[#952]: https://github.com/huggingface/tokenizers/pull/952 +[#954]: https://github.com/huggingface/tokenizers/pull/954 +[#961]: https://github.com/huggingface/tokenizers/pull/961 diff --git a/bindings/node/package.json b/bindings/node/package.json index 1df9ac438..780a3d13c 100644 --- a/bindings/node/package.json +++ b/bindings/node/package.json @@ -1,6 +1,6 @@ { "name": "tokenizers", - "version": "0.8.3", + "version": "0.12.0", "description": "", "main": "./dist/index.js", "types": "./dist/index.d.ts", diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md index 7e45f94dd..3ec6c2528 100644 --- a/bindings/python/CHANGELOG.md +++ b/bindings/python/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.12.0] + +Bump minor version because of a breaking change. + +- [#938] **Breaking change**. Decoder trait is modified to be composable. This is only breaking if you are using decoders on their own. tokenizers should be error free. +- [#939] Making the regex in `ByteLevel` pre_tokenizer optional (necessary for BigScience) + +- [#952] Fixed the vocabulary size of UnigramTrainer output (to respect added tokens) +- [#954] Fixed not being able to save vocabularies with holes in vocab (ConvBert). Yell warnings instead, but stop panicking. +- [#962] Fix tests for python 3.10 +- [#961] Added link for Ruby port of `tokenizers` + ## [0.11.6] - [#919] Fixing single_word AddedToken. (regression from 0.11.2) @@ -360,6 +372,12 @@ delimiter (Works like `.split(delimiter)`) - Fix a bug that was causing crashes in Python 3.5 +[#938]: https://github.com/huggingface/tokenizers/pull/938 +[#939]: https://github.com/huggingface/tokenizers/pull/939 +[#952]: https://github.com/huggingface/tokenizers/pull/952 +[#954]: https://github.com/huggingface/tokenizers/pull/954 +[#962]: https://github.com/huggingface/tokenizers/pull/962 +[#961]: https://github.com/huggingface/tokenizers/pull/961 [#919]: https://github.com/huggingface/tokenizers/pull/919 [#916]: https://github.com/huggingface/tokenizers/pull/916 [#895]: https://github.com/huggingface/tokenizers/pull/895 diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py index 81b9084f6..0341bb97b 100644 --- a/bindings/python/py_src/tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.11.6" +__version__ = "0.12.0" from typing import Tuple, Union, Tuple, List from enum import Enum diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 208ceb0d0..3013ef180 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -7,7 
+7,7 @@ setup( name="tokenizers", - version="0.11.6", + version="0.12.0", description="Fast and Customizable Tokenizers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/tokenizers/CHANGELOG.md b/tokenizers/CHANGELOG.md index 19ea5430a..b00f5e6d7 100644 --- a/tokenizers/CHANGELOG.md +++ b/tokenizers/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.12.0] + +Bump minor version because of a breaking change. + +- [#938] **Breaking change**. Decoder trait is modified to be composable. This is only breaking if you are using decoders on their own. tokenizers should be error free. +- [#939] Making the regex in `ByteLevel` pre_tokenizer optional (necessary for BigScience) + +- [#952] Fixed the vocabulary size of UnigramTrainer output (to respect added tokens) +- [#954] Fixed not being able to save vocabularies with holes in vocab (ConvBert). Yell warnings instead, but stop panicking. +- [#961] Added link for Ruby port of `tokenizers` +- [#960] Feature gate for `cli` and its `clap` dependency + ## [0.11.3] - [#919] Fixing single_word AddedToken. 
(regression from 0.11.2) @@ -140,6 +152,13 @@ advised, but that's not the question) split up in multiple bytes - [#174]: The `LongestFirst` truncation strategy had a bug + +[#938]: https://github.com/huggingface/tokenizers/pull/938 +[#939]: https://github.com/huggingface/tokenizers/pull/939 +[#952]: https://github.com/huggingface/tokenizers/pull/952 +[#954]: https://github.com/huggingface/tokenizers/pull/954 +[#961]: https://github.com/huggingface/tokenizers/pull/961 +[#960]: https://github.com/huggingface/tokenizers/pull/960 [#919]: https://github.com/huggingface/tokenizers/pull/919 [#916]: https://github.com/huggingface/tokenizers/pull/916 [#884]: https://github.com/huggingface/tokenizers/pull/884 diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index c9c9c7e87..b681d3d0e 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -2,7 +2,7 @@ authors = ["Anthony MOI "] edition = "2018" name = "tokenizers" -version = "0.11.3" +version = "0.12.0" homepage = "https://github.com/huggingface/tokenizers" repository = "https://github.com/huggingface/tokenizers" documentation = "https://docs.rs/tokenizers/"