Skip to content

Commit

Permalink
Deduplicate Gujarati vocab characters and apply style formatting
Browse files (browse the repository at this point in the history)
  • Loading branch information
Juneja Sarjil committed Jan 19, 2025
1 parent 2bcea3a commit 15dd3e6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 6 deletions.
4 changes: 2 additions & 2 deletions docs/source/modules/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,8 @@ of vocabs.
- 70
- অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯
* - gujarati
- 107
- અઆઇઈઉઊઋએઐઓઔઅંઅઃકખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ૦૧૨૩૪૫૬૭૮૯!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~૰ઽ◌ંઃ॥ૐ઼ ઁ૱
- 103
- અઆઇઈઉઊઋએઐઓઔકખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ૦૧૨૩૪૫૬૭૮૯!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~૰ઽ◌ંઃ॥ૐ઼ ઁ૱
* - multilingual
- 195
- english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & §
14 changes: 10 additions & 4 deletions doctr/datasets/vocabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
"hindi_letters": "अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
"hindi_digits": "०१२३४५६७८९",
"hindi_punctuation": "।,?!:्ॐ॰॥",
"gujarati_vowels": "અઆઇઈઉઊઋએઐઓઔઅંઅઃ",
"gujarati_consonants":"કખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ",
"gujarati_digits":"૦૧૨૩૪૫૬૭૮૯",
"gujarati_vowels": "અઆઇઈઉઊઋએઐઓ",
"gujarati_consonants": "કખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ",
"gujarati_digits": "૦૧૨૩૪૫૬૭૮૯",
"gujarati_punctuation": "૰ઽ◌ંઃ॥ૐ઼ઁ" + "૱",
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
"bangla_digits": "০১২৩৪৫৬৭৮৯",
Expand Down Expand Up @@ -62,7 +62,13 @@
)
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
VOCABS["gujarati"] = VOCABS["gujarati_vowels"] + VOCABS["gujarati_consonants"] + VOCABS["gujarati_digits"] + VOCABS["gujarati_punctuation"] + VOCABS["punctuation"]
VOCABS["gujarati"] = (
VOCABS["gujarati_vowels"]
+ VOCABS["gujarati_consonants"]
+ VOCABS["gujarati_digits"]
+ VOCABS["gujarati_punctuation"]
+ VOCABS["punctuation"]
)
VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
VOCABS["ukrainian"] = (
VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴"
Expand Down

0 comments on commit 15dd3e6

Please sign in to comment.