Skip to content

Commit

Permalink
update and refactor
Browse files Browse the repository at this point in the history
Signed-off-by: stevehuang52 <[email protected]>
  • Loading branch information
stevehuang52 committed Oct 31, 2023
1 parent baea154 commit 31740ef
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 3 deletions.
1 change: 0 additions & 1 deletion nemo/collections/asr/data/huggingface/hf_audio_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from typing import Callable, Dict, List, Optional, Tuple, Union

import datasets as hf_datasets
Expand Down
2 changes: 1 addition & 1 deletion nemo/collections/asr/parts/mixins/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def get_vocab():

hf_tokenizer_kwargs = tokenizer_cfg.get('hf_kwargs', {})
tokenizer = tokenizers.AutoTokenizer(
pretrained_model_name='bert-base-cased',
pretrained_model_name=hf_tokenizer_kwargs.get('pretrained_model_name', 'bert-base-cased'),
vocab_file=vocab_path,
mask_token=hf_tokenizer_kwargs.get('mask_token', None),
bos_token=hf_tokenizer_kwargs.get('bos_token', None),
Expand Down
1 change: 1 addition & 0 deletions requirements/requirements_asr.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
braceexpand
datasets
editdistance
g2p_en
ipywidgets
Expand Down
2 changes: 1 addition & 1 deletion scripts/tokenizers/conf/huggingface_data_tokenizer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
num_workers: 8

# simple text cleaning; by default, converts all chars to lower-case and keeps only alphanumeric chars.
normalize_text: true
normalize_text: false
symbols_to_keep: ["'"] # a list of symbols to keep during text cleaning.

# the key for the ground-truth transcription, e.g., MCV usually uses "sentence", while some other datasets use "text"
Expand Down

0 comments on commit 31740ef

Please sign in to comment.