Skip to content

Commit

Permalink
Add [UNK] token if token encoding results in an empty list
Browse files: browse the repository at this point in the history
  • Loading branch information
markus-eberts committed May 31, 2021
1 parent 8cfbbd9 commit e0d9aee
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions spert/input_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ def _parse_tokens(jtokens, dataset, tokenizer):
# parse tokens
for i, token_phrase in enumerate(jtokens):
token_encoding = tokenizer.encode(token_phrase, add_special_tokens=False)
if not token_encoding:
token_encoding = [tokenizer.convert_tokens_to_ids('[UNK]')]
span_start, span_end = (len(doc_encoding), len(doc_encoding) + len(token_encoding))

token = dataset.create_token(i, span_start, span_end, token_phrase)
Expand Down

0 comments on commit e0d9aee

Please sign in to comment.