Skip to content

Commit

Permalink
cleanup
Browse files: browse the repository at this point in the history
  • Loading branch information
fxmarty committed May 19, 2022
1 parent 57aed85 commit 2a120e4
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 10 deletions.
7 changes: 2 additions & 5 deletions optimum/utils/preprocessing/text_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,14 @@ def load_datasets(self):
)

# Preprocessing the raw_datasets
- def preprocess_function(
- examples, data_keys: Dict[str, str], tokenizer: PreTrainedTokenizerBase, max_length: int
- ):
+ def preprocess_function(examples, data_keys: Dict[str, str], tokenizer: PreTrainedTokenizerBase):
# Tokenize the texts

tokenized_inputs = tokenizer(
text=examples[data_keys["primary"]],
text_pair=examples[data_keys["secondary"]] if data_keys["secondary"] else None,
padding="max_length",
- max_length=min(max_length, tokenizer.model_max_length),
+ max_length=tokenizer.model_max_length,
truncation=True,
)
return tokenized_inputs
Expand All @@ -74,7 +72,6 @@ def preprocess_function(
preprocess_function,
tokenizer=self.tokenizer,
data_keys=self.data_keys,
- max_length=self.max_seq_length,
),
batched=True,
load_from_cache_file=True,
Expand Down
7 changes: 2 additions & 5 deletions optimum/utils/preprocessing/token_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,14 @@ def get_label_list(labels):
max_eval_samples = 100 # TODO remove this

# Preprocessing the raw_datasets
- def preprocess_function(
- examples, data_keys: Dict[str, str], tokenizer: PreTrainedTokenizerBase, max_length: Optional[int] = None
- ):
+ def preprocess_function(examples, data_keys: Dict[str, str], tokenizer: PreTrainedTokenizerBase):
# Tokenize the texts
tokenized_inputs = tokenizer(
text=examples[data_keys["primary"]],
text_pair=examples[data_keys["secondary"]] if data_keys["secondary"] else None,
padding="max_length",
truncation=True,
- max_length=min(max_length, tokenizer.model_max_length),
+ max_length=tokenizer.model_max_length,
is_split_into_words=True,
)
return tokenized_inputs
Expand All @@ -79,7 +77,6 @@ def preprocess_function(
preprocess_function,
tokenizer=self.tokenizer,
data_keys=self.data_keys,
- max_length=self.max_seq_length,
),
batched=True,
load_from_cache_file=True,
Expand Down

0 comments on commit 2a120e4

Please sign in to comment.