
Commit

Replace existing WordSplitter with Tokenizers (#3361)
* WIP: Remove splitter

* Convert WordSplitters to Tokenizers

Remove WordSplitter and move the existing splitters to Tokenizer subclasses.

* Move Tokenizers to separate files.

Move legacy handling into Tokenizer.from_params.

* Add legacy tokenizer loading test.

And move tokenizer tests to individual files.

* Rename white_space_tokenizer to whitespace_tokenizer

And add it to the registry under "whitespace".
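
For anyone updating downstream code, the old and new APIs line up roughly as sketched below. The class names are taken from the changed files in this commit; the snippet itself is illustrative and not part of the commit.

    # Before this commit (legacy API):
    #     from allennlp.data.tokenizers import WordTokenizer
    #     from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
    #     tokenizer = WordTokenizer()                                        # spaCy-based splitting
    #     tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())  # plain whitespace splitting
    #
    # After this commit, each former splitter is a Tokenizer in its own right:
    from allennlp.data.tokenizers import SpacyTokenizer
    from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    spacy_tokenizer = SpacyTokenizer()            # replaces the default WordTokenizer()
    whitespace_tokenizer = WhitespaceTokenizer()  # replaces WordTokenizer(word_splitter=JustSpacesWordSplitter())

    tokens = whitespace_tokenizer.tokenize("The quick brown fox")
    print([t.text for t in tokens])  # ['The', 'quick', 'brown', 'fox']

The dataset readers in the diff below simply swap their defaults from the old classes to the new ones.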
sai-prasanna authored and matt-gardner committed Oct 16, 2019
1 parent 6373bbb commit 2850579
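
Because older configuration files still refer to the removed classes, the legacy handling mentioned in the commit message now lives in ``Tokenizer.from_params``. The sketch below shows the intended effect; the legacy registry names ("word" for WordTokenizer, "just_spaces" for the splitter) are recalled from earlier AllenNLP releases rather than taken from this page, so treat them as assumptions.

    from allennlp.common import Params
    from allennlp.data.tokenizers import Tokenizer

    # New-style config: refer to a tokenizer by its registered name,
    # e.g. the whitespace tokenizer registered under "whitespace" in this commit.
    whitespace = Tokenizer.from_params(Params({"type": "whitespace"}))

    # Old-style config: "word" plus a nested word_splitter (assumed legacy names).
    # Per the commit message this is still accepted and translated inside
    # Tokenizer.from_params, which the added legacy-loading test presumably covers.
    legacy = Tokenizer.from_params(Params({"type": "word", "word_splitter": {"type": "just_spaces"}}))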
Showing 76 changed files with 744 additions and 1,165 deletions.
6 changes: 3 additions & 3 deletions allennlp/data/dataset_readers/copynet_seq2seq.py
@@ -10,7 +10,7 @@
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField, ArrayField, MetadataField, NamespaceSwappingField
from allennlp.data.instance import Instance
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer


@@ -65,7 +65,7 @@ class CopyNetDatasetReader(DatasetReader):
in order to construct the NamespaceSwappingField.
source_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults
- to ``WordTokenizer()``.
+ to ``SpacyTokenizer()``.
target_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the output sequences (during training) into words or other kinds
of tokens. Defaults to ``source_tokenizer``.
@@ -105,7 +105,7 @@ def __init__(
) -> None:
super().__init__(lazy)
self._target_namespace = target_namespace
- self._source_tokenizer = source_tokenizer or WordTokenizer()
+ self._source_tokenizer = source_tokenizer or SpacyTokenizer()
self._target_tokenizer = target_tokenizer or self._source_tokenizer
self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
if "tokens" not in self._source_token_indexers or not isinstance(
6 changes: 3 additions & 3 deletions allennlp/data/dataset_readers/event2mind.py
@@ -11,7 +11,7 @@
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer

logger = logging.getLogger(__name__)
@@ -42,7 +42,7 @@ class Event2MindDatasetReader(DatasetReader):
----------
source_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults
- to ``WordTokenizer()``.
+ to ``SpacyTokenizer()``.
target_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the output sequences (during training) into words or other kinds
of tokens. Defaults to ``source_tokenizer``.
@@ -74,7 +74,7 @@ def __init__(
lazy: bool = False,
) -> None:
super().__init__(lazy)
- self._source_tokenizer = source_tokenizer or WordTokenizer()
+ self._source_tokenizer = source_tokenizer or SpacyTokenizer()
self._target_tokenizer = target_tokenizer or self._source_tokenizer
self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
self._target_token_indexers = target_token_indexers or self._source_token_indexers
6 changes: 3 additions & 3 deletions allennlp/data/dataset_readers/language_modeling.py
@@ -7,7 +7,7 @@
from allennlp.common.tqdm import Tqdm
from allennlp.data.instance import Instance
from allennlp.data.tokenizers.tokenizer import Tokenizer
- from allennlp.data.tokenizers import WordTokenizer
+ from allennlp.data.tokenizers import SpacyTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.fields import TextField
@@ -33,7 +33,7 @@ class LanguageModelingReader(DatasetReader):
not ``None``, we will instead take all sentences, including their start and stop tokens,
line them up, and split the tokens into groups of this number, for more efficient training
of the language model.
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
We use this ``Tokenizer`` for the text. See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
@@ -50,7 +50,7 @@ def __init__(
lazy: bool = False,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self._tokens_per_instance = tokens_per_instance

8 changes: 4 additions & 4 deletions allennlp/data/dataset_readers/masked_language_modeling.py
@@ -5,8 +5,8 @@

from allennlp.data.instance import Instance
from allennlp.data.tokenizers.tokenizer import Tokenizer
- from allennlp.data.tokenizers import Token, WordTokenizer
- from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
+ from allennlp.data.tokenizers import Token
+ from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.fields import IndexField, Field, ListField, TextField
@@ -36,7 +36,7 @@ class MaskedLanguageModelingReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``WhitespaceTokenizer()``)
We use this ``Tokenizer`` for the text. See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We use this to define the input representation for the text, and to get ids for the mask
@@ -50,7 +50,7 @@ def __init__(
lazy: bool = False,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
+ self._tokenizer = tokenizer or WhitespaceTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

@overrides
8 changes: 4 additions & 4 deletions allennlp/data/dataset_readers/next_token_lm.py
@@ -5,8 +5,8 @@

from allennlp.data.instance import Instance
from allennlp.data.tokenizers.tokenizer import Tokenizer
- from allennlp.data.tokenizers import Token, WordTokenizer
- from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
+ from allennlp.data.tokenizers import Token
+ from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.fields import Field, TextField
@@ -32,7 +32,7 @@ class NextTokenLmReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``WhitespaceTokenizer()``)
We use this ``Tokenizer`` for the text. See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We use this to define the input representation for the text, and to get ids for the mask
@@ -46,7 +46,7 @@ def __init__(
lazy: bool = False,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
+ self._tokenizer = tokenizer or WhitespaceTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

@overrides
8 changes: 4 additions & 4 deletions allennlp/data/dataset_readers/quora_paraphrase.py
@@ -8,8 +8,8 @@
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data.instance import Instance
- from allennlp.data.tokenizers import Tokenizer, WordTokenizer
- from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
+ from allennlp.data.tokenizers import Tokenizer
+ from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer

logger = logging.getLogger(__name__)
@@ -34,7 +34,7 @@ class QuoraParaphraseDatasetReader(DatasetReader):
in memory.
tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the premise and hypothesis into words or other kinds of tokens.
- Defaults to ``WordTokenizer(JustSpacesWordSplitter())``.
+ Defaults to ``WhitespaceTokenizer``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define input token representations. Defaults to ``{"tokens":
SingleIdTokenIndexer()}``.
@@ -47,7 +47,7 @@ def __init__(
token_indexers: Dict[str, TokenIndexer] = None,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
+ self._tokenizer = tokenizer or WhitespaceTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

@overrides
8 changes: 4 additions & 4 deletions allennlp/data/dataset_readers/reading_comprehension/drop.py
@@ -28,7 +28,7 @@
)
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer


logger = logging.getLogger(__name__)
@@ -72,9 +72,9 @@ class DropReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
We use this ``Tokenizer`` for both the question and the passage. See :class:`Tokenizer`.
- Default is ```WordTokenizer()``.
+ Default is ```SpacyTokenizer()``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
We similarly use this for both the question and the passage. See :class:`TokenIndexer`.
Default is ``{"tokens": SingleIdTokenIndexer()}``.
@@ -122,7 +122,7 @@ def __init__(
relaxed_span_match_for_finding_labels: bool = True,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self.passage_length_limit = passage_length_limit
self.question_length_limit = question_length_limit
@@ -9,7 +9,7 @@
from allennlp.data.instance import Instance
from allennlp.data.fields import Field, TextField, ListField, MetadataField, IndexField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
- from allennlp.data.tokenizers import Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)

@@ -24,9 +24,9 @@ class QangarooReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
We use this ``Tokenizer`` for both the question and the passage. See :class:`Tokenizer`.
- Default is ```WordTokenizer()``.
+ Default is ```SpacyTokenizer()``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
We similarly use this for both the question and the passage. See :class:`TokenIndexer`.
Default is ``{"tokens": SingleIdTokenIndexer()}``.
@@ -40,7 +40,7 @@ def __init__(
) -> None:

super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

@overrides
8 changes: 4 additions & 4 deletions allennlp/data/dataset_readers/reading_comprehension/quac.py
@@ -9,7 +9,7 @@
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)

@@ -29,9 +29,9 @@ class QuACReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
We use this ``Tokenizer`` for both the question and the passage. See :class:`Tokenizer`.
- Default is ```WordTokenizer()``.
+ Default is ```SpacyTokenizer()``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
We similarly use this for both the question and the passage. See :class:`TokenIndexer`.
Default is ``{"tokens": SingleIdTokenIndexer()}``.
@@ -47,7 +47,7 @@ def __init__(
num_context_answers: int = 0,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self._num_context_answers = num_context_answers

8 changes: 4 additions & 4 deletions allennlp/data/dataset_readers/reading_comprehension/squad.py
@@ -9,7 +9,7 @@
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)

@@ -36,9 +36,9 @@ class SquadReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
We use this ``Tokenizer`` for both the question and the passage. See :class:`Tokenizer`.
- Default is ```WordTokenizer()``.
+ Default is ```SpacyTokenizer()``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
We similarly use this for both the question and the passage. See :class:`TokenIndexer`.
Default is ``{"tokens": SingleIdTokenIndexer()}``.
@@ -63,7 +63,7 @@ def __init__(
skip_invalid_examples: bool = False,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self.passage_length_limit = passage_length_limit
self.question_length_limit = question_length_limit
@@ -11,7 +11,7 @@
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)

@@ -44,7 +44,7 @@ class TriviaQaReader(DatasetReader):
tarball.
tokenizer : ``Tokenizer``, optional
We'll use this tokenizer on questions and evidence passages, defaulting to
- ``WordTokenizer`` if none is provided.
+ ``SpacyTokenizer`` if none is provided.
token_indexers : ``Dict[str, TokenIndexer]``, optional
Determines how both the question and the evidence passages are represented as arrays. See
:class:`TokenIndexer`. Default is to have a single word ID for every token.
@@ -61,7 +61,7 @@ def __init__(
super().__init__(lazy)
self._base_tarball_path = base_tarball_path
self._unfiltered_tarball_path = unfiltered_tarball_path
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

@overrides
6 changes: 3 additions & 3 deletions allennlp/data/dataset_readers/seq2seq.py
@@ -10,7 +10,7 @@
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
- from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer

logger = logging.getLogger(__name__)
@@ -34,7 +34,7 @@ class Seq2SeqDatasetReader(DatasetReader):
----------
source_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the input sequences into words or other kinds of tokens. Defaults
- to ``WordTokenizer()``.
+ to ``SpacyTokenizer()``.
target_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the output sequences (during training) into words or other kinds
of tokens. Defaults to ``source_tokenizer``.
@@ -63,7 +63,7 @@ def __init__(
lazy: bool = False,
) -> None:
super().__init__(lazy)
- self._source_tokenizer = source_tokenizer or WordTokenizer()
+ self._source_tokenizer = source_tokenizer or SpacyTokenizer()
self._target_tokenizer = target_tokenizer or self._source_tokenizer
self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
self._target_token_indexers = target_token_indexers or self._source_token_indexers
6 changes: 3 additions & 3 deletions allennlp/data/dataset_readers/simple_language_modeling.py
@@ -10,7 +10,7 @@
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.token_indexer import TokenIndexer
- from allennlp.data.tokenizers import WordTokenizer
+ from allennlp.data.tokenizers import SpacyTokenizer
from allennlp.data.tokenizers.tokenizer import Tokenizer

logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ class SimpleLanguageModelingDatasetReader(DatasetReader):
----------
tokenizer : ``Tokenizer``, optional
Tokenizer to use to split the input sentences into words or other kinds of tokens. Defaults
- to ``WordTokenizer()``.
+ to ``SpacyTokenizer()``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define input token representations. Defaults to
``{"tokens": SingleIdTokenIndexer()}``.
@@ -47,7 +47,7 @@ def __init__(
end_tokens: List[str] = None,
) -> None:
super().__init__(True)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
if max_sequence_length is not None:
self._max_sequence_length: Union[float, Optional[int]] = max_sequence_length
6 changes: 3 additions & 3 deletions allennlp/data/dataset_readers/snli.py
@@ -9,7 +9,7 @@
from allennlp.data.fields import Field, TextField, LabelField, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
- from allennlp.data.tokenizers import Tokenizer, WordTokenizer
+ from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)

@@ -25,7 +25,7 @@ class SnliReader(DatasetReader):
Parameters
----------
- tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
+ tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
We use this ``Tokenizer`` for both the premise and the hypothesis. See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We similarly use this for both the premise and the hypothesis. See :class:`TokenIndexer`.
@@ -38,7 +38,7 @@ def __init__(
lazy: bool = False,
) -> None:
super().__init__(lazy)
- self._tokenizer = tokenizer or WordTokenizer()
+ self._tokenizer = tokenizer or SpacyTokenizer()
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

@overrides
(Diffs for the remaining changed files are not shown.)
